# dgx-spark-playbooks/nvidia/txt2kg/assets/deploy/services/gpu-viz/semantic_clustering_service.py
# 2025-10-06 17:05:41 +00:00
#
# 601 lines
# 22 KiB
# Python

"""
Semantic Clustering Service for Knowledge Graphs
Groups nodes by semantic similarity of names, types, and content rather than just spatial coordinates
"""
import asyncio
import logging
import time
from typing import Dict, List, Any, Tuple, Set, Optional
from dataclasses import dataclass
from collections import defaultdict
import numpy as np
import re
from difflib import SequenceMatcher
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
# Probe for the optional GPU stack once, at import time, and record the outcome
# in HAS_GPU so the rest of the module can pick a CPU or GPU code path.
try:
    import cupy as cp
    import cuml
    from cuml.cluster import KMeans as cuKMeans, DBSCAN as cuDBSCAN
except ImportError:
    HAS_GPU = False
    print("⚠️ GPU libraries not available, using CPU for semantic clustering")
else:
    HAS_GPU = True
    print("✅ GPU libraries (CuPy, cuML) available for semantic clustering")

# Module-level logger shared by every class and function in this file
logger = logging.getLogger(__name__)
@dataclass
class SemanticClusterResult:
    """Bundle returned by every semantic clustering routine in this module."""

    # Input nodes, each extended with 'cluster_id' and 'node_index'
    clustered_nodes: List[Dict[str, Any]]
    # Run metadata: algorithm name, cluster count, timing, and similar
    cluster_info: Dict[str, Any]
    # Pairwise similarity the clustering was computed from, when one exists
    similarity_matrix: Optional[np.ndarray] = None
    # One cluster label per node, in input order
    cluster_labels: Optional[np.ndarray] = None
class SemanticSimilarityCalculator:
    """Score how alike knowledge-graph nodes are, by name and by text content.

    Name similarity blends several cheap string heuristics. Content similarity
    is the cosine similarity between TF-IDF vectors built from each node's
    text fields.
    """

    # Optional free-text fields folded into a node's TF-IDF document
    _CONTENT_KEYS = ('description', 'content', 'label', 'properties')

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )
        # True once the vectorizer has been fitted at least once
        self.fitted = False

    def calculate_name_similarity(self, name1: str, name2: str) -> float:
        """Return a similarity score in [0.0, 1.0] for two node names.

        Args:
            name1: First node name.
            name2: Second node name.

        Returns:
            0.0 if either name is empty (before or after cleaning), 1.0 if the
            cleaned names are equal, 0.8 if one cleaned name contains the
            other, otherwise a weighted blend of sequence ratio (0.3), word
            Jaccard overlap (0.4), common prefix (0.15) and common suffix
            (0.15).
        """
        if not name1 or not name2:
            return 0.0

        name1_clean = self._clean_name(name1)
        name2_clean = self._clean_name(name2)

        # A name made only of punctuation cleans to "". Without this guard the
        # substring test below would match it against every other name,
        # because "" is a substring of any string.
        if not name1_clean or not name2_clean:
            return 0.0

        # Method 1: Exact match
        if name1_clean == name2_clean:
            return 1.0

        # Method 2: Substring match
        if name1_clean in name2_clean or name2_clean in name1_clean:
            return 0.8

        # Method 3: Sequence similarity (difflib matching-block ratio; this is
        # not a Levenshtein edit distance)
        seq_similarity = SequenceMatcher(None, name1_clean, name2_clean).ratio()

        # Method 4: Word overlap (Jaccard similarity)
        words1 = set(name1_clean.split())
        words2 = set(name2_clean.split())
        if words1 and words2:
            jaccard_sim = len(words1 & words2) / len(words1 | words2)
        else:
            jaccard_sim = 0.0

        # Method 5: Common prefix/suffix
        prefix_sim = self._prefix_similarity(name1_clean, name2_clean)
        suffix_sim = self._suffix_similarity(name1_clean, name2_clean)

        # Combine similarities with weights (the weights sum to 1.0)
        combined_similarity = (
            seq_similarity * 0.3 +
            jaccard_sim * 0.4 +
            prefix_sim * 0.15 +
            suffix_sim * 0.15
        )
        return min(combined_similarity, 1.0)

    def calculate_content_similarity(self, nodes: List[Dict[str, Any]]) -> np.ndarray:
        """Return the pairwise cosine similarity of the nodes' TF-IDF vectors.

        The vectorizer is refitted on every call, so the vocabulary and IDF
        weights always describe the node set being compared rather than the
        first one this instance happened to see.

        Args:
            nodes: Node dictionaries. 'name', 'group'/'type' and the fields in
                ``_CONTENT_KEYS`` are used when present.

        Returns:
            An (n, n) similarity matrix. For empty input this is a (0, 0)
            array. If no usable vocabulary can be built, it is the identity
            matrix, so each node is similar only to itself.
        """
        if not nodes:
            return np.zeros((0, 0))

        texts = [self._node_text(node) for node in nodes]

        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        except ValueError as exc:
            # scikit-learn raises ValueError ("empty vocabulary") when every
            # document consists only of stop words or non-word characters.
            logger.warning(f"TF-IDF could not build a vocabulary, treating nodes as dissimilar: {exc}")
            return np.eye(len(nodes))

        self.fitted = True
        return cosine_similarity(tfidf_matrix)

    def _node_text(self, node: Dict[str, Any]) -> str:
        """Build the text document that represents one node for TF-IDF."""
        text_parts = []

        # Add node name
        if node.get('name'):
            text_parts.append(str(node['name']))

        # Add node type/group; fall back to 'type' when 'group' is missing or empty
        category = node.get('group') or node.get('type')
        if category:
            text_parts.append(str(category))

        # Add any description or content
        for key in self._CONTENT_KEYS:
            if node.get(key):
                text_parts.append(str(node[key]))

        combined_text = ' '.join(text_parts)
        if combined_text.strip():
            return combined_text
        # Never hand None or a non-string to the vectorizer
        return str(node.get('name') or 'unnamed')

    def _clean_name(self, name: str) -> str:
        """Lower-case a name, replace punctuation with spaces, collapse whitespace."""
        if not name:
            return ""
        # str() keeps non-string names (e.g. numeric ids) from raising on .lower()
        cleaned = str(name).lower().strip()
        # Remove special characters but keep word characters, spaces and hyphens
        cleaned = re.sub(r'[^\w\s-]', ' ', cleaned)
        # Normalize whitespace
        cleaned = re.sub(r'\s+', ' ', cleaned)
        return cleaned.strip()

    def _prefix_similarity(self, name1: str, name2: str) -> float:
        """Return the shared-prefix length divided by the shorter name's length."""
        min_len = min(len(name1), len(name2))
        if min_len == 0:
            return 0.0
        common_prefix = 0
        for char1, char2 in zip(name1, name2):
            if char1 != char2:
                break
            common_prefix += 1
        return common_prefix / min_len

    def _suffix_similarity(self, name1: str, name2: str) -> float:
        """Return the shared-suffix length divided by the shorter name's length."""
        min_len = min(len(name1), len(name2))
        if min_len == 0:
            return 0.0
        common_suffix = 0
        for char1, char2 in zip(reversed(name1), reversed(name2)):
            if char1 != char2:
                break
            common_suffix += 1
        return common_suffix / min_len
class SemanticClusteringEngine:
    """Cluster knowledge-graph nodes by name, by content, or by a weighted blend.

    Every public method returns a ``SemanticClusterResult`` whose nodes are
    shallow copies of the input dicts extended with ``cluster_id`` and
    ``node_index``.
    """

    # Algorithms accepted by cluster_by_name_similarity
    _NAME_ALGORITHMS = ("hierarchical", "kmeans", "dbscan")
    # DBSCAN needs eps > 0, so a similarity threshold of 1.0 maps to this value
    _MIN_DBSCAN_EPS = 1e-9
    # eps for hybrid_clustering's DBSCAN branch, i.e. similarity >= 0.7
    _HYBRID_DBSCAN_EPS = 0.3

    def __init__(self, use_gpu: Optional[bool] = None):
        """Create an engine.

        Args:
            use_gpu: Force GPU use on or off; ``None`` follows ``HAS_GPU``.
        """
        self.use_gpu = use_gpu if use_gpu is not None else HAS_GPU
        self.similarity_calc = SemanticSimilarityCalculator()
        logger.info(f"Semantic clustering engine initialized (GPU: {self.use_gpu})")

    def cluster_by_name_similarity(
        self,
        nodes: List[Dict[str, Any]],
        algorithm: str = "hierarchical",
        n_clusters: Optional[int] = None,
        similarity_threshold: float = 0.7
    ) -> "SemanticClusterResult":
        """
        Cluster nodes based on name similarity

        Args:
            nodes: List of node dictionaries
            algorithm: 'hierarchical', 'kmeans', 'dbscan'
            n_clusters: Number of clusters (for kmeans/hierarchical); defaults
                to min(10, n_nodes // 2) and is clamped to [1, n_nodes]
            similarity_threshold: Minimum similarity for two nodes to count as
                neighbours (for dbscan)

        Returns:
            SemanticClusterResult with per-node cluster ids and run metadata.

        Raises:
            ValueError: If ``algorithm`` is unknown and there are 2+ nodes.
        """
        start_time = time.time()
        n_nodes = len(nodes)
        logger.info(f"🧠 Starting semantic clustering of {n_nodes} nodes using {algorithm}")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        # Fail before the O(n^2) similarity pass rather than after it
        if algorithm not in self._NAME_ALGORITHMS:
            raise ValueError(f"Unknown clustering algorithm: {algorithm}")

        similarity_matrix = self._calculate_name_similarity_matrix(nodes)
        distance_matrix = self._similarity_to_distance(similarity_matrix)

        if algorithm == "hierarchical":
            cluster_labels = self._hierarchical_clustering(
                distance_matrix, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        elif algorithm == "kmeans":
            cluster_labels = self._kmeans_clustering(
                similarity_matrix, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        else:
            # DBSCAN's eps is a maximum *distance*; a minimum similarity of s
            # corresponds to a maximum distance of 1 - s.
            eps = max(1.0 - similarity_threshold, self._MIN_DBSCAN_EPS)
            cluster_labels = self._dbscan_clustering(distance_matrix, eps)

        clustered_nodes = self._label_nodes(nodes, cluster_labels)
        processing_time = time.time() - start_time

        cluster_sizes = self._cluster_sizes(cluster_labels)
        unique_clusters = len(cluster_sizes)
        cluster_info = {
            'algorithm': f'semantic_{algorithm}',
            'total_clusters': unique_clusters,
            'processing_time': processing_time,
            # Name clustering runs entirely on the CPU
            'gpu_accelerated': False,
            'cluster_sizes': cluster_sizes,
            'average_cluster_size': n_nodes / unique_clusters if unique_clusters > 0 else 0,
            'similarity_threshold': similarity_threshold if algorithm == 'dbscan' else None
        }
        logger.info(f"✅ Semantic clustering completed: {unique_clusters} clusters in {processing_time:.3f}s")

        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=similarity_matrix,
            cluster_labels=cluster_labels
        )

    def cluster_by_content_similarity(
        self,
        nodes: List[Dict[str, Any]],
        algorithm: str = "kmeans",
        n_clusters: Optional[int] = None
    ) -> "SemanticClusterResult":
        """Cluster nodes based on content similarity using TF-IDF.

        Args:
            nodes: List of node dictionaries
            algorithm: 'kmeans' (GPU when enabled and available) or
                'hierarchical'; any other value falls back to hierarchical
            n_clusters: Number of clusters; defaults to min(10, n_nodes // 2)
                and is clamped to [1, n_nodes]

        Returns:
            SemanticClusterResult. ``cluster_info['algorithm']`` names the
            algorithm that actually ran.
        """
        start_time = time.time()
        n_nodes = len(nodes)
        logger.info(f"📄 Starting content-based clustering of {n_nodes} nodes")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        similarity_matrix = self.similarity_calc.calculate_content_similarity(nodes)
        cluster_count = self._resolve_n_clusters(n_clusters, n_nodes)

        used_gpu = False
        if algorithm == "kmeans":
            effective_algorithm = "kmeans"
            cluster_labels = None
            if self.use_gpu and HAS_GPU:
                cluster_labels = self._try_gpu_kmeans(similarity_matrix, cluster_count)
                used_gpu = cluster_labels is not None
            if cluster_labels is None:
                cluster_labels = self._kmeans_clustering(similarity_matrix, cluster_count)
        else:
            effective_algorithm = "hierarchical"
            if algorithm != "hierarchical":
                logger.warning(
                    f"Content clustering does not support '{algorithm}', using hierarchical instead"
                )
            distance_matrix = self._similarity_to_distance(similarity_matrix)
            cluster_labels = self._hierarchical_clustering(distance_matrix, cluster_count)

        clustered_nodes = self._label_nodes(nodes, cluster_labels)
        processing_time = time.time() - start_time

        unique_clusters = len(self._cluster_sizes(cluster_labels))
        cluster_info = {
            'algorithm': f'content_{effective_algorithm}',
            'total_clusters': unique_clusters,
            'processing_time': processing_time,
            # True only when the GPU K-means path produced the labels
            'gpu_accelerated': used_gpu,
            'average_cluster_size': n_nodes / unique_clusters if unique_clusters > 0 else 0
        }
        logger.info(f"✅ Content clustering completed: {unique_clusters} clusters in {processing_time:.3f}s")

        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=similarity_matrix,
            cluster_labels=cluster_labels
        )

    def hybrid_clustering(
        self,
        nodes: List[Dict[str, Any]],
        name_weight: float = 0.6,
        content_weight: float = 0.3,
        spatial_weight: float = 0.1,
        algorithm: str = "hierarchical",
        n_clusters: Optional[int] = None
    ) -> "SemanticClusterResult":
        """
        Hybrid clustering combining name, content, and spatial similarities

        Weights are normalised to sum to 1. Negative weights are treated as 0.
        If no weight is positive, all nodes are placed in a single cluster.

        Args:
            nodes: List of node dictionaries
            name_weight: Weight for name similarity (0.0-1.0)
            content_weight: Weight for content similarity (0.0-1.0)
            spatial_weight: Weight for spatial similarity (0.0-1.0)
            algorithm: 'hierarchical' or 'kmeans'; any other value runs DBSCAN
                with a fixed eps of 0.3
            n_clusters: Number of clusters (hierarchical/kmeans); defaults to
                min(10, n_nodes // 2) and is clamped to [1, n_nodes]

        Returns:
            SemanticClusterResult whose ``similarity_matrix`` is the weighted
            combination and whose ``cluster_info['weights']`` holds the
            normalised weights.
        """
        start_time = time.time()
        n_nodes = len(nodes)
        logger.info(f"🔄 Starting hybrid clustering of {n_nodes} nodes")
        logger.info(f" Weights: name={name_weight}, content={content_weight}, spatial={spatial_weight}")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        # A negative weight would push the combined similarity outside [0, 1]
        name_weight = max(float(name_weight), 0.0)
        content_weight = max(float(content_weight), 0.0)
        spatial_weight = max(float(spatial_weight), 0.0)

        # Normalize weights
        total_weight = name_weight + content_weight + spatial_weight
        if total_weight <= 0:
            # No signal selected: nothing to cluster on
            return self._create_single_cluster_result(nodes, start_time)
        name_weight /= total_weight
        content_weight /= total_weight
        spatial_weight /= total_weight

        # Accumulate only the similarity matrices that carry weight
        combined_similarity = np.zeros((n_nodes, n_nodes))
        if name_weight > 0:
            combined_similarity += self._calculate_name_similarity_matrix(nodes) * name_weight
        if content_weight > 0:
            combined_similarity += (
                self.similarity_calc.calculate_content_similarity(nodes) * content_weight
            )
        if spatial_weight > 0:
            combined_similarity += self._calculate_spatial_similarity_matrix(nodes) * spatial_weight

        # Apply clustering
        distance_matrix = self._similarity_to_distance(combined_similarity)
        if algorithm == "hierarchical":
            effective_algorithm = "hierarchical"
            cluster_labels = self._hierarchical_clustering(
                distance_matrix, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        elif algorithm == "kmeans":
            effective_algorithm = "kmeans"
            cluster_labels = self._kmeans_clustering(
                combined_similarity, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        else:
            effective_algorithm = "dbscan"
            if algorithm != "dbscan":
                logger.warning(f"Unknown hybrid algorithm '{algorithm}', using dbscan instead")
            cluster_labels = self._dbscan_clustering(distance_matrix, self._HYBRID_DBSCAN_EPS)

        clustered_nodes = self._label_nodes(nodes, cluster_labels)
        processing_time = time.time() - start_time

        unique_clusters = len(self._cluster_sizes(cluster_labels))
        cluster_info = {
            'algorithm': f'hybrid_{effective_algorithm}',
            'total_clusters': unique_clusters,
            'processing_time': processing_time,
            # Hybrid clustering runs entirely on the CPU
            'gpu_accelerated': False,
            'weights': {
                'name': name_weight,
                'content': content_weight,
                'spatial': spatial_weight
            },
            'average_cluster_size': n_nodes / unique_clusters if unique_clusters > 0 else 0
        }
        logger.info(f"✅ Hybrid clustering completed: {unique_clusters} clusters in {processing_time:.3f}s")

        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=combined_similarity,
            cluster_labels=cluster_labels
        )

    @staticmethod
    def _resolve_n_clusters(n_clusters: Optional[int], n_nodes: int) -> int:
        """Pick the cluster count: the request or min(10, n // 2), clamped to [1, n]."""
        requested = n_clusters or min(10, n_nodes // 2)
        # scikit-learn rejects more clusters than samples and fewer than one
        return max(1, min(int(requested), n_nodes))

    @staticmethod
    def _similarity_to_distance(similarity_matrix: np.ndarray) -> np.ndarray:
        """Convert similarities to non-negative distances with a zero diagonal."""
        # Rounding can leave a similarity a hair above 1.0, and a
        # precomputed-metric DBSCAN refuses negative distances.
        distance_matrix = np.clip(1.0 - similarity_matrix, 0.0, None)
        np.fill_diagonal(distance_matrix, 0.0)
        return distance_matrix

    @staticmethod
    def _label_nodes(nodes: List[Dict[str, Any]], cluster_labels) -> List[Dict[str, Any]]:
        """Copy each node and attach its cluster id and input position."""
        return [
            {**node, 'cluster_id': int(label), 'node_index': index}
            for index, (node, label) in enumerate(zip(nodes, cluster_labels))
        ]

    @staticmethod
    def _cluster_sizes(cluster_labels) -> Dict[int, int]:
        """Count members per cluster, keyed by plain ``int`` so the dict is JSON-safe."""
        sizes: Dict[int, int] = {}
        for label in cluster_labels:
            key = int(label)
            sizes[key] = sizes.get(key, 0) + 1
        return sizes

    @staticmethod
    def _relabel_noise(labels: np.ndarray) -> np.ndarray:
        """Give every DBSCAN noise point (label -1) its own fresh cluster id."""
        relabeled = np.array(labels, copy=True)
        noise_mask = relabeled == -1
        noise_count = int(noise_mask.sum())
        if noise_count:
            # If everything is noise the maximum is -1, so numbering starts at 0
            first_new_label = int(relabeled.max()) + 1
            relabeled[noise_mask] = np.arange(first_new_label, first_new_label + noise_count)
        return relabeled

    def _calculate_name_similarity_matrix(self, nodes: List[Dict[str, Any]]) -> np.ndarray:
        """Calculate the symmetric pairwise name similarity matrix (diagonal 1.0)."""
        names = [node.get('name', '') for node in nodes]
        n_nodes = len(names)
        similarity_matrix = np.eye(n_nodes)
        score = self.similarity_calc.calculate_name_similarity
        for i, name_i in enumerate(names):
            # Upper triangle only; each score is mirrored below the diagonal
            for j in range(i + 1, n_nodes):
                similarity = score(name_i, names[j])
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity
        return similarity_matrix

    def _calculate_spatial_similarity_matrix(
        self,
        nodes: List[Dict[str, Any]],
        scale: float = 50.0
    ) -> np.ndarray:
        """Calculate spatial similarity as exp(-euclidean_distance / scale).

        Args:
            nodes: Node dictionaries; 'x', 'y' and 'z' default to 0 when
                missing or None.
            scale: Distance at which similarity has decayed to 1/e. Adjust to
                the coordinate system in use.

        Returns:
            An (n, n) symmetric matrix with 1.0 on the diagonal.
        """
        n_nodes = len(nodes)
        if n_nodes == 0:
            return np.zeros((0, 0))

        coords = np.array(
            [[float(node.get(axis) or 0) for axis in ('x', 'y', 'z')] for node in nodes],
            dtype=float
        )

        # Sum squared differences one axis at a time so peak memory stays at a
        # few (n, n) arrays instead of an (n, n, 3) tensor.
        squared_distance = np.zeros((n_nodes, n_nodes))
        for axis_index in range(coords.shape[1]):
            column = coords[:, axis_index]
            delta = column[:, np.newaxis] - column[np.newaxis, :]
            squared_distance += delta * delta

        # Closer means more similar: exponential decay of the distance
        similarity_matrix = np.exp(-np.sqrt(squared_distance) / scale)
        np.fill_diagonal(similarity_matrix, 1.0)
        return similarity_matrix

    def _hierarchical_clustering(self, distance_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply average-linkage agglomerative clustering to a precomputed distance matrix."""
        clusterer = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='precomputed',
            linkage='average'
        )
        return clusterer.fit_predict(distance_matrix)

    def _kmeans_clustering(self, similarity_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply K-means, using each row of the similarity matrix as a feature vector."""
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        return clusterer.fit_predict(similarity_matrix)

    def _try_gpu_kmeans(self, similarity_matrix: np.ndarray, n_clusters: int) -> Optional[np.ndarray]:
        """Run cuML K-means; return the labels, or None if the GPU path fails."""
        try:
            gpu_matrix = cp.array(similarity_matrix, dtype=cp.float32)
            clusterer = cuKMeans(n_clusters=n_clusters, random_state=42)
            labels = clusterer.fit_predict(gpu_matrix)
            return cp.asnumpy(labels)
        except Exception as e:
            # Deliberately broad: any GPU-side failure should degrade to the CPU path
            logger.warning(f"GPU K-means failed, falling back to CPU: {e}")
            return None

    def _gpu_kmeans_clustering(self, similarity_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply GPU-accelerated K-means clustering, falling back to CPU on failure."""
        labels = self._try_gpu_kmeans(similarity_matrix, n_clusters)
        if labels is None:
            labels = self._kmeans_clustering(similarity_matrix, n_clusters)
        return labels

    def _dbscan_clustering(self, distance_matrix: np.ndarray, eps: float) -> np.ndarray:
        """Apply DBSCAN to a precomputed distance matrix.

        Args:
            distance_matrix: Non-negative pairwise distances.
            eps: Maximum distance at which two nodes count as neighbours.

        Returns:
            Non-negative labels; each noise point becomes its own cluster.
        """
        clusterer = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
        labels = clusterer.fit_predict(distance_matrix)
        # DBSCAN marks noise with -1; a noise point is by definition not close
        # to anything, so it must not share a cluster with other noise.
        return self._relabel_noise(labels)

    def _create_single_cluster_result(
        self,
        nodes: List[Dict[str, Any]],
        start_time: float
    ) -> "SemanticClusterResult":
        """Create a result that puts every node in cluster 0 (too few nodes to cluster)."""
        clustered_nodes = [
            {**node, 'cluster_id': 0, 'node_index': index}
            for index, node in enumerate(nodes)
        ]
        processing_time = time.time() - start_time
        cluster_info = {
            'algorithm': 'single_cluster',
            'total_clusters': 1,
            'processing_time': processing_time,
            'gpu_accelerated': False,
            'average_cluster_size': len(nodes)
        }
        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=None,
            cluster_labels=np.zeros(len(nodes), dtype=int)
        )
# Convenience functions for easy integration
async def cluster_nodes_by_similarity(
    nodes: List[Dict[str, Any]],
    method: str = "hybrid",
    algorithm: str = "hierarchical",
    n_clusters: Optional[int] = None,
    **kwargs
) -> SemanticClusterResult:
    """
    Main entry point for semantic clustering

    The clustering itself is synchronous and CPU-bound, so it is run in the
    event loop's default executor. Awaiting this coroutine therefore does not
    stall other tasks on the loop.

    Args:
        nodes: List of node dictionaries
        method: 'name', 'content', 'hybrid'
        algorithm: 'hierarchical', 'kmeans', 'dbscan'
        n_clusters: Number of clusters (if applicable)
        **kwargs: Additional parameters for specific methods

    Returns:
        The SemanticClusterResult produced by the selected engine method.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    # Reject a bad method before constructing the engine
    if method not in ("name", "content", "hybrid"):
        raise ValueError(f"Unknown clustering method: {method}")

    engine = SemanticClusteringEngine()

    def _run_clustering() -> SemanticClusterResult:
        if method == "name":
            return engine.cluster_by_name_similarity(nodes, algorithm, n_clusters, **kwargs)
        if method == "content":
            return engine.cluster_by_content_similarity(nodes, algorithm, n_clusters, **kwargs)
        return engine.hybrid_clustering(nodes, algorithm=algorithm, n_clusters=n_clusters, **kwargs)

    # Exceptions raised inside the worker propagate to the awaiting caller
    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run_clustering)
if __name__ == "__main__":
    # Demo run: cluster a handful of hand-written nodes with the hybrid method
    test_nodes = [
        dict(name="Machine Learning", x=0, y=0, z=0, group="AI"),
        dict(name="Deep Learning", x=10, y=5, z=2, group="AI"),
        dict(name="Neural Networks", x=15, y=8, z=3, group="AI"),
        dict(name="Data Science", x=20, y=10, z=5, group="Data"),
        dict(name="Statistics", x=25, y=15, z=8, group="Math"),
        dict(name="Linear Algebra", x=30, y=20, z=10, group="Math"),
    ]

    async def test():
        """Cluster the sample nodes and print each node's assigned cluster."""
        outcome = await cluster_nodes_by_similarity(test_nodes, method="hybrid")
        print("Cluster Result:", outcome.cluster_info)
        for entry in outcome.clustered_nodes:
            print(f" {entry['name']} -> Cluster {entry['cluster_id']}")

    asyncio.run(test())