mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-23 18:33:54 +00:00
617 lines
23 KiB
Python
617 lines
23 KiB
Python
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
|
|
"""
|
|
Semantic Clustering Service for Knowledge Graphs
|
|
Groups nodes by semantic similarity of names, types, and content rather than just spatial coordinates
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import time
|
|
from typing import Dict, List, Any, Tuple, Set, Optional
|
|
from dataclasses import dataclass
|
|
from collections import defaultdict
|
|
import numpy as np
|
|
import re
|
|
from difflib import SequenceMatcher
|
|
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
import networkx as nx
|
|
|
|
# Probe for the optional GPU stack (CuPy + cuML) once, at import time.
try:
    import cupy as cp
    import cuml
    from cuml.cluster import KMeans as cuKMeans, DBSCAN as cuDBSCAN
except ImportError:
    # Any missing GPU package leaves only the CPU code paths available.
    HAS_GPU = False
    print("⚠️ GPU libraries not available, using CPU for semantic clustering")
else:
    HAS_GPU = True
    print("✅ GPU libraries (CuPy, cuML) available for semantic clustering")

# Module-level logger shared by the classes and functions in this file.
logger = logging.getLogger(__name__)
|
|
|
|
@dataclass
class SemanticClusterResult:
    """Container for the output of one semantic clustering run."""

    # Copies of the input nodes, extended with 'cluster_id' and 'node_index'.
    clustered_nodes: List[Dict[str, Any]]
    # Metadata about the run (algorithm name, cluster count, timing, ...).
    cluster_info: Dict[str, Any]
    # Pairwise similarity matrix the clustering was based on, if one was built.
    similarity_matrix: Optional[np.ndarray] = None
    # One cluster label per node, in the same order as clustered_nodes.
    cluster_labels: Optional[np.ndarray] = None
|
|
|
|
class SemanticSimilarityCalculator:
    """Calculate semantic similarity between node names and content.

    Name similarity blends several string heuristics into one score in
    [0, 1]; content similarity is the cosine similarity of TF-IDF vectors
    built from the text fields of each node.
    """

    # Node fields (besides name and group/type) folded into the TF-IDF text.
    _CONTENT_KEYS = ('description', 'content', 'label', 'properties')

    def __init__(self):
        """Create the TF-IDF vectorizer used for content similarity."""
        self.tfidf_vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words='english',
            ngram_range=(1, 2),
            lowercase=True
        )
        # True once the vectorizer has been fitted on at least one corpus.
        self.fitted = False

    def calculate_name_similarity(self, name1: str, name2: str) -> float:
        """Calculate similarity between two node names using multiple methods.

        Args:
            name1: First node name. Non-string values are compared via str().
            name2: Second node name. Non-string values are compared via str().

        Returns:
            A score in [0.0, 1.0]: 1.0 for names that are equal after
            cleaning, 0.8 when one cleaned name contains the other, otherwise
            a weighted blend of sequence ratio, word overlap and common
            prefix/suffix. Falsy names always score 0.0.
        """
        if not name1 or not name2:
            return 0.0

        name1_clean = self._clean_name(name1)
        name2_clean = self._clean_name(name2)

        # A name made only of punctuation cleans to "". The empty string is a
        # substring of everything, so without this guard it would score 0.8
        # against any name (and 1.0 against any other punctuation-only name).
        if not name1_clean or not name2_clean:
            return 1.0 if str(name1) == str(name2) else 0.0

        # Method 1: Exact match
        if name1_clean == name2_clean:
            return 1.0

        # Method 2: Substring match
        if name1_clean in name2_clean or name2_clean in name1_clean:
            return 0.8

        # Method 3: Sequence similarity (difflib matching-blocks ratio,
        # which is not a Levenshtein distance)
        seq_similarity = SequenceMatcher(None, name1_clean, name2_clean).ratio()

        # Method 4: Word overlap (Jaccard similarity)
        words1 = set(name1_clean.split())
        words2 = set(name2_clean.split())
        if words1 and words2:
            jaccard_sim = len(words1 & words2) / len(words1 | words2)
        else:
            jaccard_sim = 0.0

        # Method 5: Common prefix/suffix
        prefix_sim = self._prefix_similarity(name1_clean, name2_clean)
        suffix_sim = self._suffix_similarity(name1_clean, name2_clean)

        # Combine similarities with weights (weights sum to 1.0)
        combined_similarity = (
            seq_similarity * 0.3 +
            jaccard_sim * 0.4 +
            prefix_sim * 0.15 +
            suffix_sim * 0.15
        )

        return min(combined_similarity, 1.0)

    def calculate_content_similarity(self, nodes: List[Dict[str, Any]]) -> np.ndarray:
        """Calculate content similarity matrix using TF-IDF.

        The vectorizer is re-fitted on every call: the matrix describes the
        nodes passed in, so a vocabulary learned from an earlier node set
        would silently ignore every term that set did not contain.

        Args:
            nodes: Node dictionaries; name, group/type, description, content,
                label and properties are used when present and truthy.

        Returns:
            An (n, n) cosine-similarity matrix. An empty input gives a (0, 0)
            matrix; if no usable term survives tokenisation, the identity
            matrix is returned (nodes are only similar to themselves).
        """
        if not nodes:
            return np.zeros((0, 0))

        texts = [self._node_text(node) for node in nodes]

        try:
            tfidf_matrix = self.tfidf_vectorizer.fit_transform(texts)
        except ValueError as exc:
            # scikit-learn raises ValueError for an empty vocabulary, e.g.
            # when every text consists of stop words or one-letter tokens.
            logger.warning(f"TF-IDF could not build a vocabulary, using identity similarity: {exc}")
            return np.eye(len(nodes))
        self.fitted = True

        # Calculate cosine similarity matrix
        return cosine_similarity(tfidf_matrix)

    def _node_text(self, node: Dict[str, Any]) -> str:
        """Join the textual fields of one node into a single TF-IDF document."""
        text_parts = []

        # Add node name
        if node.get('name'):
            text_parts.append(str(node['name']))

        # Add node type/group; fall back to 'type' whenever 'group' is falsy,
        # not only when the 'group' key is missing.
        category = node.get('group') or node.get('type')
        if category:
            text_parts.append(str(category))

        # Add any description or content
        text_parts.extend(str(node[key]) for key in self._CONTENT_KEYS if node.get(key))

        combined_text = ' '.join(text_parts)
        # Always hand the vectorizer a string, never None.
        return combined_text if combined_text.strip() else 'unnamed'

    def _clean_name(self, name: str) -> str:
        """Clean and normalize node name.

        Lowercases, replaces every character other than word characters,
        whitespace and '-' with a space, and collapses runs of whitespace.
        """
        if not name:
            return ""

        # Convert to lowercase; str() tolerates non-string names (e.g. ints)
        cleaned = str(name).lower().strip()

        # Replace special characters; word characters (letters, digits, '_'),
        # whitespace and hyphens are kept
        cleaned = re.sub(r'[^\w\s-]', ' ', cleaned)

        # Normalize whitespace
        cleaned = re.sub(r'\s+', ' ', cleaned)

        return cleaned.strip()

    def _prefix_similarity(self, name1: str, name2: str) -> float:
        """Return the common-prefix length divided by the shorter name's length."""
        min_len = min(len(name1), len(name2))
        if min_len == 0:
            return 0.0

        common_prefix = 0
        for char1, char2 in zip(name1, name2):
            if char1 != char2:
                break
            common_prefix += 1

        return common_prefix / min_len

    def _suffix_similarity(self, name1: str, name2: str) -> float:
        """Return the common-suffix length divided by the shorter name's length."""
        min_len = min(len(name1), len(name2))
        if min_len == 0:
            return 0.0

        common_suffix = 0
        for char1, char2 in zip(reversed(name1), reversed(name2)):
            if char1 != char2:
                break
            common_suffix += 1

        return common_suffix / min_len
|
|
|
|
class SemanticClusteringEngine:
    """Main semantic clustering engine.

    Clusters node dictionaries by name similarity, TF-IDF content similarity,
    or a weighted blend of name, content and spatial similarity. Every public
    method returns a SemanticClusterResult whose nodes are copies of the
    inputs extended with 'cluster_id' and 'node_index'.
    """

    # DBSCAN eps (maximum distance) used when no similarity threshold is
    # supplied; as a distance it corresponds to a minimum similarity of 0.7.
    _DEFAULT_DBSCAN_EPS = 0.3
    # Length scale of the exponential decay that maps distance to similarity.
    # Adjust based on your coordinate system.
    _SPATIAL_SCALE = 50.0

    def __init__(self, use_gpu: Optional[bool] = None):
        """Create an engine.

        Args:
            use_gpu: Request GPU acceleration. None means "use it when the
                GPU libraries were imported". A True request is downgraded to
                False when those libraries are unavailable, so the flag always
                reflects what can actually run.
        """
        self.use_gpu = HAS_GPU if use_gpu is None else (bool(use_gpu) and HAS_GPU)
        self.similarity_calc = SemanticSimilarityCalculator()
        logger.info(f"Semantic clustering engine initialized (GPU: {self.use_gpu})")

    def cluster_by_name_similarity(
        self,
        nodes: List[Dict[str, Any]],
        algorithm: str = "hierarchical",
        n_clusters: Optional[int] = None,
        similarity_threshold: float = 0.7
    ) -> "SemanticClusterResult":
        """
        Cluster nodes based on name similarity

        Args:
            nodes: List of node dictionaries
            algorithm: 'hierarchical', 'kmeans', 'dbscan'
            n_clusters: Number of clusters (for kmeans/hierarchical); defaults
                to min(10, n_nodes // 2) and is clamped to [1, n_nodes]
            similarity_threshold: Minimum similarity for clustering (for dbscan)

        Returns:
            SemanticClusterResult; fewer than two nodes yield a single cluster.

        Raises:
            ValueError: If algorithm is not one of the supported names.
        """
        start_time = time.time()
        n_nodes = len(nodes)

        logger.info(f"🧠 Starting semantic clustering of {n_nodes} nodes using {algorithm}")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        # Reject unknown algorithms before paying for the O(n^2) matrix.
        if algorithm not in ("hierarchical", "kmeans", "dbscan"):
            raise ValueError(f"Unknown clustering algorithm: {algorithm}")

        # Calculate name similarity matrix
        similarity_matrix = self._calculate_name_similarity_matrix(nodes)

        # Apply clustering algorithm
        if algorithm == "hierarchical":
            cluster_labels = self._hierarchical_clustering(
                self._to_distance(similarity_matrix),
                self._resolve_n_clusters(n_clusters, n_nodes)
            )
        elif algorithm == "kmeans":
            cluster_labels = self._kmeans_clustering(
                similarity_matrix, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        else:
            # DBSCAN's eps is a maximum *distance*, so a minimum similarity
            # of t corresponds to eps = 1 - t. eps must stay strictly positive.
            eps = max(1.0 - similarity_threshold, float(np.finfo(float).eps))
            cluster_labels = self._dbscan_clustering(
                self._to_distance(similarity_matrix), eps
            )

        # Calculate cluster statistics with plain-int keys and values so the
        # info dict stays JSON-serialisable.
        unique_labels, counts = np.unique(np.asarray(cluster_labels), return_counts=True)
        cluster_sizes = {
            int(label): int(count) for label, count in zip(unique_labels, counts)
        }

        return self._build_result(
            nodes,
            cluster_labels,
            similarity_matrix,
            start_time,
            algorithm_name=f'semantic_{algorithm}',
            # None of the name-clustering paths run on the GPU.
            gpu_accelerated=False,
            log_label="Semantic",
            extra_info={
                'cluster_sizes': cluster_sizes,
                'similarity_threshold': similarity_threshold if algorithm == 'dbscan' else None
            }
        )

    def cluster_by_content_similarity(
        self,
        nodes: List[Dict[str, Any]],
        algorithm: str = "kmeans",
        n_clusters: Optional[int] = None
    ) -> "SemanticClusterResult":
        """Cluster nodes based on content similarity using TF-IDF

        Args:
            nodes: List of node dictionaries
            algorithm: 'kmeans' (GPU when available), 'dbscan' or
                'hierarchical'. Any other value falls back to hierarchical
                clustering with a warning.
            n_clusters: Number of clusters (for kmeans/hierarchical); defaults
                to min(10, n_nodes // 2) and is clamped to [1, n_nodes]

        Returns:
            SemanticClusterResult; cluster_info['algorithm'] names the
            algorithm that actually ran.
        """
        start_time = time.time()
        n_nodes = len(nodes)

        logger.info(f"📄 Starting content-based clustering of {n_nodes} nodes")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        # Calculate content similarity
        similarity_matrix = self.similarity_calc.calculate_content_similarity(nodes)

        # Apply clustering
        if algorithm == "kmeans":
            algorithm_used = "kmeans"
            k = self._resolve_n_clusters(n_clusters, n_nodes)
            if self.use_gpu and HAS_GPU:
                cluster_labels = self._gpu_kmeans_clustering(similarity_matrix, k)
            else:
                cluster_labels = self._kmeans_clustering(similarity_matrix, k)
        elif algorithm == "dbscan":
            algorithm_used = "dbscan"
            cluster_labels = self._dbscan_clustering(
                self._to_distance(similarity_matrix), self._DEFAULT_DBSCAN_EPS
            )
        else:
            # Unknown names keep the historical hierarchical fallback, but it
            # is no longer silent and the result reports what really ran.
            if algorithm != "hierarchical":
                logger.warning(
                    f"Unknown clustering algorithm '{algorithm}', falling back to hierarchical"
                )
            algorithm_used = "hierarchical"
            cluster_labels = self._hierarchical_clustering(
                self._to_distance(similarity_matrix),
                self._resolve_n_clusters(n_clusters, n_nodes)
            )

        return self._build_result(
            nodes,
            cluster_labels,
            similarity_matrix,
            start_time,
            algorithm_name=f'content_{algorithm_used}',
            gpu_accelerated=bool(self.use_gpu and algorithm_used == 'kmeans'),
            log_label="Content"
        )

    def hybrid_clustering(
        self,
        nodes: List[Dict[str, Any]],
        name_weight: float = 0.6,
        content_weight: float = 0.3,
        spatial_weight: float = 0.1,
        algorithm: str = "hierarchical",
        n_clusters: Optional[int] = None
    ) -> "SemanticClusterResult":
        """
        Hybrid clustering combining name, content, and spatial similarities

        Args:
            nodes: List of node dictionaries
            name_weight: Weight for name similarity (0.0-1.0)
            content_weight: Weight for content similarity (0.0-1.0)
            spatial_weight: Weight for spatial similarity (0.0-1.0)
            algorithm: 'hierarchical', 'kmeans' or 'dbscan'. Any other value
                falls back to dbscan with a warning.
            n_clusters: Number of clusters (for kmeans/hierarchical); defaults
                to min(10, n_nodes // 2) and is clamped to [1, n_nodes]

        Returns:
            SemanticClusterResult whose similarity_matrix is the weighted
            blend. Weights are normalised to sum to 1 when their total is
            positive; if no weight is positive a single cluster is returned.
        """
        start_time = time.time()
        n_nodes = len(nodes)

        logger.info(f"🔄 Starting hybrid clustering of {n_nodes} nodes")
        logger.info(f" Weights: name={name_weight}, content={content_weight}, spatial={spatial_weight}")

        if n_nodes < 2:
            return self._create_single_cluster_result(nodes, start_time)

        # Normalize weights
        total_weight = name_weight + content_weight + spatial_weight
        if total_weight > 0:
            name_weight /= total_weight
            content_weight /= total_weight
            spatial_weight /= total_weight

        # Blend only the components that carry a positive weight, so unused
        # similarity matrices are never computed.
        combined_similarity = np.zeros((n_nodes, n_nodes))
        used_any = False

        if name_weight > 0:
            combined_similarity += self._calculate_name_similarity_matrix(nodes) * name_weight
            used_any = True

        if content_weight > 0:
            combined_similarity += (
                self.similarity_calc.calculate_content_similarity(nodes) * content_weight
            )
            used_any = True

        if spatial_weight > 0:
            combined_similarity += (
                self._calculate_spatial_similarity_matrix(nodes) * spatial_weight
            )
            used_any = True

        if not used_any:
            return self._create_single_cluster_result(nodes, start_time)

        # Apply clustering
        distance_matrix = self._to_distance(combined_similarity)

        if algorithm == "hierarchical":
            algorithm_used = "hierarchical"
            cluster_labels = self._hierarchical_clustering(
                distance_matrix, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        elif algorithm == "kmeans":
            algorithm_used = "kmeans"
            cluster_labels = self._kmeans_clustering(
                combined_similarity, self._resolve_n_clusters(n_clusters, n_nodes)
            )
        else:
            # Unknown names keep the historical dbscan fallback, but it is no
            # longer silent and the result reports what really ran.
            if algorithm != "dbscan":
                logger.warning(
                    f"Unknown clustering algorithm '{algorithm}', falling back to dbscan"
                )
            algorithm_used = "dbscan"
            cluster_labels = self._dbscan_clustering(distance_matrix, self._DEFAULT_DBSCAN_EPS)

        return self._build_result(
            nodes,
            cluster_labels,
            combined_similarity,
            start_time,
            algorithm_name=f'hybrid_{algorithm_used}',
            # All hybrid paths run on the CPU.
            gpu_accelerated=False,
            log_label="Hybrid",
            extra_info={
                'weights': {
                    'name': name_weight,
                    'content': content_weight,
                    'spatial': spatial_weight
                }
            }
        )

    @staticmethod
    def _resolve_n_clusters(n_clusters: Optional[int], n_nodes: int) -> int:
        """Pick the default cluster count and clamp it to [1, n_nodes]."""
        requested = n_clusters or min(10, n_nodes // 2)
        return max(1, min(int(requested), n_nodes))

    @staticmethod
    def _to_distance(similarity_matrix: np.ndarray) -> np.ndarray:
        """Convert similarity to distance, clipping rounding noise below zero."""
        # Cosine similarities can exceed 1.0 by a few ulps; a precomputed
        # distance matrix must not contain negative entries.
        return np.clip(1.0 - similarity_matrix, 0.0, None)

    def _build_result(
        self,
        nodes: List[Dict[str, Any]],
        cluster_labels: np.ndarray,
        similarity_matrix: Optional[np.ndarray],
        start_time: float,
        algorithm_name: str,
        gpu_accelerated: bool,
        log_label: str,
        extra_info: Optional[Dict[str, Any]] = None
    ) -> "SemanticClusterResult":
        """Annotate nodes with their labels and assemble the result object."""
        labels = np.asarray(cluster_labels)

        # Copy each node so the caller's dictionaries are left untouched.
        clustered_nodes = [
            {**node, 'cluster_id': int(label), 'node_index': index}
            for index, (node, label) in enumerate(zip(nodes, labels))
        ]

        processing_time = time.time() - start_time
        unique_clusters = len(set(labels.tolist()))

        cluster_info = {
            'algorithm': algorithm_name,
            'total_clusters': unique_clusters,
            'processing_time': processing_time,
            'gpu_accelerated': gpu_accelerated,
            'average_cluster_size': len(nodes) / unique_clusters if unique_clusters > 0 else 0
        }
        if extra_info:
            cluster_info.update(extra_info)

        logger.info(f"✅ {log_label} clustering completed: {unique_clusters} clusters in {processing_time:.3f}s")

        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=similarity_matrix,
            cluster_labels=labels
        )

    def _calculate_name_similarity_matrix(self, nodes: List[Dict[str, Any]]) -> np.ndarray:
        """Calculate pairwise name similarity matrix (symmetric, unit diagonal)"""
        n_nodes = len(nodes)
        names = [node.get('name', '') for node in nodes]
        similarity_matrix = np.eye(n_nodes)

        # Only the upper triangle is computed; each value is mirrored.
        for i in range(n_nodes):
            for j in range(i + 1, n_nodes):
                similarity = self.similarity_calc.calculate_name_similarity(names[i], names[j])
                similarity_matrix[i, j] = similarity
                similarity_matrix[j, i] = similarity

        return similarity_matrix

    def _calculate_spatial_similarity_matrix(self, nodes: List[Dict[str, Any]]) -> np.ndarray:
        """Calculate spatial similarity based on node positions.

        Reads 'x', 'y' and 'z' from each node (missing or None counts as 0)
        and maps Euclidean distance d to exp(-d / scale), so closer nodes are
        more similar and the diagonal is exactly 1.0.
        """
        if not nodes:
            return np.zeros((0, 0))

        # Extract coordinates
        coords = np.array([
            [float(node.get(axis) or 0) for axis in ('x', 'y', 'z')]
            for node in nodes
        ])

        # All pairwise Euclidean distances in one broadcasted step.
        deltas = coords[:, np.newaxis, :] - coords[np.newaxis, :, :]
        distances = np.linalg.norm(deltas, axis=-1)

        # Convert distance to similarity with exponential decay
        return np.exp(-distances / self._SPATIAL_SCALE)

    def _hierarchical_clustering(self, distance_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply average-linkage hierarchical clustering to a distance matrix"""
        clusterer = AgglomerativeClustering(
            n_clusters=n_clusters,
            metric='precomputed',
            linkage='average'
        )
        return clusterer.fit_predict(distance_matrix)

    def _kmeans_clustering(self, similarity_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply K-means, using each row of the similarity matrix as features"""
        clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        return clusterer.fit_predict(similarity_matrix)

    def _gpu_kmeans_clustering(self, similarity_matrix: np.ndarray, n_clusters: int) -> np.ndarray:
        """Apply GPU-accelerated K-means clustering, falling back to CPU on error"""
        try:
            gpu_matrix = cp.array(similarity_matrix, dtype=cp.float32)
            clusterer = cuKMeans(n_clusters=n_clusters, random_state=42)
            labels = clusterer.fit_predict(gpu_matrix)
            return cp.asnumpy(labels)
        except Exception as e:
            # Deliberate best-effort: any GPU failure degrades to the CPU path.
            logger.warning(f"GPU K-means failed, falling back to CPU: {e}")
            return self._kmeans_clustering(similarity_matrix, n_clusters)

    def _dbscan_clustering(self, distance_matrix: np.ndarray, eps: float) -> np.ndarray:
        """Apply DBSCAN to a precomputed distance matrix.

        Args:
            distance_matrix: Square matrix of pairwise distances.
            eps: Maximum distance at which two nodes count as neighbours.

        Returns:
            Non-negative labels. DBSCAN marks noise points with -1; they are
            all moved together into one extra cluster whose id follows the
            largest real label.
        """
        clusterer = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
        labels = np.asarray(clusterer.fit_predict(distance_matrix))

        noise_mask = labels == -1
        if noise_mask.any():
            labels = labels.copy()
            # When every point is noise the maximum is -1, so the shared
            # noise cluster becomes cluster 0.
            labels[noise_mask] = labels.max() + 1

        return labels

    def _create_single_cluster_result(self, nodes: List[Dict[str, Any]], start_time: float) -> "SemanticClusterResult":
        """Create result for single cluster (when too few nodes)"""
        clustered_nodes = [
            {**node, 'cluster_id': 0, 'node_index': index}
            for index, node in enumerate(nodes)
        ]

        processing_time = time.time() - start_time

        cluster_info = {
            'algorithm': 'single_cluster',
            'total_clusters': 1,
            'processing_time': processing_time,
            'gpu_accelerated': False,
            'average_cluster_size': len(nodes)
        }

        return SemanticClusterResult(
            clustered_nodes=clustered_nodes,
            cluster_info=cluster_info,
            similarity_matrix=None,
            cluster_labels=np.zeros(len(nodes), dtype=int)
        )
|
|
|
|
# Convenience functions for easy integration
async def cluster_nodes_by_similarity(
    nodes: List[Dict[str, Any]],
    method: str = "hybrid",
    algorithm: str = "hierarchical",
    n_clusters: Optional[int] = None,
    **kwargs
) -> SemanticClusterResult:
    """
    Main entry point for semantic clustering

    The clustering itself is CPU-bound and synchronous, so it is run in the
    event loop's default executor; awaiting this coroutine therefore does not
    block other tasks on the loop.

    Args:
        nodes: List of node dictionaries
        method: 'name', 'content', 'hybrid'
        algorithm: 'hierarchical', 'kmeans', 'dbscan'
        n_clusters: Number of clusters (if applicable)
        **kwargs: Additional parameters for specific methods

    Returns:
        The SemanticClusterResult produced by the selected engine method.

    Raises:
        ValueError: If method is not 'name', 'content' or 'hybrid'.
    """
    # Fail fast on a bad method name, before building an engine.
    if method not in ("name", "content", "hybrid"):
        raise ValueError(f"Unknown clustering method: {method}")

    # A fresh engine per call: no state is shared with other calls.
    engine = SemanticClusteringEngine()

    def _run() -> SemanticClusterResult:
        """Dispatch to the engine method selected by `method`."""
        if method == "name":
            return engine.cluster_by_name_similarity(nodes, algorithm, n_clusters, **kwargs)
        if method == "content":
            return engine.cluster_by_content_similarity(nodes, algorithm, n_clusters, **kwargs)
        return engine.hybrid_clustering(nodes, algorithm=algorithm, n_clusters=n_clusters, **kwargs)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _run)
|
|
|
|
if __name__ == "__main__":
    # Demo: cluster a handful of hand-written nodes with the hybrid method.
    test_nodes = [
        dict(name="Machine Learning", x=0, y=0, z=0, group="AI"),
        dict(name="Deep Learning", x=10, y=5, z=2, group="AI"),
        dict(name="Neural Networks", x=15, y=8, z=3, group="AI"),
        dict(name="Data Science", x=20, y=10, z=5, group="Data"),
        dict(name="Statistics", x=25, y=15, z=8, group="Math"),
        dict(name="Linear Algebra", x=30, y=20, z=10, group="Math"),
    ]

    async def test():
        """Run the hybrid clustering once and print each node's cluster."""
        outcome = await cluster_nodes_by_similarity(test_nodes, method="hybrid")
        print("Cluster Result:", outcome.cluster_info)
        for entry in outcome.clustered_nodes:
            print(f" {entry['name']} -> Cluster {entry['cluster_id']}")

    asyncio.run(test())
|