mirror of
https://github.com/NVIDIA/dgx-spark-playbooks.git
synced 2026-04-25 11:23:52 +00:00
88 lines
2.8 KiB
Python
88 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Download and process the MTEB raw_biorxiv dataset for the txt2kg demo.
Keep only papers in genetics/genomics categories and write each one to its own txt file.
|
|
"""
|
|
|
|
import os
|
|
import re
|
|
from pathlib import Path
|
|
from datasets import load_dataset
|
|
|
|
def sanitize_filename(text, max_length=100):
    """Convert text to a string that is safe to use as a filename.

    Characters other than word characters, whitespace and hyphens are
    dropped, each run of whitespace/hyphens becomes a single underscore,
    and leading/trailing underscores are stripped.

    Args:
        text: Source text, e.g. a paper title.
        max_length: Maximum number of characters kept in the result.

    Returns:
        The sanitized name, at most ``max_length`` characters long. It is
        empty when ``text`` contains no usable characters.
    """
    # Drop special characters, then collapse separator runs into underscores
    filename = re.sub(r'[^\w\s-]', '', text)
    filename = re.sub(r'[-\s]+', '_', filename)
    filename = filename.strip('_')

    # Truncate if too long. The cut can land right after a separator, so
    # strip again to avoid leaving a dangling trailing underscore.
    if len(filename) > max_length:
        filename = filename[:max_length].rstrip('_')

    return filename
|
|
|
|
def main():
    """Export genetics/genomics papers from MTEB raw_biorxiv as txt files.

    Loads the ``train`` split of ``mteb/raw_biorxiv``, keeps the papers whose
    category contains "genetic" or "genomic" (case-insensitive), and writes
    one UTF-8 txt file per paper (title, ID, category, abstract) into the
    relative directory ``biorxiv_genetics_genomics/``. If no paper matches,
    the categories present in the dataset are printed instead and no files
    are written.
    """
    # Local import: only the statistics at the end of this function need it.
    from collections import Counter

    print("Loading MTEB raw_biorxiv dataset...")

    # Load the dataset and take the train split
    ds = load_dataset("mteb/raw_biorxiv")
    train_data = ds['train']

    print(f"Total dataset size: {len(train_data)} papers")

    # Filter for genetics or genomics categories (substring match, so
    # e.g. both "genetics" and "genomics" are picked up)
    keywords = ('genetic', 'genomic')
    genetics_genomics_data = [
        item for item in train_data
        if any(keyword in item['category'].lower() for keyword in keywords)
    ]

    print(f"Found {len(genetics_genomics_data)} papers with genetics/genomics categories")

    if not genetics_genomics_data:
        # Nothing matched: show which categories exist so the filter can
        # be adjusted, then stop without creating any output.
        categories = {item['category'] for item in train_data}
        print("Available categories:")
        for cat in sorted(categories):
            print(f" - {cat}")
        return

    # Create output directory
    output_dir = Path("biorxiv_genetics_genomics")
    output_dir.mkdir(exist_ok=True)

    print(f"Creating txt files in {output_dir}/")

    # Process each paper
    for index, item in enumerate(genetics_genomics_data, start=1):
        # Create filename from a running number, the title and the ID;
        # '/' in the ID would otherwise be read as a path separator.
        title_part = sanitize_filename(item['title'], max_length=50)
        paper_id = item['id'].replace('/', '_')
        filename = f"{index:03d}_{title_part}_{paper_id}.txt"

        # Create file content
        content = (
            f"Title: {item['title']}\n"
            f"ID: {item['id']}\n"
            f"Category: {item['category']}\n"
            f"\nAbstract:\n{item['abstract']}\n"
        )

        # Write to file
        file_path = output_dir / filename
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    print(f"Successfully created {len(genetics_genomics_data)} txt files in {output_dir}/")

    # Show some statistics: count every category in a single pass instead
    # of rescanning the whole list once per category.
    category_counts = Counter(item['category'] for item in genetics_genomics_data)
    print("\nCategories included:")
    for cat, count in sorted(category_counts.items()):
        print(f" - {cat}: {count} papers")
|
|
# Run the export only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|