dgx-spark-playbooks/nvidia/txt2kg/assets/examples/download_biorxiv_dataset.py

#!/usr/bin/env python3
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Download and process the MTEB raw_biorxiv dataset for txt2kg demo.
Filter for genetics/genomics categories and create individual txt files.
"""

import os
import re
from collections import Counter
from pathlib import Path

from datasets import load_dataset

def sanitize_filename(text: str, max_length: int = 100) -> str:
    """Convert text to a safe filename fragment.

    Characters other than word characters, whitespace and hyphens are
    dropped; each run of whitespace and/or hyphens becomes a single
    underscore; leading and trailing underscores are removed.

    Args:
        text: Arbitrary text, e.g. a paper title.
        max_length: Maximum number of characters to keep.

    Returns:
        The sanitized text, at most ``max_length`` characters long and never
        ending in an underscore. May be empty if ``text`` contains no
        usable characters.
    """
    # Drop everything that is not a word character, whitespace or hyphen.
    filename = re.sub(r'[^\w\s-]', '', text)
    # Collapse separator runs (spaces, newlines, hyphens) into one underscore.
    filename = re.sub(r'[-\s]+', '_', filename)
    filename = filename.strip('_')

    # Truncate if too long. The cut can land right after a separator, so
    # strip again to avoid a dangling trailing underscore.
    if len(filename) > max_length:
        filename = filename[:max_length].rstrip('_')

    return filename

def main():
    """Download MTEB raw_biorxiv and export genetics/genomics papers as txt.

    Loads the ``train`` split of the ``mteb/raw_biorxiv`` dataset, keeps the
    papers whose category contains "genetic" or "genomic" (case-insensitive),
    and writes one UTF-8 text file per paper (title, ID, category, abstract)
    into ``biorxiv_genetics_genomics/`` relative to the current working
    directory. A per-category count is printed at the end.

    If no paper matches, the categories present in the dataset are printed
    and nothing is written.
    """
    print("Loading MTEB raw_biorxiv dataset...")

    # Load the dataset and pick the train split
    ds = load_dataset("mteb/raw_biorxiv")
    train_data = ds['train']

    print(f"Total dataset size: {len(train_data)} papers")

    # Filter for genetics or genomics categories (substring match, so that
    # variants such as "genetics" and "genomics" are both caught)
    genetics_genomics_data = []
    for item in train_data:
        category = item['category'].lower()
        if 'genetic' in category or 'genomic' in category:
            genetics_genomics_data.append(item)

    print(f"Found {len(genetics_genomics_data)} papers with genetics/genomics categories")

    if not genetics_genomics_data:
        # Nothing matched: list the available categories to help diagnose
        # why the filter found nothing.
        categories = {item['category'] for item in train_data}
        print("Available categories:")
        for cat in sorted(categories):
            print(f"  - {cat}")
        return

    # Create output directory
    output_dir = Path("biorxiv_genetics_genomics")
    output_dir.mkdir(exist_ok=True)

    print(f"Creating txt files in {output_dir}/")

    # Process each paper
    for index, item in enumerate(genetics_genomics_data, start=1):
        # Build the filename from a running index, a shortened title and the
        # paper ID; slashes in the ID would otherwise act as path separators.
        title_part = sanitize_filename(item['title'], max_length=50)
        paper_id = item['id'].replace('/', '_')
        filename = f"{index:03d}_{title_part}_{paper_id}.txt"

        # Create file content
        content = (
            f"Title: {item['title']}\n"
            f"ID: {item['id']}\n"
            f"Category: {item['category']}\n"
            f"\nAbstract:\n{item['abstract']}\n"
        )

        # Write to file
        with open(output_dir / filename, 'w', encoding='utf-8') as f:
            f.write(content)

    print(f"Successfully created {len(genetics_genomics_data)} txt files in {output_dir}/")

    # Show some statistics: count every category in a single pass instead of
    # re-scanning the whole list once per category.
    category_counts = Counter(item['category'] for item in genetics_genomics_data)
    print("\nCategories included:")
    for cat, count in sorted(category_counts.items()):
        print(f"  - {cat}: {count} papers")

# Run the download/export only when executed as a script, not on import.
if __name__ == "__main__":
    main()