From 505cacdbd6ab154a0a6b322c5ede26ef440d8981 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Sat, 18 Oct 2025 21:28:42 +0000 Subject: [PATCH 01/10] chore: Regenerate all playbooks --- README.md | 2 +- nvidia/nemo-fine-tune/README.md | 161 ++++++++++---------------------- nvidia/vibe-coding/README.md | 5 +- nvidia/vllm/README.md | 2 +- nvidia/vss/README.md | 10 +- 5 files changed, 64 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 500bfd4..27897b7 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ Each playbook includes prerequisites, step-by-step instructions, troubleshooting - [Install and Use vLLM for Inference](nvidia/vllm/) - [Vision-Language Model Fine-tuning](nvidia/vlm-finetuning/) - [VS Code](nvidia/vscode/) -- [Video Search and Summarization](nvidia/vss/) +- [Build a Video Search and Summarization (VSS) Agent](nvidia/vss/) ## Resources diff --git a/nvidia/nemo-fine-tune/README.md b/nvidia/nemo-fine-tune/README.md index 28fd1d4..d1abc48 100644 --- a/nvidia/nemo-fine-tune/README.md +++ b/nvidia/nemo-fine-tune/README.md @@ -52,7 +52,7 @@ All necessary files for the playbook can be found [here on GitHub](https://githu ## Step 1. Verify system requirements -Check your NVIDIA Spark device meets the prerequisites for [NeMo AutoModel](https://github.com/NVIDIA-NeMo/Automodel) installation. This step runs on the host system to confirm CUDA toolkit availability and Python version compatibility. +Check your NVIDIA Spark device meets the prerequisites for NeMo AutoModel installation. This step runs on the host system to confirm CUDA toolkit availability and Python version compatibility. ```bash ## Verify CUDA installation @@ -169,19 +169,6 @@ uv run --frozen --no-sync python -c "import nemo_automodel; print('✅ NeMo Auto ## Check available examples ls -la examples/ - -## Example output: -$ ls -la examples/ -total 36 -drwxr-xr-x 9 akoumparouli domain-users 4096 Oct 16 14:52 . -drwxr-xr-x 16 akoumparouli domain-users 4096 Oct 16 14:52 .. 
-drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 benchmark
-drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 diffusion
-drwxr-xr-x 20 akoumparouli domain-users 4096 Oct 16 14:52 llm_finetune
-drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 14 09:27 llm_kd
-drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 16 14:52 llm_pretrain
-drwxr-xr-x 6 akoumparouli domain-users 4096 Oct 14 09:27 vlm_finetune
-drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 14 09:27 vlm_generate
```

## Step 8. Explore available examples
@@ -206,37 +193,36 @@ First, export your HF_TOKEN so that gated models can be downloaded.
export HF_TOKEN=
```
> [!NOTE]
-> Replace `` with your personal Hugging Face access token. A valid token is required to download any gated model.
->
-> - Generate a token: [Hugging Face tokens](https://huggingface.co/settings/tokens), guide available [here](https://huggingface.co/docs/hub/en/security-tokens).
-> - Request and receive access on each model's page (and accept license/terms) before attempting downloads.
-> - Llama-3.1-8B: [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
-> - Qwen3-8B: [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)
-> - Mixtral-8x7B: [mistralai/Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B)
->
-> The same steps apply for any other gated model you use: visit its model card on Hugging Face, request access, accept the license, and wait for approval.
+> Replace `` with your Hugging Face access token; a valid token is required to download gated models (e.g., Llama).
+
+**Full Fine-tuning example:**
+
+Once inside the `Automodel` directory you cloned from GitHub, run:
+
+```bash
+uv run --frozen --no-sync \
+examples/llm_finetune/finetune.py \
+-c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \
+--step_scheduler.local_batch_size 1 \
+--loss_fn._target_ nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy \
+--model.pretrained_model_name_or_path Qwen/Qwen3-8B
+```
+These overrides ensure the Qwen3-8B SFT run behaves as expected:
+- `--model.pretrained_model_name_or_path`: selects the Qwen/Qwen3-8B model to fine-tune (weights fetched via your Hugging Face token).
+- `--loss_fn._target_`: uses the TransformerEngine-parallel cross-entropy loss variant compatible with tensor-parallel training for large LLMs.
+- `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe.

**LoRA fine-tuning example:**

Execute a basic fine-tuning example to validate the complete setup. This demonstrates parameter-efficient fine-tuning using a small model suitable for testing.
-For the examples below, we are using YAML for configuration, and parameter overrides are passed as command line arguments.

```bash
## Run basic LLM fine-tuning example
uv run --frozen --no-sync \
examples/llm_finetune/finetune.py \
-c examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml \
---model.pretrained_model_name_or_path meta-llama/Llama-3.1-8B \
---packed_sequence.packed_sequence_size 1024 \
---step_scheduler.max_steps 100
+--model.pretrained_model_name_or_path meta-llama/Llama-3.1-8B
```
-
-These overrides ensure the Llama-3.1-8B LoRA run behaves as expected:
-- `--model.pretrained_model_name_or_path`: selects the Llama-3.1-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token).
-- `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. - - **QLoRA fine-tuning example:** We can use QLoRA to fine-tune large models in a memory-efficient manner. @@ -247,61 +233,50 @@ examples/llm_finetune/finetune.py \ -c examples/llm_finetune/llama3_1/llama3_1_8b_squad_qlora.yaml \ --model.pretrained_model_name_or_path meta-llama/Meta-Llama-3-70B \ --loss_fn._target_ nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy \ ---step_scheduler.local_batch_size 1 \ ---packed_sequence.packed_sequence_size 1024 \ ---step_scheduler.max_steps 100 +--step_scheduler.local_batch_size 1 ``` These overrides ensure the 70B QLoRA run behaves as expected: - `--model.pretrained_model_name_or_path`: selects the 70B base model to fine-tune (weights fetched via your Hugging Face token). - `--loss_fn._target_`: uses the TransformerEngine-parallel cross-entropy loss variant compatible with tensor-parallel training for large LLMs. - `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit 70B in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. -- `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training. -**Full Fine-tuning example:** +## Step 10. Validate training output -Once inside the `Automodel` directory you cloned from GitHub, run: +Check that fine-tuning completed successfully and inspect the generated model artifacts. This confirms the training pipeline works correctly on your Spark device. 
```bash -uv run --frozen --no-sync \ -examples/llm_finetune/finetune.py \ --c examples/llm_finetune/qwen/qwen3_8b_squad_spark.yaml \ ---model.pretrained_model_name_or_path Qwen/Qwen3-8B \ ---step_scheduler.local_batch_size 1 \ ---step_scheduler.max_steps 100 \ ---packed_sequence.packed_sequence_size 1024 +## Check training logs +ls -la logs/ + +## Verify model checkpoint creation +ls -la checkpoints/ + +## Test model inference (if applicable) +uv run python -c " +import torch +print('GPU available:', torch.cuda.is_available()) +print('GPU count:', torch.cuda.device_count()) +" ``` -These overrides ensure the Qwen3-8B SFT run behaves as expected: -- `--model.pretrained_model_name_or_path`: selects the Qwen/Qwen3-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token). Adjust this if you want to fine-tune a different model. -- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstation purposes, please adjust this based on your needs. -- `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe. +## Step 11. Validate complete setup -## Step 10. Validate successful training completion - -Validate the fine-tuned model by inspecting artifacts contained in the checkpoint directory. +Perform final validation to ensure all components are working correctly. This comprehensive check confirms the environment is ready for production fine-tuning workflows. ```bash -## Inspect logs and checkpoint output. -## The LATEST is a symlink pointing to the latest checkpoint. -## The checkpoint is the one that was saved during training. -## below is an example of the expected output (username and domain-users are placeholders). 
-ls -lah checkpoints/LATEST/
-
-## $ ls -lah checkpoints/LATEST/
-## total 32K
-## drwxr-xr-x 6 akoumparouli domain-users 4.0K Oct 16 22:33 .
-## drwxr-xr-x 4 akoumparouli domain-users 4.0K Oct 16 22:33 ..
-## -rw-r--r-- 1 akoumparouli domain-users 1.6K Oct 16 22:33 config.yaml
-## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 dataloader
-## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 model
-## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 optim
-## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 rng
-## -rw-r--r-- 1 akoumparouli domain-users 1.3K Oct 16 22:33 step_scheduler.pt
+## Test complete pipeline
+uv run python -c "
+import nemo_automodel
+import torch
+print('✅ NeMo AutoModel version:', nemo_automodel.__version__)
+print('✅ CUDA available:', torch.cuda.is_available())
+print('✅ GPU count:', torch.cuda.device_count())
+print('✅ Setup complete')
+"
```

-## Step 11. Cleanup and rollback (Optional)
+## Step 12. Cleanup and rollback

Remove the installation and restore the original environment if needed. These commands safely remove all installed components.
@@ -322,42 +297,8 @@ pip3 uninstall uv

## Clear Python cache
rm -rf ~/.cache/pip
```
-## Step 12. Optional: Publish your fine-tuned model checkpoint on Hugging Face Hub
-Publish your fine-tuned model checkpoint on Hugging Face Hub.
-> [!NOTE]
-> This is an optional step and is not required for using the fine-tuned model.
-> It is useful if you want to share your fine-tuned model with others or use it in other projects.
-> You can also use the fine-tuned model in other projects by cloning the repository and using the checkpoint.
-> To use the fine-tuned model in other projects, you need to have the Hugging Face CLI installed.
-> You can install the Hugging Face CLI by running `pip install huggingface-cli`.
-> For more information, please refer to the [Hugging Face CLI documentation](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
-
-> [!TIP]
-> You can use the `hf` command to upload the fine-tuned model checkpoint to Hugging Face Hub.
-> For more information, please refer to the [Hugging Face CLI documentation](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
-
-```bash
-## Publish the fine-tuned model checkpoint to Hugging Face Hub
-## will be published under the namespace /my-cool-model, adjust name as needed.
-hf upload my-cool-model checkpoints/LATEST/model
-```
-
-> [!TIP]
-> The above command can fail if you don't have write permissions to the Hugging Face Hub, with the HF_TOKEN you used.
-> Sample error message:
-> ```bash
-> akoumparouli@1604ab7-lcedt:/mnt/4tb/auto/Automodel8$ hf upload my-cool-model checkpoints/LATEST/model
-> Traceback (most recent call last):
-> File "/home/akoumparouli/.local/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
-> response.raise_for_status()
-> File "/home/akoumparouli/.local/lib/python3.10/site-packages/requests/models.py", line 1024, in raise_for_status
-> raise HTTPError(http_error_msg, response=self)
-> requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/api/repos/create
-> ```
-> To fix this, you need to create an access token with *write* permissions, please see the Hugging Face guide [here](https://huggingface.co/docs/hub/en/security-tokens) for instructions.
-
-## Step 12. Next steps
+## Step 13. Next steps

Begin using NeMo AutoModel for your specific fine-tuning tasks. Start with provided recipes and customize based on your model requirements and dataset.
@@ -369,7 +310,7 @@ cp recipes/llm_finetune/finetune.py my_custom_training.py

## Then run:
uv run my_custom_training.py
```
Consider setting up custom datasets, experimenting with different model architectures, and scaling to multi-node distributed training for larger models. +Explore the [NeMo AutoModel GitHub repository](https://github.com/NVIDIA-NeMo/Automodel) for advanced recipes, documentation, and community examples. Consider setting up custom datasets, experimenting with different model architectures, and scaling to multi-node distributed training for larger models. ## Troubleshooting @@ -383,8 +324,8 @@ Explore the [NeMo AutoModel GitHub repository](https://github.com/NVIDIA-NeMo/Au | Cannot access gated repo for URL | Certain HuggingFace models have restricted access | Regenerate your [HuggingFace token](https://huggingface.co/docs/hub/en/security-tokens); and request access to the [gated model](https://huggingface.co/docs/hub/en/models-gated#customize-requested-information) on your web browser | > [!NOTE] -> DGX Spark uses a Unified Memory Architecture (UMA), which enables dynamic memory sharing between the GPU and CPU. -> With many applications still updating to take advantage of UMA, you may encounter memory issues even when within +> DGX Spark uses a Unified Memory Architecture (UMA), which enables dynamic memory sharing between the GPU and CPU. +> With many applications still updating to take advantage of UMA, you may encounter memory issues even when within > the memory capacity of DGX Spark. If that happens, manually flush the buffer cache with: ```bash sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches' diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index 4658a79..f54f5e3 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -167,7 +167,10 @@ Add additional model entries for any other Ollama models you wish to host remote ## Common Issues **1. Ollama not starting** -- Verify Docker and GPU drivers are installed correctly. +- Verify GPU drivers are installed correctly. + Run `nvidia-smi` in the terminal. 
If the command fails check DGX Dashboard for updates to your DGX Spark. + If there are no updates or updates do not correct the issue, create a thread on the DGX Spark/GB10 User forum here : + https://forums.developer.nvidia.com/c/accelerated-computing/dgx-spark-gb10/dgx-spark-gb10/ - Run `ollama serve` on the DGX Spark to view Ollama logs. **2. Continue can't connect over the network** diff --git a/nvidia/vllm/README.md b/nvidia/vllm/README.md index 899a3af..25e733c 100644 --- a/nvidia/vllm/README.md +++ b/nvidia/vllm/README.md @@ -340,7 +340,7 @@ http://192.168.100.10:8265 | Container registry authentication fails | Invalid or expired GitLab token | Generate new auth token | | SM_121a architecture not recognized | Missing LLVM patches | Verify SM_121a patches applied to LLVM source | -## Common Issues for running on two Starks +## Common Issues for running on two Sparks | Symptom | Cause | Fix | |---------|--------|-----| | Node 2 not visible in Ray cluster | Network connectivity issue | Verify QSFP cable connection, check IP configuration | diff --git a/nvidia/vss/README.md b/nvidia/vss/README.md index 2922582..5679cd7 100644 --- a/nvidia/vss/README.md +++ b/nvidia/vss/README.md @@ -1,4 +1,4 @@ -# Video Search and Summarization +# Build a Video Search and Summarization (VSS) Agent > Run the VSS Blueprint on your Spark @@ -30,8 +30,8 @@ You will deploy NVIDIA's VSS AI Blueprint on NVIDIA Spark hardware with Blackwel ## Prerequisites - NVIDIA Spark device with ARM64 architecture and Blackwell GPU -- FastOS 1.81.38 or compatible ARM64 system -- Driver version 580.82.09 or higher installed: `nvidia-smi | grep "Driver Version"` +- NVIDIA DGX OS 7.2.3 or higher +- Driver version 580.95.05 or higher installed: `nvidia-smi | grep "Driver Version"` - CUDA version 13.0 installed: `nvcc --version` - Docker installed and running: `docker --version && docker compose version` - Access to NVIDIA Container Registry with [NGC API Key](https://org.ngc.nvidia.com/setup/api-keys) 
@@ -278,6 +278,10 @@ Open these URLs in your browser: In this hybrid deployment, we would use NIMs from [build.nvidia.com](https://build.nvidia.com/). Alternatively, you can configure your own hosted endpoints by following the instructions in the [VSS remote deployment guide](https://docs.nvidia.com/vss/latest/content/installation-remote-docker-compose.html). +> [!NOTE] +> Fully local deployment using smaller LLM (Llama 3.1 8B) is also possible. +> To set up a fully local VSS deployment, follow the [instructions in the VSS documentation](https://docs.nvidia.com/vss/latest/content/vss_dep_docker_compose_arm.html#local-deployment-single-gpu-dgx-spark). + **9.1 Get NVIDIA API Key** - Log in to https://build.nvidia.com/explore/discover. From 752eada0cb1cb59929671f67a83a5dafbd4eef16 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Sat, 18 Oct 2025 21:48:15 +0000 Subject: [PATCH 02/10] chore: Regenerate all playbooks --- nvidia/dgx-dashboard/README.md | 5 +++++ nvidia/trt-llm/README.md | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/nvidia/dgx-dashboard/README.md b/nvidia/dgx-dashboard/README.md index 60971ad..f45db66 100644 --- a/nvidia/dgx-dashboard/README.md +++ b/nvidia/dgx-dashboard/README.md @@ -126,6 +126,11 @@ Verify your setup by running a simple Stable Diffusion XL image generation examp 3. Add a new cell and paste the following code: ```python +import warnings +warnings.filterwarnings('ignore', message='.*cuda capability.*') +import tqdm.auto +tqdm.auto.tqdm = tqdm.std.tqdm + from diffusers import DiffusionPipeline import torch from PIL import Image diff --git a/nvidia/trt-llm/README.md b/nvidia/trt-llm/README.md index 806fc5b..8b4a422 100644 --- a/nvidia/trt-llm/README.md +++ b/nvidia/trt-llm/README.md @@ -414,7 +414,7 @@ docker rmi nvcr.io/nvidia/tensorrt-llm/release:spark-single-gpu-dev ### Step 1. 
Configure network connectivity -Follow the network setup instructions from the [Connect two Sparks](https://build.nvidia.com/spark/stack-sparks/stacked-sparks) playbook to establish connectivity between your DGX Spark nodes. +Follow the network setup instructions from the [Connect two Sparks](https://build.nvidia.com/spark/connect-two-sparks/stacked-sparks) playbook to establish connectivity between your DGX Spark nodes. This includes: - Physical QSFP cable connection From 11f2a77ea79d7920997a237050b29b2af8550d73 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Tue, 21 Oct 2025 00:57:26 +0000 Subject: [PATCH 03/10] chore: Regenerate all playbooks --- nvidia/cuda-x-data-science/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/nvidia/cuda-x-data-science/README.md b/nvidia/cuda-x-data-science/README.md index f936cf2..2d9132f 100644 --- a/nvidia/cuda-x-data-science/README.md +++ b/nvidia/cuda-x-data-science/README.md @@ -29,7 +29,7 @@ You will accelerate popular machine learning algorithms and data analytics opera ## Time & risk * **Duration:** 20-30 minutes setup time and 2-3 minutes to run each notebook. -* **Risk level:** +* **Risks:** * Data download slowness or failure due to network issues * Kaggle API generation failure requiring retries * **Rollback:** No permanent system changes made during normal usage. @@ -42,19 +42,18 @@ You will accelerate popular machine learning algorithms and data analytics opera - Create Kaggle API key using [these instructions](https://www.kaggle.com/discussions/general/74235) and place the **kaggle.json** file in the same folder as the notebook ## Step 2. 
Installing Data Science libraries
-- Use the following command to install the CUDA-X libraries (this will create a new conda environment)
+Use the following command to install the CUDA-X libraries (this will create a new conda environment):
```bash
conda create -n rapids-test -c rapidsai-nightly -c conda-forge -c nvidia \
    rapids=25.10 python=3.12 'cuda-version=13.0' \
    jupyter hdbscan umap-learn
```
## Step 3. Activate the conda environment
-- Activate the conda environment
```bash
conda activate rapids-test
```
## Step 4. Cloning the playbook repository
-- Clone the github repository and go the assets folder place in cuda-x-data-science folder
+- Clone the GitHub repository and go to the assets folder in the **cuda-x-data-science** directory
```bash
git clone https://github.com/NVIDIA/dgx-spark-playbooks
```
@@ -63,12 +62,12 @@ You will accelerate popular machine learning algorithms and data analytics opera
## Step 5. Run the notebooks
There are two notebooks in the GitHub repository. One runs an example of a large strings data processing workflow with pandas code on GPU.
-- Run the cudf_pandas_demo.ipynb notebook and use `localhost:8888` in your browser to access the notebook
+- Run the **cudf_pandas_demo.ipynb** notebook and use `localhost:8888` in your browser to access the notebook
```bash
jupyter notebook cudf_pandas_demo.ipynb
```
The other goes over an example of machine learning algorithms including UMAP and HDBSCAN.
-- Run the cuml_sklearn_demo.ipynb notebook and use `localhost:8888` in your browser to access the notebook +- Run the **cuml_sklearn_demo.ipynb** notebook and use `localhost:8888` in your browser to access the notebook ```bash jupyter notebook cuml_sklearn_demo.ipynb ``` From 3c3578c62038b2eada5c4cf9be2aa7118ac7e1f4 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Tue, 21 Oct 2025 03:40:46 +0000 Subject: [PATCH 04/10] chore: Regenerate all playbooks --- nvidia/vibe-coding/README.md | 91 +++++++++++++++++------------------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index f54f5e3..cb5eb5a 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -7,7 +7,7 @@ - [Overview](#overview) - [What You'll Accomplish](#what-youll-accomplish) - [Prerequisites](#prerequisites) - - [Requirements](#requirements) + - [Time & risk](#time-risk) - [Instructions](#instructions) - [Troubleshooting](#troubleshooting) @@ -15,10 +15,10 @@ ## Overview -## DGX Spark Vibe Coding +## Basic idea This playbook walks you through setting up DGX Spark as a **Vibe Coding assistant** — locally or as a remote coding companion for VSCode with Continue.dev. -This guide uses **Ollama** with **GPT-OSS 120B** to provide easy deployment of a coding assistant to VSCode. Included is advanced instructions to allow DGX Spark and Ollama to provide the coding assistant to be available over your local network. This guide is also written on a **fresh installation* of the OS. If your OS is not freshly installed and you have issues, see the troubleshooting section at the bottom of the document. +This guide uses **Ollama** with **GPT-OSS 120B** to provide easy deployment of a coding assistant to VSCode. Included is advanced instructions to allow DGX Spark and Ollama to provide the coding assistant to be available over your local network. This guide is also written on a **fresh installation** of the OS. 
If your OS is not freshly installed and you have issues, see the troubleshooting section at the bottom of the document. ### What You'll Accomplish @@ -30,17 +30,19 @@ You'll have a fully configured DGX Spark system capable of: ### Prerequisites - DGX Spark (128GB unified memory recommended) -- Internet access for model downloads -- Basic familiarity with the terminal -- Optional: firewall control for remote access configuration - -### Requirements - - **Ollama** and an LLM of your choice (e.g., `gpt-oss:120b`) - **VSCode** - **Continue** VSCode extension +- Internet access for model downloads - Basic familiarity with opening the Linux terminal, copying and pasting commands. - Having sudo access. +- Optional: firewall control for remote access configuration + +### Time & risk +* **Duration:** About 30 minutes +* **Risks:** + * Data download slowness or failure due to network issues +* **Rollback:** No permanent system changes made during normal usage. ## Instructions @@ -91,8 +93,8 @@ Verify that the workstation can connect to your DGX Spark's Ollama server: ```bash curl -v http://YOUR_SPARK_IP:11434/api/version ``` - Replace YOUR_SPARK_IP with your DGX Spark's IP address. - If the connection fails please see the troubleshooting section at the bottom of this document. + Replace **YOUR_SPARK_IP** with your DGX Spark's IP address. + If the connection fails please see the Troubleshooting tab. ## Step 3. Install VSCode @@ -107,15 +109,16 @@ If using a remote workstation, **install VSCode appropriate for your system arch ## Step 4. Install Continue.dev Extension -Open VSCode and install **Continue.dev** from the Marketplace. +Open VSCode and install **Continue.dev** from the Marketplace: +- Go to **Extensions view** in VSCode +- Search for **Continue** published by [Continue.dev](https://www.continue.dev/) and install the extension. After installation, click the Continue icon on the right-hand bar. - ## Step 5. 
Local Inference Setup -- Click Select **Or, configure your own models** -- Click **Click here to view more providers** -- Choose **Ollama** as the provider. -- For **Model**, select **Autodetect**. +- Click `Or, configure your own models` +- Click `Click here to view more providers` +- Choose `Ollama` as the Provider +- For Model, select `Autodetect` - Test inference by sending a test prompt. Your downloaded model will now be the default (e.g., `gpt-oss:120b`) for inference. @@ -123,18 +126,18 @@ Your downloaded model will now be the default (e.g., `gpt-oss:120b`) for inferen ## Step 6. Setting up a Workstation to Connect to the DGX Spark' Ollama Server To connect a workstation running VSCode to a remote DGX Spark instance the following must be completed on that workstation: - - Install Continue from the marketplace. - - Click on the Continue icon on the left pane. - - Click ***Or, configure your own models*** - - Click **Click here to view more providers. - - Select ***Ollama*** from the provider list. - - Select ***Autodetect*** as the model. + - Install Continue as instructed in Step 4 + - Click on the `Continue` icon on the left pane + - Click `Or, configure your own models` + - Click `Click here to view more providers` + - Select `Ollama` as the Provider + - Select `Autodetect` as the Model. -Continue **wil** fail to detect the model as it is attempting to connect to a locally hosted Ollama server. - - Find the **gear** icon in the upper right corner of the chat window and click on it. +Continue **will** fail to detect the model as it is attempting to connect to a locally hosted Ollama server. + - Find the `**gear**` icon in the upper right corner of the Continue window and click on it. - On the left pane, click **Models** - Next to the first dropdown menu under **Chat** click the gear icon. - - Continue's config.yaml will open. Take note of your DGX Spark's IP address. + - Continue's `**config.yaml**` will open. Take note of your DGX Spark's IP address. 
- Replace the configuration with the following. **YOUR_SPARK_IP** should be replaced with your DGX Spark's IP. @@ -164,27 +167,17 @@ Add additional model entries for any other Ollama models you wish to host remote ## Troubleshooting -## Common Issues +| Symptom | Cause | Fix | +|---------|-------|-----| +|Ollama not starting|GPU drivers may not be installed correctly|Run `nvidia-smi` in the terminal. If the command fails check DGX Dashboard for updates to your DGX Spark.| +|Continue can't connect over the network|Port 11434 may not be open or accessible|Run command `ss -tuln | grep 11434`. If the output does not reflect " tcp LISTEN 0 4096 *:11434 *:* ", go back to step 2 and run the ufw command.| +|Continue can't detect a locally running Ollama model|Configuration not properly set or detected|Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf` file. If `OLLAMA_HOST` and `OLLAMA_ORIGINS` are set correctly, add these lines to your `~/.bashrc` file.| +|High memory usage|Model size too big|Confirm no other large models or containers are running with `nvidia-smi`. Use smaller models such as `gpt-oss:20b` for lightweight usage.| -**1. Ollama not starting** -- Verify GPU drivers are installed correctly. - Run `nvidia-smi` in the terminal. If the command fails check DGX Dashboard for updates to your DGX Spark. - If there are no updates or updates do not correct the issue, create a thread on the DGX Spark/GB10 User forum here : - https://forums.developer.nvidia.com/c/accelerated-computing/dgx-spark-gb10/dgx-spark-gb10/ -- Run `ollama serve` on the DGX Spark to view Ollama logs. - -**2. Continue can't connect over the network** -- Ensure port 11434 is open and accessible from your workstation. - ```bash - ss -tuln | grep 11434 - ``` - If the output does not reflect " tcp LISTEN 0 4096 *:11434 *:* " - go back to step 2 and run the ufw command. - -**3. 
Continue can't detect a locally running Ollama model -- Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`. -- If `OLLAMA_HOST` and `OLLAMA_ORIGINS` are set correctly you should add these lines to your .bashrc. - -**4. High memory usage** -- Use smaller models such as `gpt-oss:20b` for lightweight usage. -- Confirm no other large models or containers are running with `nvidia-smi`. +> [!NOTE] +> DGX Spark uses a Unified Memory Architecture (UMA), which enables dynamic memory sharing between the GPU and CPU. +> With many applications still updating to take advantage of UMA, you may encounter memory issues even when within +> the memory capacity of DGX Spark. If that happens, manually flush the buffer cache with: +```bash +sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches' +``` From 8ca84d63e96230b85afbbd2e5f9cd69eb9b5e7ee Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Tue, 21 Oct 2025 03:50:02 +0000 Subject: [PATCH 05/10] chore: Regenerate all playbooks --- nvidia/vibe-coding/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index cb5eb5a..474f04d 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -134,10 +134,10 @@ To connect a workstation running VSCode to a remote DGX Spark instance the follo - Select `Autodetect` as the Model. Continue **will** fail to detect the model as it is attempting to connect to a locally hosted Ollama server. - - Find the `**gear**` icon in the upper right corner of the Continue window and click on it. + - Find the `gear` icon in the upper right corner of the Continue window and click on it. - On the left pane, click **Models** - Next to the first dropdown menu under **Chat** click the gear icon. - - Continue's `**config.yaml**` will open. Take note of your DGX Spark's IP address. + - Continue's `config.yaml` will open. Take note of your DGX Spark's IP address. 
- Replace the configuration with the following. **YOUR_SPARK_IP** should be replaced with your DGX Spark's IP. @@ -170,7 +170,7 @@ Add additional model entries for any other Ollama models you wish to host remote | Symptom | Cause | Fix | |---------|-------|-----| |Ollama not starting|GPU drivers may not be installed correctly|Run `nvidia-smi` in the terminal. If the command fails check DGX Dashboard for updates to your DGX Spark.| -|Continue can't connect over the network|Port 11434 may not be open or accessible|Run command `ss -tuln | grep 11434`. If the output does not reflect " tcp LISTEN 0 4096 *:11434 *:* ", go back to step 2 and run the ufw command.| +|Continue can't connect over the network|Port 11434 may not be open or accessible|Run command `ss -tuln | grep 11434`. If the output does not reflect " tcp LISTEN 0 4096 *:11434 *:* ", go back to step 2 and run the ufw command.| |Continue can't detect a locally running Ollama model|Configuration not properly set or detected|Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf` file. If `OLLAMA_HOST` and `OLLAMA_ORIGINS` are set correctly, add these lines to your `~/.bashrc` file.| |High memory usage|Model size too big|Confirm no other large models or containers are running with `nvidia-smi`. 
Use smaller models such as `gpt-oss:20b` for lightweight usage.| From c66572a74b0c9b5adb7f8c42daac6272bf8b61ac Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Tue, 21 Oct 2025 03:53:26 +0000 Subject: [PATCH 06/10] chore: Regenerate all playbooks --- nvidia/vibe-coding/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index 474f04d..8fe837f 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -170,7 +170,7 @@ Add additional model entries for any other Ollama models you wish to host remote | Symptom | Cause | Fix | |---------|-------|-----| |Ollama not starting|GPU drivers may not be installed correctly|Run `nvidia-smi` in the terminal. If the command fails check DGX Dashboard for updates to your DGX Spark.| -|Continue can't connect over the network|Port 11434 may not be open or accessible|Run command `ss -tuln | grep 11434`. If the output does not reflect " tcp LISTEN 0 4096 *:11434 *:* ", go back to step 2 and run the ufw command.| +|Continue can't connect over the network|Port 11434 may not be open or accessible|Run command `ss -tuln \| grep 11434`. If the output does not reflect ` tcp LISTEN 0 4096 *:11434 *:* `, go back to step 2 and run the ufw command.| |Continue can't detect a locally running Ollama model|Configuration not properly set or detected|Check `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf` file. If `OLLAMA_HOST` and `OLLAMA_ORIGINS` are set correctly, add these lines to your `~/.bashrc` file.| |High memory usage|Model size too big|Confirm no other large models or containers are running with `nvidia-smi`. 
Use smaller models such as `gpt-oss:20b` for lightweight usage.| From 15beb4e9fcaaab8f7f297ea6b96dd1ecc8755222 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Tue, 21 Oct 2025 13:09:58 +0000 Subject: [PATCH 07/10] chore: Regenerate all playbooks --- nvidia/vibe-coding/README.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/nvidia/vibe-coding/README.md b/nvidia/vibe-coding/README.md index 8fe837f..ddd9b4b 100644 --- a/nvidia/vibe-coding/README.md +++ b/nvidia/vibe-coding/README.md @@ -18,7 +18,7 @@ ## Basic idea This playbook walks you through setting up DGX Spark as a **Vibe Coding assistant** — locally or as a remote coding companion for VSCode with Continue.dev. -This guide uses **Ollama** with **GPT-OSS 120B** to provide easy deployment of a coding assistant to VSCode. Included is advanced instructions to allow DGX Spark and Ollama to provide the coding assistant to be available over your local network. This guide is also written on a **fresh installation** of the OS. If your OS is not freshly installed and you have issues, see the troubleshooting section at the bottom of the document. +This guide uses **Ollama** with **GPT-OSS 120B** to provide easy deployment of a coding assistant to VSCode. Included is advanced instructions to allow DGX Spark and Ollama to provide the coding assistant to be available over your local network. This guide is also written on a **fresh installation** of the OS. If your OS is not freshly installed and you have issues, see the troubleshooting tab. ### What You'll Accomplish @@ -40,8 +40,7 @@ You'll have a fully configured DGX Spark system capable of: ### Time & risk * **Duration:** About 30 minutes -* **Risks:** - * Data download slowness or failure due to network issues +* **Risks:** Data download slowness or failure due to network issues * **Rollback:** No permanent system changes made during normal usage. 
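The troubleshooting rows above repeatedly point at `OLLAMA_HOST` and `OLLAMA_ORIGINS` in `/etc/systemd/system/ollama.service.d/override.conf`. As a sketch of what that drop-in usually contains — the values below are the commonly documented settings for exposing Ollama on the network, not copied from this playbook:

```ini
# /etc/systemd/system/ollama.service.d/override.conf
[Service]
# Bind to all interfaces so remote Continue clients can reach port 11434
Environment="OLLAMA_HOST=0.0.0.0"
# Accept cross-origin requests from the Continue extension
Environment="OLLAMA_ORIGINS=*"
```

After editing the file, apply it with `sudo systemctl daemon-reload && sudo systemctl restart ollama`.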
## Instructions From d301ca4f845e4c9ca17f07fa181bed6920011d16 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Wed, 22 Oct 2025 16:17:25 +0000 Subject: [PATCH 08/10] chore: Regenerate all playbooks --- nvidia/nemo-fine-tune/README.md | 161 ++++++++++++++++++++++---------- 1 file changed, 110 insertions(+), 51 deletions(-) diff --git a/nvidia/nemo-fine-tune/README.md b/nvidia/nemo-fine-tune/README.md index d1abc48..28fd1d4 100644 --- a/nvidia/nemo-fine-tune/README.md +++ b/nvidia/nemo-fine-tune/README.md @@ -52,7 +52,7 @@ All necessary files for the playbook can be found [here on GitHub](https://githu ## Step 1. Verify system requirements -Check your NVIDIA Spark device meets the prerequisites for NeMo AutoModel installation. This step runs on the host system to confirm CUDA toolkit availability and Python version compatibility. +Check your NVIDIA Spark device meets the prerequisites for [NeMo AutoModel](https://github.com/NVIDIA-NeMo/Automodel) installation. This step runs on the host system to confirm CUDA toolkit availability and Python version compatibility. ```bash ## Verify CUDA installation @@ -169,6 +169,19 @@ uv run --frozen --no-sync python -c "import nemo_automodel; print('✅ NeMo Auto ## Check available examples ls -la examples/ + +## Example output: +$ ls -la examples/ +total 36 +drwxr-xr-x 9 akoumparouli domain-users 4096 Oct 16 14:52 . +drwxr-xr-x 16 akoumparouli domain-users 4096 Oct 16 14:52 .. +drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 benchmark +drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 diffusion +drwxr-xr-x 20 akoumparouli domain-users 4096 Oct 16 14:52 llm_finetune +drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 14 09:27 llm_kd +drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 16 14:52 llm_pretrain +drwxr-xr-x 6 akoumparouli domain-users 4096 Oct 14 09:27 vlm_finetune +drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 14 09:27 vlm_generate ``` ## Step 8. 
Explore available examples @@ -193,36 +206,37 @@ First, export your HF_TOKEN so that gated models can be downloaded. export HF_TOKEN= ``` > [!NOTE] -> Please Replace `` with your Hugging Face access token to access gated models (e.g., Llama). - -**Full Fine-tuning example:** - -Once inside the `Automodel` directory you cloned from github, run: - -```bash -uv run --frozen --no-sync \ -examples/llm_finetune/finetune.py \ --c examples/llm_finetune/llama3_2/llama3_2_1b_squad.yaml \ ---step_scheduler.local_batch_size 1 \ ---loss_fn._target_ nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy \ ---model.pretrained_model_name_or_path Qwen/Qwen3-8B -``` -These overrides ensure the Qwen3-8B SFT run behaves as expected: -- `--model.pretrained_model_name_or_path`: selects the Qwen/Qwen3-8B model to fine-tune (weights fetched via your Hugging Face token). -- `--loss_fn._target_`: uses the TransformerEngine-parallel cross-entropy loss variant compatible with tensor-parallel training for large LLMs. -- `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe. +> Replace `` with your personal Hugging Face access token. A valid token is required to download any gated model. +> +> - Generate a token: [Hugging Face tokens](https://huggingface.co/settings/tokens), guide available [here](https://huggingface.co/docs/hub/en/security-tokens). +> - Request and receive access on each model's page (and accept license/terms) before attempting downloads. 
+> - Llama-3.1-8B: [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
+> - Qwen3-8B: [Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)
+> - Mixtral-8x7B: [mistralai/Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B)
+>
+> The same steps apply for any other gated model you use: visit its model card on Hugging Face, request access, accept the license, and wait for approval.

**LoRA fine-tuning example:**

Execute a basic fine-tuning example to validate the complete setup. This demonstrates parameter-efficient fine-tuning using a small model suitable for testing.
+For the examples below, configuration lives in YAML files, and parameter overrides are passed as command-line arguments.

```bash
## Run basic LLM fine-tuning example
uv run --frozen --no-sync \
examples/llm_finetune/finetune.py \
-c examples/llm_finetune/llama3_2/llama3_2_1b_squad_peft.yaml \
---model.pretrained_model_name_or_path meta-llama/Llama-3.1-8B
+--model.pretrained_model_name_or_path meta-llama/Llama-3.1-8B \
+--packed_sequence.packed_sequence_size 1024 \
+--step_scheduler.max_steps 100
```
+
+These overrides ensure the Llama-3.1-8B LoRA run behaves as expected:
+- `--model.pretrained_model_name_or_path`: selects the Llama-3.1-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token).
+- `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training.
+- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstration purposes; adjust it based on your needs.
+
+
**QLoRA fine-tuning example:**

We can use QLoRA to fine-tune large models in a memory-efficient manner.
@@ -233,50 +247,61 @@ examples/llm_finetune/finetune.py \
-c examples/llm_finetune/llama3_1/llama3_1_8b_squad_qlora.yaml \
--model.pretrained_model_name_or_path meta-llama/Meta-Llama-3-70B \
--loss_fn._target_ nemo_automodel.components.loss.te_parallel_ce.TEParallelCrossEntropy \
---step_scheduler.local_batch_size 1
+--step_scheduler.local_batch_size 1 \
+--packed_sequence.packed_sequence_size 1024 \
+--step_scheduler.max_steps 100
```
These overrides ensure the 70B QLoRA run behaves as expected:
- `--model.pretrained_model_name_or_path`: selects the 70B base model to fine-tune (weights fetched via your Hugging Face token).
- `--loss_fn._target_`: uses the TransformerEngine-parallel cross-entropy loss variant compatible with tensor-parallel training for large LLMs.
- `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit 70B in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe.
+- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstration purposes; adjust it based on your needs.
+- `--packed_sequence.packed_sequence_size`: sets the packed sequence size to 1024 to enable packed sequence training.

-## Step 10. Validate training output
+**Full Fine-tuning example:**

-Check that fine-tuning completed successfully and inspect the generated model artifacts. This confirms the training pipeline works correctly on your Spark device.
+Once inside the `Automodel` directory you cloned from GitHub, run:

```bash
-## Check training logs
-ls -la logs/
-
-## Verify model checkpoint creation
-ls -la checkpoints/
-
-## Test model inference (if applicable)
-uv run python -c "
-import torch
-print('GPU available:', torch.cuda.is_available())
-print('GPU count:', torch.cuda.device_count())
-"
+uv run --frozen --no-sync \
+examples/llm_finetune/finetune.py \
+-c examples/llm_finetune/qwen/qwen3_8b_squad_spark.yaml \
+--model.pretrained_model_name_or_path Qwen/Qwen3-8B \
+--step_scheduler.local_batch_size 1 \
+--step_scheduler.max_steps 100 \
+--packed_sequence.packed_sequence_size 1024
```
+These overrides ensure the Qwen3-8B SFT run behaves as expected:
+- `--model.pretrained_model_name_or_path`: selects the Qwen/Qwen3-8B model to fine-tune from the Hugging Face model hub (weights fetched via your Hugging Face token). Adjust this if you want to fine-tune a different model.
+- `--step_scheduler.max_steps`: sets the maximum number of training steps. We set it to 100 for demonstration purposes; adjust it based on your needs.
+- `--step_scheduler.local_batch_size`: sets the per-GPU micro-batch size to 1 to fit in memory; overall effective batch size is still driven by gradient accumulation and data/tensor parallel settings from the recipe.

-## Step 11. Validate complete setup
-Perform final validation to ensure all components are working correctly. This comprehensive check confirms the environment is ready for production fine-tuning workflows.
+## Step 10. Validate successful training completion
+
+Validate the fine-tuned model by inspecting artifacts contained in the checkpoint directory.
```bash -## Test complete pipeline -uv run python -c " -import nemo_automodel -import torch -print('✅ NeMo AutoModel version:', nemo_automodel.__version__) -print('✅ CUDA available:', torch.cuda.is_available()) -print('✅ GPU count:', torch.cuda.device_count()) -print('✅ Setup complete') -" +## Inspect logs and checkpoint output. +## The LATEST is a symlink pointing to the latest checkpoint. +## The checkpoint is the one that was saved during training. +## below is an example of the expected output (username and domain-users are placeholders). +ls -lah checkpoints/LATEST/ + +## $ ls -lah checkpoints/LATEST/ +## total 32K +## drwxr-xr-x 6 akoumparouli domain-users 4.0K Oct 16 22:33 . +## drwxr-xr-x 4 akoumparouli domain-users 4.0K Oct 16 22:33 .. +## -rw-r--r-- 1 akoumparouli domain-users 1.6K Oct 16 22:33 config.yaml +## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 dataloader +## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 model +## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 optim +## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 rng +## -rw-r--r-- 1 akoumparouli domain-users 1.3K Oct 16 22:33 step_scheduler.pt ``` -## Step 13. Cleanup and rollback +## Step 11. Cleanup and rollback (Optional) Remove the installation and restore the original environment if needed. These commands safely remove all installed components. @@ -297,8 +322,42 @@ pip3 uninstall uv ## Clear Python cache rm -rf ~/.cache/pip ``` +## Step 12. Optional: Publish your fine-tuned model checkpoint on Hugging Face Hub -## Step 14. Next steps +Publish your fine-tuned model checkpoint on Hugging Face Hub. +> [!NOTE] +> This is an optional step and is not required for using the fine-tuned model. +> It is useful if you want to share your fine-tuned model with others or use it in other projects. +> You can also use the fine-tuned model in other projects by cloning the repository and using the checkpoint. 
+> To publish the checkpoint, you need the Hugging Face CLI installed.
+> You can install it by running `pip install -U "huggingface_hub[cli]"`.
+> For more information, refer to the [Hugging Face CLI documentation](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
+
+> [!TIP]
+> You can use the `hf` command to upload the fine-tuned model checkpoint to Hugging Face Hub.
+
+```bash
## Publish the fine-tuned model checkpoint to Hugging Face Hub
+## will be published under the namespace /my-cool-model, adjust name as needed.
+hf upload my-cool-model checkpoints/LATEST/model
+```
+
+> [!TIP]
+> The above command can fail if the HF_TOKEN you used does not grant write permissions on the Hugging Face Hub.
+> Sample error message:
+> ```bash
+> akoumparouli@1604ab7-lcedt:/mnt/4tb/auto/Automodel8$ hf upload my-cool-model checkpoints/LATEST/model
+> Traceback (most recent call last):
+> File "/home/akoumparouli/.local/lib/python3.10/site-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
+> response.raise_for_status()
+> File "/home/akoumparouli/.local/lib/python3.10/site-packages/requests/models.py", line 1024, in raise_for_status
+> raise HTTPError(http_error_msg, response=self)
+> requests.exceptions.HTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/api/repos/create
+> ```
+> To fix this, create an access token with *write* permissions; see the Hugging Face guide [here](https://huggingface.co/docs/hub/en/security-tokens) for instructions.

+## Step 13. Next steps

Begin using NeMo AutoModel for your specific fine-tuning tasks. Start with provided recipes and customize based on your model requirements and dataset.
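The `finetune.py` invocations above all override nested YAML keys with dotted command-line flags (e.g. `--step_scheduler.local_batch_size 1`). A minimal sketch of that idea — how a dotted path maps onto a nested config dict — purely illustrative, not NeMo AutoModel's actual parser:

```python
def apply_override(config: dict, dotted_key: str, value):
    """Set a nested dict entry from a dotted path like 'step_scheduler.local_batch_size'."""
    keys = dotted_key.split(".")
    node = config
    for k in keys[:-1]:
        # Descend, creating intermediate sections as needed
        node = node.setdefault(k, {})
    node[keys[-1]] = value
    return config

# Values as loaded from a recipe YAML (hypothetical defaults)
cfg = {"step_scheduler": {"local_batch_size": 8, "max_steps": 1000}}
apply_override(cfg, "step_scheduler.local_batch_size", 1)
apply_override(cfg, "step_scheduler.max_steps", 100)
print(cfg["step_scheduler"])  # {'local_batch_size': 1, 'max_steps': 100}
```

The dotted flags thus act as targeted patches on top of the recipe file, which is why the playbook can reuse one YAML across models and hardware sizes.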
@@ -310,7 +369,7 @@ cp recipes/llm_finetune/finetune.py my_custom_training.py ## Then run: uv run my_custom_training.py ``` -Explore the [NeMo AutoModel GitHub repository](https://github.com/NVIDIA-NeMo/Automodel) for advanced recipes, documentation, and community examples. Consider setting up custom datasets, experimenting with different model architectures, and scaling to multi-node distributed training for larger models. +Explore the [NeMo AutoModel GitHub repository](https://github.com/NVIDIA-NeMo/Automodel) for more recipes, documentation, and community examples. Consider setting up custom datasets, experimenting with different model architectures, and scaling to multi-node distributed training for larger models. ## Troubleshooting @@ -324,8 +383,8 @@ Explore the [NeMo AutoModel GitHub repository](https://github.com/NVIDIA-NeMo/Au | Cannot access gated repo for URL | Certain HuggingFace models have restricted access | Regenerate your [HuggingFace token](https://huggingface.co/docs/hub/en/security-tokens); and request access to the [gated model](https://huggingface.co/docs/hub/en/models-gated#customize-requested-information) on your web browser | > [!NOTE] -> DGX Spark uses a Unified Memory Architecture (UMA), which enables dynamic memory sharing between the GPU and CPU. -> With many applications still updating to take advantage of UMA, you may encounter memory issues even when within +> DGX Spark uses a Unified Memory Architecture (UMA), which enables dynamic memory sharing between the GPU and CPU. +> With many applications still updating to take advantage of UMA, you may encounter memory issues even when within > the memory capacity of DGX Spark. 
If that happens, manually flush the buffer cache with: ```bash sudo sh -c 'sync; echo 3 > /proc/sys/vm/drop_caches' From ab0cb00e0b1813239a313d98eb3c28cec8e7ff09 Mon Sep 17 00:00:00 2001 From: GitLab CI Date: Wed, 22 Oct 2025 18:54:29 +0000 Subject: [PATCH 09/10] chore: Regenerate all playbooks --- nvidia/nemo-fine-tune/README.md | 40 ++++++++++++++++----------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/nvidia/nemo-fine-tune/README.md b/nvidia/nemo-fine-tune/README.md index 28fd1d4..cc29e59 100644 --- a/nvidia/nemo-fine-tune/README.md +++ b/nvidia/nemo-fine-tune/README.md @@ -170,18 +170,18 @@ uv run --frozen --no-sync python -c "import nemo_automodel; print('✅ NeMo Auto ## Check available examples ls -la examples/ -## Example output: -$ ls -la examples/ -total 36 -drwxr-xr-x 9 akoumparouli domain-users 4096 Oct 16 14:52 . -drwxr-xr-x 16 akoumparouli domain-users 4096 Oct 16 14:52 .. -drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 benchmark -drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 16 14:52 diffusion -drwxr-xr-x 20 akoumparouli domain-users 4096 Oct 16 14:52 llm_finetune -drwxr-xr-x 3 akoumparouli domain-users 4096 Oct 14 09:27 llm_kd -drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 16 14:52 llm_pretrain -drwxr-xr-x 6 akoumparouli domain-users 4096 Oct 14 09:27 vlm_finetune -drwxr-xr-x 2 akoumparouli domain-users 4096 Oct 14 09:27 vlm_generate +## Below is an example of the expected output (username and domain-users are placeholders). +## $ ls -la examples/ +## total 36 +## drwxr-xr-x 9 username domain-users 4096 Oct 16 14:52 . +## drwxr-xr-x 16 username domain-users 4096 Oct 16 14:52 .. 
+## drwxr-xr-x 3 username domain-users 4096 Oct 16 14:52 benchmark +## drwxr-xr-x 3 username domain-users 4096 Oct 16 14:52 diffusion +## drwxr-xr-x 20 username domain-users 4096 Oct 16 14:52 llm_finetune +## drwxr-xr-x 3 username domain-users 4096 Oct 14 09:27 llm_kd +## drwxr-xr-x 2 username domain-users 4096 Oct 16 14:52 llm_pretrain +## drwxr-xr-x 6 username domain-users 4096 Oct 14 09:27 vlm_finetune +## drwxr-xr-x 2 username domain-users 4096 Oct 14 09:27 vlm_generate ``` ## Step 8. Explore available examples @@ -291,14 +291,14 @@ ls -lah checkpoints/LATEST/ ## $ ls -lah checkpoints/LATEST/ ## total 32K -## drwxr-xr-x 6 akoumparouli domain-users 4.0K Oct 16 22:33 . -## drwxr-xr-x 4 akoumparouli domain-users 4.0K Oct 16 22:33 .. -## -rw-r--r-- 1 akoumparouli domain-users 1.6K Oct 16 22:33 config.yaml -## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 dataloader -## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 model -## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 optim -## drwxr-xr-x 2 akoumparouli domain-users 4.0K Oct 16 22:33 rng -## -rw-r--r-- 1 akoumparouli domain-users 1.3K Oct 16 22:33 step_scheduler.pt +## drwxr-xr-x 6 username domain-users 4.0K Oct 16 22:33 . +## drwxr-xr-x 4 username domain-users 4.0K Oct 16 22:33 .. +## -rw-r--r-- 1 username domain-users 1.6K Oct 16 22:33 config.yaml +## drwxr-xr-x 2 username domain-users 4.0K Oct 16 22:33 dataloader +## drwxr-xr-x 2 username domain-users 4.0K Oct 16 22:33 model +## drwxr-xr-x 2 username domain-users 4.0K Oct 16 22:33 optim +## drwxr-xr-x 2 username domain-users 4.0K Oct 16 22:33 rng +## -rw-r--r-- 1 username domain-users 1.3K Oct 16 22:33 step_scheduler.pt ``` ## Step 11. 
Cleanup and rollback (Optional)

From 6a34e251698454b0d70983cedad97ecf416107ea Mon Sep 17 00:00:00 2001
From: GitLab CI
Date: Wed, 22 Oct 2025 19:44:23 +0000
Subject: [PATCH 10/10] chore: Regenerate all playbooks

---
 nvidia/multi-agent-chatbot/README.md | 2 +-
 nvidia/txt2kg/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/nvidia/multi-agent-chatbot/README.md b/nvidia/multi-agent-chatbot/README.md
index 1fe2b94..be2597f 100644
--- a/nvidia/multi-agent-chatbot/README.md
+++ b/nvidia/multi-agent-chatbot/README.md
@@ -73,7 +73,7 @@ newgrp docker
 ```bash
 git clone https://github.com/NVIDIA/dgx-spark-playbooks
-cd multi-agent-chatbot/assets
+cd dgx-spark-playbooks/nvidia/multi-agent-chatbot/assets
 ```

## Step 3. Run the model download script

diff --git a/nvidia/txt2kg/README.md b/nvidia/txt2kg/README.md
index ede81ea..216a57d 100644
--- a/nvidia/txt2kg/README.md
+++ b/nvidia/txt2kg/README.md
@@ -62,7 +62,7 @@ In a terminal, clone the txt2kg repository and navigate to the project directory
 ```bash
 git clone https://github.com/NVIDIA/dgx-spark-playbooks
-cd nvidia/txt2kg/assets
+cd dgx-spark-playbooks/nvidia/txt2kg/assets
```

## Step 2. Start the txt2kg services