diff --git a/nvidia/pytorch-fine-tune/README.md b/nvidia/pytorch-fine-tune/README.md
index 60140ac..3a9e5f9 100644
--- a/nvidia/pytorch-fine-tune/README.md
+++ b/nvidia/pytorch-fine-tune/README.md
@@ -134,8 +134,8 @@ python Llama3_3B_full_finetuning.py
 ## LoRA fine-tuning on Llama 3.1 8B
 python Llama3_8B_LoRA_finetuning.py
 
-## LoRA fine-tuning on Llama 3.1 70B
-python Llama3_70B_LoRA_finetuning.py
+## QLoRA fine-tuning on Llama 3.1 70B
+python Llama3_70B_qLoRA_finetuning.py
 ```
 
 #### Common Command-Line Arguments
@@ -158,7 +158,7 @@ All scripts support the following command-line arguments for customization:
 - `--lora_rank`: LoRA rank - higher values = more trainable parameters (default: `8`)
 
 ##### Dataset Configuration
-- `--dataset_size`: Number of samples to use from the Alpaca dataset (default: `500`)
+- `--dataset_size`: Number of samples to use from the Alpaca dataset (default: `512`)
 
 ##### Logging Configuration
 - `--logging_steps`: Log metrics every N steps (default: `1`)
@@ -167,13 +167,6 @@ All scripts support the following command-line arguments for customization:
 ##### Model Saving
 - `--output_dir`: Directory to save the fine-tuned model (default: `None` - model not saved)
 
-##### Performance Optimization
-- `--use_torch_compile`: Enable `torch.compile()` for faster training (flag)
-
-> [!WARNING]
-> **Important:** The `--use_torch_compile` flag is **not compatible with QLoRA** (`Llama3_70B_qLoRA_finetuning.py`). 
-> Only use this flag with full fine-tuning and standard LoRA scripts.
-
 #### Usage Examples
 ```bash
 python Llama3_8B_LoRA_finetuning.py \
diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py
index b40a5ca..af6494e 100644
--- a/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py
+++ b/nvidia/pytorch-fine-tune/assets/Llama3_3B_full_finetuning.py
@@ -31,7 +31,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire
 
 ### Response: {}"""
 
-def get_alpaca_dataset(eos_token, dataset_size=500):
+def get_alpaca_dataset(eos_token, dataset_size=512):
     # Preprocess the dataset
     def preprocess(x):
         texts = [
@@ -69,7 +69,7 @@ def main(args):
     # Configure the SFT config
     config = {
         "per_device_train_batch_size": args.batch_size,
-        "num_train_epochs": 0.01,  # Warmup epoch
+        "num_train_epochs": 0.05,  # Warmup epoch
         "gradient_accumulation_steps": args.gradient_accumulation_steps,
         "learning_rate": args.learning_rate,
         "optim": "adamw_torch",
@@ -79,26 +79,24 @@ def main(args):
         "dataset_text_field": "text",
         "packing": False,
         "max_length": args.seq_length,
-        "torch_compile": False,
         "report_to": "none",
         "logging_dir": args.log_dir,
         "logging_steps": args.logging_steps,
         "gradient_checkpointing": args.gradient_checkpointing,  # Save memory
     }
 
-    # Compile model if requested
-    if args.use_torch_compile:
-        print("Compiling model with torch.compile()...")
-        model = torch.compile(model)
+    # Compile model for faster training
+    print("Compiling model with torch.compile()...")
+    model = torch.compile(model)
 
-        # Warmup for torch compile
-        print("Running warmup for torch.compile()...")
-        SFTTrainer(
-            model=model,
-            processing_class=tokenizer,
-            train_dataset=dataset,
-            args=SFTConfig(**config),
-        ).train()
+    # Warmup for torch compile
+    print("Running warmup for torch.compile()...")
+    SFTTrainer(
+        model=model,
+        processing_class=tokenizer,
+        train_dataset=dataset,
+        args=SFTConfig(**config),
+    ).train()
 
     # Train the model
     print(f"\nStarting full fine-tuning for {args.num_epochs} epoch(s)...")
@@ -124,13 +122,6 @@ def main(args):
     print(f"Train loss: {trainer_stats.metrics['train_loss']:.4f}")
     print(f"{'='*60}\n")
 
-    # Save model if requested
-    if args.output_dir:
-        print(f"Saving model to {args.output_dir}...")
-        trainer.save_model(args.output_dir)
-        tokenizer.save_pretrained(args.output_dir)
-        print("Model saved successfully!")
-
 
 def parse_arguments():
     parser = argparse.ArgumentParser(description="Llama 3.2 3B Full Fine-tuning (SFT)")
@@ -157,7 +148,7 @@ def parse_arguments():
                         help="Enable gradient checkpointing to save memory")
 
     # Dataset configuration
-    parser.add_argument("--dataset_size", type=int, default=500,
+    parser.add_argument("--dataset_size", type=int, default=512,
                         help="Number of samples to use from dataset")
 
     # Logging configuration
@@ -166,12 +157,6 @@ def parse_arguments():
     parser.add_argument("--log_dir", type=str, default="logs",
                         help="Directory for logs")
 
-    # Compilation and saving
-    parser.add_argument("--use_torch_compile", action="store_true",
-                        help="Use torch.compile() for faster training")
-    parser.add_argument("--output_dir", type=str, default=None,
-                        help="Directory to save the fine-tuned model")
-
     return parser.parse_args()
 
 
@@ -190,7 +175,6 @@ if __name__ == "__main__":
     print(f"Learning rate: {args.learning_rate}")
     print(f"Dataset size: {args.dataset_size}")
     print(f"Gradient checkpointing: {args.gradient_checkpointing}")
-    print(f"Torch compile: {args.use_torch_compile}")
     print(f"{'='*60}\n")
 
     main(args)
diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py
index f6e001b..f507f4e 100644
--- a/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py
+++ b/nvidia/pytorch-fine-tune/assets/Llama3_70B_qLoRA_finetuning.py
@@ -32,7 +32,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire
 
 ### Response: {}"""
 
-def get_alpaca_dataset(eos_token, dataset_size=500):
+def get_alpaca_dataset(eos_token, dataset_size=512):
     # Preprocess the dataset
     def preprocess(x):
         texts = [
@@ -67,15 +67,14 @@ def main(args):
         args.model_name,
         quantization_config=quantization_config,
         dtype=args.dtype,
-        device_map=device_map_config,
-        trust_remote_code=True
+        device_map="cuda",
     )
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name, trust_remote_code=True)
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
     tokenizer.pad_token = tokenizer.eos_token
 
     # Prepare model for QLoRA training
     print(f"Preparing model for QLoRA (4-bit) with rank {args.lora_rank}...")
-    # model = prepare_model_for_kbit_training(model)
+    model = prepare_model_for_kbit_training(model)
     
     peft_config = LoraConfig(
         r=args.lora_rank,
@@ -96,7 +95,7 @@ def main(args):
     # Configure the SFT config
     config = {
         "per_device_train_batch_size": args.batch_size,
-        "num_train_epochs": 0.01,  # Warmup epoch
+        "num_train_epochs": args.num_epochs,
         "gradient_accumulation_steps": args.gradient_accumulation_steps,
         "learning_rate": args.learning_rate,
         "optim": "adamw_torch",
@@ -106,30 +105,14 @@ def main(args):
         "dataset_text_field": "text",
         "packing": False,
         "max_length": args.seq_length,
-        "torch_compile": False,
         "report_to": "none",
         "logging_dir": args.log_dir,
         "logging_steps": args.logging_steps,
         "gradient_checkpointing": args.gradient_checkpointing
     }
 
-    # Compile model if requested
-    if args.use_torch_compile:
-        print("Compiling model with torch.compile()...")
-        model = torch.compile(model)
-        
-        # Warmup for torch compile
-        print("Running warmup for torch.compile()...")
-        SFTTrainer(
-            model=model,
-            processing_class=tokenizer,
-            train_dataset=dataset,
-            args=SFTConfig(**config),
-        ).train()
-
     # Train the model
     print(f"\nStarting QLoRA fine-tuning for {args.num_epochs} epoch(s)...")
-    config["num_train_epochs"] = args.num_epochs
     config["report_to"] = "tensorboard"
     
     trainer = SFTTrainer(
@@ -164,7 +147,7 @@ def parse_arguments():
     parser = argparse.ArgumentParser(description="Llama 3.1 70B Fine-tuning with QLoRA")
     
     # Model configuration
-    parser.add_argument("--model_name", type=str, default="meta-llama/Llama-3.1-70B-Instruct",
+    parser.add_argument("--model_name", type=str, default="unsloth/Meta-Llama-3.1-70B-bnb-4bit",
                         help="Model name or path")
     parser.add_argument("--dtype", type=str, default="bfloat16",
                         help="Model dtype (e.g., float32, float16, bfloat16)")
@@ -190,7 +173,7 @@ def parse_arguments():
                         help="LoRA rank")
     
     # Dataset configuration
-    parser.add_argument("--dataset_size", type=int, default=500,
+    parser.add_argument("--dataset_size", type=int, default=512,
                         help="Number of samples to use from dataset")
     
     # Logging configuration
@@ -199,12 +182,6 @@ def parse_arguments():
     parser.add_argument("--log_dir", type=str, default="logs",
                         help="Directory for logs")
     
-    # Compilation and saving
-    parser.add_argument("--use_torch_compile", action="store_true",
-                        help="Use torch.compile() for faster training")
-    parser.add_argument("--output_dir", type=str, default=None,
-                        help="Directory to save the fine-tuned model")
-    
     return parser.parse_args()
 
 
@@ -224,7 +201,6 @@ if __name__ == "__main__":
     print(f"LoRA rank: {args.lora_rank}")
     print(f"Dataset size: {args.dataset_size}")
     print(f"Gradient checkpointing: {args.gradient_checkpointing}")
-    print(f"Torch compile: {args.use_torch_compile}")
     print(f"{'='*60}\n")
     
     main(args)
diff --git a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py
index 4023b84..6114594 100644
--- a/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py
+++ b/nvidia/pytorch-fine-tune/assets/Llama3_8B_LoRA_finetuning.py
@@ -31,7 +31,7 @@ ALPACA_PROMPT_TEMPLATE = """Below is an instruction that describes a task, paire
 
 ### Response: {}"""
 
-def get_alpaca_dataset(eos_token, dataset_size=500):
+def get_alpaca_dataset(eos_token, dataset_size=512):
     # Preprocess the dataset
     def preprocess(x):
         texts = [
@@ -83,25 +83,23 @@ def main(args):
         "dataset_text_field": "text",
         "packing": False,
         "max_length": args.seq_length,
-        "torch_compile": False,
         "report_to": "none",
         "logging_dir": args.log_dir,
         "logging_steps": args.logging_steps
     }
 
-    # Compile model if requested
-    if args.use_torch_compile:
-        print("Compiling model with torch.compile()...")
-        model = torch.compile(model)
-        
-        # Warmup for torch compile
-        print("Running warmup for torch.compile()...")
-        SFTTrainer(
-            model=model,
-            processing_class=tokenizer,
-            train_dataset=dataset,
-            args=SFTConfig(**config),
-        ).train()
+    # Compile model for faster training
+    print("Compiling model with torch.compile()...")
+    model = torch.compile(model)
+    
+    # Warmup for torch compile
+    print("Running warmup for torch.compile()...")
+    SFTTrainer(
+        model=model,
+        processing_class=tokenizer,
+        train_dataset=dataset,
+        args=SFTConfig(**config),
+    ).train()
 
     # Train the model
     print(f"\nStarting LoRA fine-tuning for {args.num_epochs} epoch(s)...")
@@ -138,7 +136,7 @@ def parse_arguments():
                         help="Model dtype")
     
     # Training configuration
-    parser.add_argument("--batch_size", type=int, default=4,
+    parser.add_argument("--batch_size", type=int, default=8,
                         help="Per device training batch size")
     parser.add_argument("--seq_length", type=int, default=2048,
                         help="Maximum sequence length")
@@ -154,7 +152,7 @@ def parse_arguments():
                         help="LoRA rank")
     
     # Dataset configuration
-    parser.add_argument("--dataset_size", type=int, default=500,
+    parser.add_argument("--dataset_size", type=int, default=512,
                         help="Number of samples to use from dataset")
     
     # Logging configuration
@@ -162,9 +160,6 @@ def parse_arguments():
                         help="Log every N steps")
     parser.add_argument("--log_dir", type=str, default="logs",
                         help="Directory for logs")
-    # Compilation
-    parser.add_argument("--use_torch_compile", action="store_true",
-                        help="Use torch.compile() for faster training")
     
     return parser.parse_args()
 
@@ -181,7 +176,6 @@ if __name__ == "__main__":
     print(f"Learning rate: {args.learning_rate}")
     print(f"LoRA rank: {args.lora_rank}")
     print(f"Dataset size: {args.dataset_size}")
-    print(f"Torch compile: {args.use_torch_compile}")
     print(f"{'='*60}\n")
     
     main(args)
\ No newline at end of file