unslothai/unsloth/main (376k tokens)
```
├── .gitattributes (omitted)
├── .github/
   ├── FUNDING.yml (200 tokens)
   ├── ISSUE_TEMPLATE/
      ├── bug---issue.md (100 tokens)
      ├── feature-request.md (100 tokens)
   ├── workflows/
      ├── stale.yml (300 tokens)
├── .gitignore (700 tokens)
├── CODE_OF_CONDUCT.md (1100 tokens)
├── CONTRIBUTING.md (400 tokens)
├── LICENSE (omitted)
├── README.md (5.6k tokens)
├── images/
   ├── Assistant.png
   ├── Colab.png
   ├── Discord button.png
   ├── Discord.png
   ├── Documentation Button.png
   ├── Free version button.png
   ├── Kaggle.png
   ├── Kofi button.png
   ├── LAION 2GPU.png
   ├── Merge.png
   ├── Run.png
   ├── Slim Orca 2GPUs.png
   ├── Terminal_Type.png
   ├── Where_Terminal.png
   ├── buy me a coffee button.png
   ├── documentation github button.png
   ├── documentation green button.png
   ├── documentation lighter.png
   ├── documentation white button.png
   ├── made with unsloth.png
   ├── ollama.png
   ├── peft x trl button.png
   ├── start free finetune button.png
   ├── unsloth end.png
   ├── unsloth loading page render.png
   ├── unsloth logo black text.png
   ├── unsloth logo only.png
   ├── unsloth logo white text.png
   ├── unsloth made with love.png
   ├── unsloth new logo.png
   ├── unsloth sticker.png
├── pyproject.toml (11.9k tokens)
├── tests/
   ├── __init__.py
   ├── qlora/
      ├── README.md (500 tokens)
      ├── test_hf_qlora_train_and_merge.py (1000 tokens)
      ├── test_unsloth_qlora_train_and_merge.py (1200 tokens)
   ├── saving/
      ├── gpt-oss-merge/
         ├── run_test.sh (100 tokens)
         ├── test_merged_model.py (300 tokens)
         ├── train_and_merge.py (500 tokens)
      ├── language_models/
         ├── test_merge_4bit_validation.py (1300 tokens)
         ├── test_merge_model_perplexity_llama-3.2.py (1500 tokens)
         ├── test_merge_model_perplexity_mistral.py (1800 tokens)
         ├── test_merge_model_perplexity_phi_4.py (1500 tokens)
         ├── test_merged_model_perplexity_llama-3.1-8b.py (1500 tokens)
         ├── test_merged_model_perplexity_qwen_2.5.py (1800 tokens)
         ├── test_push_to_hub_merged.py (1100 tokens)
         ├── test_push_to_hub_merged_sharded_index_file.py (1200 tokens)
         ├── test_save_merged_grpo_model.py (5.1k tokens)
      ├── non_peft/
         ├── test_mistral_non_peft.py (400 tokens)
         ├── test_whisper_non_peft.py (400 tokens)
      ├── test_unsloth_save.py (2.3k tokens)
      ├── text_to_speech_models/
         ├── test_csm.py (1000 tokens)
         ├── test_lasa.py (1400 tokens)
         ├── test_orpheus.py (1700 tokens)
         ├── test_whisper.py (1300 tokens)
      ├── vision_models/
         ├── test_index_file_sharded_model.py (1900 tokens)
         ├── test_push_to_hub_merged.py (1800 tokens)
         ├── test_save_merge_qwen2.5vl32B_model_ocr_benchmark.py (1900 tokens)
         ├── test_save_merge_vision_model_ocr_benchmark.py (1900 tokens)
   ├── test_model_registry.py (600 tokens)
   ├── utils/
      ├── __init__.py (200 tokens)
      ├── aime_eval.md (1300 tokens)
      ├── aime_eval.py (3.8k tokens)
      ├── cleanup_utils.py (1400 tokens)
      ├── data_utils.py (1100 tokens)
      ├── hf_utils.py (1600 tokens)
      ├── ocr_eval.md (600 tokens)
      ├── ocr_eval.py (2.4k tokens)
      ├── os_utils.py (800 tokens)
      ├── perplexity_eval.md (100 tokens)
      ├── perplexity_eval.py (600 tokens)
      ├── test_qat.py (1300 tokens)
├── unsloth-cli.py (2.4k tokens)
├── unsloth/
   ├── __init__.py (1900 tokens)
   ├── _auto_install.py (400 tokens)
   ├── chat_templates.py (26.3k tokens)
   ├── dataprep/
      ├── __init__.py (100 tokens)
      ├── synthetic.py (3.1k tokens)
      ├── synthetic_configs.py (800 tokens)
   ├── device_type.py (600 tokens)
   ├── import_fixes.py (1700 tokens)
   ├── kernels/
      ├── __init__.py (500 tokens)
      ├── cross_entropy_loss.py (2.9k tokens)
      ├── fast_lora.py (3.7k tokens)
      ├── flex_attention.py (1400 tokens)
      ├── fp8.py (4k tokens)
      ├── geglu.py (1400 tokens)
      ├── layernorm.py (1400 tokens)
      ├── moe/
         ├── LICENSE (6.9k tokens)
         ├── README.md (1200 tokens)
         ├── __init__.py
         ├── benchmark/
            ├── benchmark_fused_moe.py (2.8k tokens)
            ├── utils.py (1400 tokens)
         ├── grouped_gemm/
            ├── LICENSE (6.9k tokens)
            ├── __init__.py
            ├── interface.py (7.7k tokens)
            ├── kernels/
               ├── __init__.py
               ├── autotuning.py (2000 tokens)
               ├── backward.py (4.7k tokens)
               ├── forward.py (2.2k tokens)
               ├── tuning.py (1700 tokens)
            ├── reference/
               ├── __init__.py
               ├── layers/
                  ├── llama4_moe.py (3.4k tokens)
                  ├── qwen3_moe.py (2.6k tokens)
               ├── moe_block.py (1200 tokens)
               ├── moe_ops.py (800 tokens)
         ├── requirements.txt
         ├── tests/
            ├── __init__.py
            ├── common.py (2k tokens)
            ├── moe_utils.py (3.7k tokens)
            ├── run_qwen3_moe_tests.sh (200 tokens)
            ├── test_grouped_gemm.py (8.7k tokens)
            ├── test_llama4_moe.py (1600 tokens)
            ├── test_qwen3_moe.py (2k tokens)
      ├── rms_layernorm.py (2000 tokens)
      ├── rope_embedding.py (1400 tokens)
      ├── swiglu.py (700 tokens)
      ├── utils.py (5.9k tokens)
   ├── models/
      ├── __init__.py (300 tokens)
      ├── _utils.py (15.1k tokens)
      ├── cohere.py (3.9k tokens)
      ├── dpo.py (200 tokens)
      ├── falcon_h1.py (5.9k tokens)
      ├── gemma.py (3.5k tokens)
      ├── gemma2.py (4.4k tokens)
      ├── granite.py (4.5k tokens)
      ├── llama.py (26.5k tokens)
      ├── llama4.py (100 tokens)
      ├── loader.py (10k tokens)
      ├── loader_utils.py (1300 tokens)
      ├── mapper.py (7.9k tokens)
      ├── mistral.py (3.8k tokens)
      ├── qwen2.py (800 tokens)
      ├── qwen3.py (3.7k tokens)
      ├── qwen3_moe.py (1900 tokens)
      ├── rl.py (10.1k tokens)
      ├── rl_replacements.py (7.6k tokens)
      ├── vision.py (9.8k tokens)
   ├── ollama_template_mappers.py (16.6k tokens)
   ├── registry/
      ├── REGISTRY.md (700 tokens)
      ├── __init__.py (400 tokens)
      ├── _deepseek.py (1300 tokens)
      ├── _gemma.py (500 tokens)
      ├── _llama.py (800 tokens)
      ├── _mistral.py (600 tokens)
      ├── _phi.py (500 tokens)
      ├── _qwen.py (800 tokens)
      ├── registry.py (1100 tokens)
   ├── save.py (21.5k tokens)
   ├── tokenizer_utils.py (8.4k tokens)
   ├── trainer.py (1700 tokens)
   ├── utils/
      ├── __init__.py
      ├── hf_hub.py (400 tokens)
```


## /.github/FUNDING.yml

```yml path="/.github/FUNDING.yml" 
# These are supported funding model platforms

github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: unsloth
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']

```

## /.github/ISSUE_TEMPLATE/bug---issue.md

---
name: Bug / Issue
about: Bug / Issue
title: "[Bug] Please fill in your issue title here."
labels: bug
assignees: ''

---

1. Did you update? `pip install --upgrade unsloth unsloth_zoo`
2. `Colab` or `Kaggle` or local / cloud
3. Number of GPUs used (check with `nvidia-smi`)
4. Which notebook? Please link!
5. Which Unsloth version, TRL version, transformers version, PyTorch version?
6. Which trainer? `SFTTrainer`, `GRPOTrainer` etc

```python
# Put minimal code to reproduce the error here ### Remove your Hugging Face token! ###
```
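
To quickly gather the versions asked for in (5), a minimal sketch (uses only the standard library's `importlib.metadata` plus `torch`; adjust the package list as needed):

```python
# Print the package versions requested above (quick sketch).
from importlib.metadata import version, PackageNotFoundError
import torch

for pkg in ("unsloth", "unsloth_zoo", "trl", "transformers"):
    try:
        print(pkg, version(pkg))
    except PackageNotFoundError:
        print(pkg, "not installed")
print("torch", torch.__version__, "| CUDA", torch.version.cuda)
```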

🦥 You can also ask via our Reddit page: https://www.reddit.com/r/unsloth/


## /.github/ISSUE_TEMPLATE/feature-request.md

---
name: Feature Request
about: New features, model support, ideas
title: "[Feature]"
labels: feature request
assignees: ''

---

For new models, have you tried:
```python
# For a new architecture, first try loading it directly:
from unsloth import FastModel
model, tokenizer = FastModel.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    trust_remote_code = True,
)
# Or pass a custom transformers auto class via `auto_model`:
from transformers import AutoModelForSequenceClassification
model, tokenizer = FastModel.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    auto_model = AutoModelForSequenceClassification,
)
```


## /.github/workflows/stale.yml

```yml path="/.github/workflows/stale.yml" 
name: 'Inactive Issue Pinger'

on:
  schedule:
    - cron: '30 5 * * *' # Runs at 5:30 UTC every day

jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      issues: write

    steps:
      - uses: actions/stale@v9
        with:
          # The message to post on stale issues.
          # This message will ping the issue author.
          # Note: The stale bot action does not currently support a direct placeholder for the last commenter.
          # As a workaround, this message encourages any participant to reply.
          stale-issue-message: >
            Is this issue still important to you?
            Apologies in advance if we have missed this issue.
            For faster response times, please post on our Reddit community - https://www.reddit.com/r/unsloth - or our Discord server - https://discord.com/invite/unsloth

          # The number of days of inactivity before an issue is considered stale.
          days-before-issue-stale: 9999

          # Set to -1 to never close stale issues.
          days-before-issue-close: -1

          # A label to apply to stale issues.
          stale-issue-label: 'inactive'

          # The number of operations to perform per run to avoid rate limiting.
          operations-per-run: 500

          enable-statistics: false

```

## /.gitignore

```gitignore path="/.gitignore" 
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*.class
unsloth_compiled_cache/

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# UV
#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#uv.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc
.vscode

```

## /CODE_OF_CONDUCT.md


# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
  community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or advances of
  any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
  without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at support@unsloth.ai.
All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series of
actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within the
community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].

Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].

For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].

[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations


## /CONTRIBUTING.md

# 🦥 Contributing to Unsloth

Thank you for not only using Unsloth but also for being interested in helping out! We value all contributions, whether they come in the form of code, ideas, support for others or just by simply spreading the word of Unsloth! 💕

- **[Support the Community](https://github.com/unslothai/unsloth/issues)**: Answer questions, review pull requests, or assist others in discussions.
- **Fix Bugs**: Identify and resolve issues with the existing codebase.
- **Submit Ideas**: Request new features or share enhancements you'd like to see.
- **Develop Features**: Implement new functionality or improve existing tools which can be done via PRs.
- **[Improve Documentation](https://docs.unsloth.ai/)**: Help by creating guides, FAQs, or enhancing clarity.

One of the best ways to support us is by spreading the word about Unsloth! Share how it’s powering your amazing projects in blog posts or social media, and inspire others to explore its potential. Even a simple star on our repo goes a long way in showing your support and helping the community grow. 🌟

## Submitting Issues
If you find a bug or have a feature idea, we’d love to hear from you! Here’s how to make your submission stand out:

### Reporting Bugs
1. **Search First**: Check if the issue has already been reported using GitHub’s search bar under Issues.
2. **Details Matter**: Is this on Google Colab, Kaggle, or on another platform service? Are you using Unsloth's official notebook? Include your OS, Python version, and other relevant details. For bugs, a concise code snippet that reproduces the issue is incredibly helpful.
3. **Be Thorough**: Attach screenshots, traceback logs, or any additional information that might speed up resolution.

## Spread the Word
Your support extends beyond code:
- Spread the word by writing about Unsloth in blogs or social media.
- Share how Unsloth powers your projects.
- Star our repository to show your appreciation.

Finally, please be mindful of our [Code of Conduct](https://github.com/unslothai/unsloth/blob/main/CODE_OF_CONDUCT.md) to ensure a welcoming and inclusive environment for everyone.

Thank you so much for reading and we hope you have lots of fun using Unsloth! 🦥


## /README.md

<div align="center">

  <a href="https://docs.unsloth.ai"><picture>
    <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20white%20text.png">
    <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20black%20text.png">
    <img alt="unsloth logo" src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20black%20text.png" height="110" style="max-width: 100%;">
  </picture></a>
  
<a href="https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-Fine-tuning.ipynb"><img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/start free finetune button.png" width="154"></a>
<a href="https://discord.com/invite/unsloth"><img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/Discord button.png" width="165"></a>
<a href="https://docs.unsloth.ai"><img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Documentation%20Button.png" width="137"></a>

### Train gpt-oss, DeepSeek, Gemma, Qwen & Llama 2x faster with 70% less VRAM!

![](https://i.ibb.co/sJ7RhGG/image-41.png)

</div>

## ✨ Train for Free

Notebooks are beginner friendly. Read our [guide](https://docs.unsloth.ai/get-started/fine-tuning-guide). Add dataset, run, then export your trained model to GGUF, Ollama, vLLM or Hugging Face.

| Model | Free Notebooks | Performance | Memory use |
|-----------|---------|--------|----------|
| **gpt-oss (20B)**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-Fine-tuning.ipynb)               | 1.5x faster | 70% less |
| **Qwen3 (14B)**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(14B)-Reasoning-Conversational.ipynb)               | 2x faster | 70% less |
| **DeepSeek-OCR (3B)**    | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_(3B).ipynb)               | 1.5x faster | 30% less |
| **gpt-oss (20B): GRPO**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb)               | 2x faster | 80% less |
| **Qwen3-VL (8B): GSPO**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_(8B)-Vision-GRPO.ipynb)               | 1.5x faster | 80% less |
| **Qwen3-VL (8B)**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_(8B)-Vision.ipynb)               | 2x faster | 50% less |
| **Gemma 3 (270M)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(270M).ipynb)               | 1.7x faster | 60% less |
| **Gemma 3n (4B)**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_(4B)-Conversational.ipynb)               | 1.5x faster | 50% less |
| **Llama 3.1 (8B)**      | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb)               | 2x faster | 70% less |
| **Orpheus-TTS (3B)**     | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_(3B)-TTS.ipynb)               | 1.5x faster | 50% less |

- See all our notebooks for: [Kaggle](https://github.com/unslothai/notebooks?tab=readme-ov-file#-kaggle-notebooks), [GRPO](https://docs.unsloth.ai/get-started/unsloth-notebooks#grpo-reasoning-rl-notebooks), **[TTS](https://docs.unsloth.ai/get-started/unsloth-notebooks#text-to-speech-tts-notebooks)** & [Vision](https://docs.unsloth.ai/get-started/unsloth-notebooks#vision-multimodal-notebooks)
- See [all our models](https://docs.unsloth.ai/get-started/all-our-models) and [all our notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks)
- See detailed documentation for Unsloth [here](https://docs.unsloth.ai/)

## ⚡ Quickstart
### Linux or WSL
```bash
pip install unsloth
```
### Windows
For Windows, `pip install unsloth` works only if you already have PyTorch installed. Read our [Windows Guide](https://docs.unsloth.ai/get-started/installing-+-updating/windows-installation).
### Docker
Use our official [Unsloth Docker image](https://hub.docker.com/r/unsloth/unsloth) `unsloth/unsloth`. Read our [Docker Guide](https://docs.unsloth.ai/get-started/install-and-update/docker).
### Blackwell & DGX Spark
For RTX 50-series, RTX 6000 and B200 GPUs: `pip install unsloth`. Read our [Blackwell Guide](https://docs.unsloth.ai/basics/training-llms-with-blackwell-rtx-50-series-and-unsloth) and [DGX Spark Guide](https://docs.unsloth.ai/new/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth) for more details.

## 🦥 Unsloth News
- **DeepSeek-OCR**: Fine-tune to improve DeepSeek-OCR's language understanding by 89%. [Guide](https://docs.unsloth.ai/new/deepseek-ocr-run-and-fine-tune) • [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_(3B).ipynb)
- **Docker**: Use Unsloth with no setup & environment issues with our new image. [Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker) • [Docker image](https://hub.docker.com/r/unsloth/unsloth)
- **gpt-oss RL**: Introducing the fastest possible inference for gpt-oss RL! [Read blog](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning)
- **Vision RL**: You can now train VLMs with GRPO or GSPO in Unsloth! [Read guide](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl)
- **Quantization-Aware Training**: We collaborated with PyTorch on QAT, recovering as much as 70% accuracy. [Read blog](https://docs.unsloth.ai/new/quantization-aware-training-qat)
- **Memory-efficient RL**: We're introducing even more memory-efficient RL. Our new kernels & algorithms allow faster RL with 50% less VRAM & 10× more context. [Read blog](https://docs.unsloth.ai/new/memory-efficient-rl)
- **gpt-oss** by OpenAI: For details on [Unsloth Flex Attention](https://docs.unsloth.ai/new/long-context-gpt-oss-training), long-context training and bug fixes, [read our guide](https://docs.unsloth.ai/basics/gpt-oss). The 20B model works on a 14GB GPU and the 120B on 65GB of VRAM. [gpt-oss uploads](https://huggingface.co/collections/unsloth/gpt-oss-6892433695ce0dee42f31681).
- **Gemma 3n** by Google: [Read Blog](https://docs.unsloth.ai/basics/gemma-3n-how-to-run-and-fine-tune). We [uploaded GGUFs, 4-bit models](https://huggingface.co/collections/unsloth/gemma-3n-685d3874830e49e1c93f9339).
- **[Text-to-Speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning)** is now supported, including `sesame/csm-1b` and STT `openai/whisper-large-v3`.
- **[Qwen3](https://docs.unsloth.ai/basics/qwen3-how-to-run-and-fine-tune)** is now supported. Qwen3-30B-A3B fits on 17.5GB VRAM.
- Introducing **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants that set new benchmarks on 5-shot MMLU & Aider Polyglot.
- [**EVERYTHING** is now supported](https://unsloth.ai/blog/gemma3#everything) - all models (TTS, BERT, Mamba), FFT, etc. [MultiGPU](https://docs.unsloth.ai/basics/multi-gpu-training-with-unsloth) coming soon. Enable FFT with `full_finetuning = True`, 8-bit with `load_in_8bit = True`.

<details>
  <summary>Click for more news</summary>


- 📣 [DeepSeek-R1](https://unsloth.ai/blog/deepseek-r1) - run or fine-tune them [with our guide](https://unsloth.ai/blog/deepseek-r1). All model uploads: [here](https://huggingface.co/collections/unsloth/deepseek-r1-all-versions-678e1c48f5d2fce87892ace5).
- 📣 Introducing Long-context [Reasoning (GRPO)](https://unsloth.ai/blog/grpo) in Unsloth. Train your own reasoning model with just 5GB VRAM. Transform Llama, Phi, Mistral etc. into reasoning LLMs!
- 📣 Introducing Unsloth [Dynamic 4-bit Quantization](https://unsloth.ai/blog/dynamic-4bit)! We dynamically opt not to quantize certain parameters and this greatly increases accuracy while only using <10% more VRAM than BnB 4-bit. See our collection on [Hugging Face here.](https://huggingface.co/collections/unsloth/unsloth-4-bit-dynamic-quants-67503bb873f89e15276c44e7)
- 📣 **[Llama 4](https://unsloth.ai/blog/llama4)** by Meta, including Scout & Maverick are now supported.
- 📣 [Phi-4](https://unsloth.ai/blog/phi4) by Microsoft: We also [fixed bugs](https://unsloth.ai/blog/phi4) in Phi-4 and [uploaded GGUFs, 4-bit](https://huggingface.co/collections/unsloth/phi-4-all-versions-677eecf93784e61afe762afa).
- 📣 [Vision models](https://unsloth.ai/blog/vision) now supported! [Llama 3.2 Vision (11B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb), [Qwen 2.5 VL (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_VL_(7B)-Vision.ipynb) and [Pixtral (12B) 2409](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_(12B)-Vision.ipynb)
- 📣 [Llama 3.3 (70B)](https://huggingface.co/collections/unsloth/llama-33-all-versions-67535d7d994794b9d7cf5e9f), Meta's latest model is supported.
- 📣 We worked with Apple to add [Cut Cross Entropy](https://arxiv.org/abs/2411.09009). Unsloth now supports 89K context for Meta's Llama 3.3 (70B) on a 80GB GPU - 13x longer than HF+FA2. For Llama 3.1 (8B), Unsloth enables 342K context, surpassing its native 128K support.
- 📣 We found and helped fix a [gradient accumulation bug](https://unsloth.ai/blog/gradient)! Please update Unsloth and transformers.
- 📣 We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support [4x longer context windows](https://unsloth.ai/blog/long-context)!
</details>

## 🔗 Links and Resources
| Type                            | Links                               |
| ------------------------------- | --------------------------------------- |
| <img width="15" src="https://redditinc.com/hs-fs/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" />&nbsp; **r/unsloth Reddit**                    | [Join Reddit community](https://reddit.com/r/unsloth)|
| 📚 **Documentation & Wiki**              | [Read Our Docs](https://docs.unsloth.ai) |
| <img width="16" src="https://upload.wikimedia.org/wikipedia/commons/6/6f/Logo_of_Twitter.svg" />&nbsp; **Twitter (aka X)**              |  [Follow us on X](https://twitter.com/unslothai)|
| 💾 **Installation**               | [Pip & Docker Install](https://docs.unsloth.ai/get-started/installing-+-updating)|
| 🔮 **Our Models**            | [Unsloth Catalog](https://docs.unsloth.ai/get-started/all-our-models)|
| ✍️ **Blog**                    | [Read our Blogs](https://unsloth.ai/blog)|

## ⭐ Key Features
- Supports **full-finetuning**, pretraining, 4-bit, 16-bit and **8-bit** training
- Supports **all models** including [TTS](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), multimodal, [BERT](https://docs.unsloth.ai/get-started/unsloth-notebooks#other-important-notebooks) and more! Any model that works in transformers, works in Unsloth.
- The most efficient library for [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), using 80% less VRAM. Supports GRPO, GSPO, DrGRPO, DAPO etc.
- **0% loss in accuracy** - no approximation methods - all exact.
- Supports NVIDIA (since 2018), [AMD](https://docs.unsloth.ai/get-started/install-and-update/amd) and Intel GPUs. Minimum CUDA Capability 7.0 (V100, T4, Titan V, RTX 20, 30, 40x, A100, H100, L40 etc)
- Works on **Linux**, WSL and **Windows**
- All kernels written in [OpenAI's Triton](https://openai.com/index/triton/) language. Manual backprop engine.
- If you trained a model with 🦥Unsloth, you can use this cool sticker! &nbsp; <img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/made with unsloth.png" width="200" align="center" />

## 💾 Install Unsloth
You can also see our docs for more detailed installation and updating instructions [here](https://docs.unsloth.ai/get-started/installing-+-updating).

Unsloth supports Python 3.13 or lower.

### Pip Installation
**Install with pip (recommended) for Linux devices:**
```
pip install unsloth
```
**To update Unsloth:**
```
pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
```
See [here](https://github.com/unslothai/unsloth/edit/main/README.md#advanced-pip-installation) for advanced pip install instructions.
### Windows Installation

1. **Install NVIDIA Video Driver:**
   You should install the latest version of your GPU's driver. Download drivers here: [NVIDIA GPU Drivers](https://www.nvidia.com/Download/index.aspx).

2. **Install Visual Studio C++:**
   You will need Visual Studio with C++ installed. By default, C++ is not installed with [Visual Studio](https://visualstudio.microsoft.com/vs/community/), so make sure you select all of the C++ options. Also select the options for the Windows 10/11 SDK. For detailed instructions, see [here](https://docs.unsloth.ai/get-started/installing-+-updating).

3. **Install CUDA Toolkit:**
   Follow the instructions to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive).

4. **Install PyTorch:**
   You will need the version of PyTorch that is compatible with your CUDA drivers, so make sure to select it carefully.
   [Install PyTorch](https://pytorch.org/get-started/locally/).

5. **Install Unsloth:**

   ```bash
   pip install unsloth
   ```

#### Notes
To run Unsloth directly on Windows:
- Install Triton from this Windows fork and follow the instructions [here](https://github.com/woct0rdho/triton-windows) (be aware that the Windows fork requires PyTorch >= 2.4 and CUDA 12)
- In the `SFTConfig`, set `dataset_num_proc=1` to avoid a crashing issue:
```python
SFTConfig(
    dataset_num_proc=1,
    ...
)
```

#### Advanced/Troubleshooting

For **advanced installation instructions** or if you see weird errors during installation:

First, try installing in an isolated environment, then run `pip install unsloth`:
```bash
python -m venv unsloth
source unsloth/bin/activate
pip install unsloth
```

1. Install `torch` and `triton`. Go to https://pytorch.org to install them. For example `pip install torch torchvision torchaudio triton`
2. Confirm that CUDA is installed correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or the CUDA drivers.
3. Install `xformers` manually via:
   ```bash
   pip install ninja
   pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
   ```
   Check that `xformers` succeeded with `python -m xformers.info`. See https://github.com/facebookresearch/xformers. Another option is to install `flash-attn` for Ampere GPUs and skip `xformers`.
4. For GRPO runs, try installing `vllm` and check that `pip install vllm` succeeds.
5. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another (a version-check sketch follows this list). The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
6. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`
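
A minimal version-check sketch for step 5 (assumes `torch` is installed; the other packages are probed and reported as missing if absent):

```python
# Minimal environment sanity check (sketch): prints the versions needed for compatibility checks.
import sys, platform
from importlib.metadata import version
import torch

print("Python   :", sys.version.split()[0], platform.machine())
print("PyTorch  :", torch.__version__, "| CUDA", torch.version.cuda,
      "| cuDNN", torch.backends.cudnn.version())
print("GPU      :", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none")

for pkg in ("triton", "xformers", "bitsandbytes", "unsloth"):
    try:
        print(f"{pkg:<9}:", version(pkg))
    except Exception:
        print(f"{pkg:<9}: not installed")
```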

### Conda Installation (Optional)
`⚠️Only use Conda if you have it. If not, use Pip`. Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. We support `python=3.10,3.11,3.12`.
```bash
conda create --name unsloth_env \
    python=3.11 \
    pytorch-cuda=12.1 \
    pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
    -y
conda activate unsloth_env

pip install unsloth
```

<details>
  <summary>If you're looking to install Conda in a Linux environment, <a href="https://docs.anaconda.com/miniconda/">read here</a>, or run the below 🔽</summary>
  
  ```bash
  mkdir -p ~/miniconda3
  wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
  bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
  rm -rf ~/miniconda3/miniconda.sh
  ~/miniconda3/bin/conda init bash
  ~/miniconda3/bin/conda init zsh
  ```
</details>

### Advanced Pip Installation
`⚠️Do NOT use this if you have Conda.` Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2, 2.3, 2.4, 2.5` and CUDA versions.

For other torch versions, we support `torch211`, `torch212`, `torch220`, `torch230`, `torch240` and for CUDA versions, we support `cu118` and `cu121` and `cu124`. For Ampere devices (A100, H100, RTX3090) and above, use `cu118-ampere` or `cu121-ampere` or `cu124-ampere`.

For example, if you have `torch 2.4` and `CUDA 12.1`, use:
```bash
pip install --upgrade pip
pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
```

Another example, if you have `torch 2.5` and `CUDA 12.4`, use:
```bash
pip install --upgrade pip
pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"
```

And other examples:
```bash
pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118-torch240] @ git+https://github.com/unslothai/unsloth.git"

pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"

pip install "unsloth[cu121-torch250] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu124-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git"
```

Or, run the below in a terminal to get the **optimal** pip installation command:
```bash
wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -
```

Or, run the below manually in a Python REPL:
```python
try: import torch
except: raise ImportError('Install torch via `pip install torch`')
from packaging.version import Version as V
import re
v = V(re.match(r"[0-9\.]{3,}", torch.__version__).group(0))
cuda = str(torch.version.cuda)
is_ampere = torch.cuda.get_device_capability()[0] >= 8
USE_ABI = torch._C._GLIBCXX_USE_CXX11_ABI
if cuda not in ("11.8", "12.1", "12.4", "12.6", "12.8"): raise RuntimeError(f"CUDA = {cuda} not supported!")
if   v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
elif v  < V('2.3.0'): x = 'cu{}{}-torch220'
elif v  < V('2.4.0'): x = 'cu{}{}-torch230'
elif v  < V('2.5.0'): x = 'cu{}{}-torch240'
elif v  < V('2.5.1'): x = 'cu{}{}-torch250'
elif v <= V('2.5.1'): x = 'cu{}{}-torch251'
elif v  < V('2.7.0'): x = 'cu{}{}-torch260'
elif v  < V('2.7.9'): x = 'cu{}{}-torch270'
elif v  < V('2.8.0'): x = 'cu{}{}-torch271'
elif v  < V('2.8.9'): x = 'cu{}{}-torch280'
else: raise RuntimeError(f"Torch = {v} too new!")
if v > V('2.6.9') and cuda not in ("11.8", "12.6", "12.8"): raise RuntimeError(f"CUDA = {cuda} not supported!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
```
### Docker Installation
You can use our pre-built Docker container with all dependencies to use Unsloth instantly with no setup required.
[Read our guide](https://docs.unsloth.ai/get-started/install-and-update/docker).

This container requires installing [NVIDIA's Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).

```bash
docker run -d -e JUPYTER_PASSWORD="mypassword" \
  -p 8888:8888 -p 2222:22 \
  -v $(pwd)/work:/workspace/work \
  --gpus all \
  unsloth/unsloth
```

Access Jupyter Lab at `http://localhost:8888` and start fine-tuning!

## 📜 Documentation
- Go to our official [Documentation](https://docs.unsloth.ai) for [running models](https://docs.unsloth.ai/basics/running-and-saving-models), [saving to GGUF](https://docs.unsloth.ai/basics/running-and-saving-models/saving-to-gguf), [checkpointing](https://docs.unsloth.ai/basics/finetuning-from-last-checkpoint), [evaluation](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide#evaluation) and more!
- Read our Guides for: [Fine-tuning](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [Reinforcement Learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), [Text-to-Speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), [Vision](https://docs.unsloth.ai/basics/vision-fine-tuning) and [any model](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms).
- We support Hugging Face's transformers, TRL, Trainer, Seq2SeqTrainer and PyTorch code.

Unsloth example code to fine-tune gpt-oss-20b:

```python
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Get LAION dataset
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files = {"train" : url}, split = "train")

# 4-bit pre-quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", #or choose any model

] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4-bit quantization. False = 16-bit LoRA.
    load_in_8bit = False, # 8-bit quantization
    load_in_16bit = False, # [NEW!] 16-bit LoRA
    full_finetuning = False, # Use for full fine-tuning.
    # token = "hf_...", # use one if using gated models
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = SFTConfig(
        max_seq_length = max_seq_length,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)
trainer.train()

# Go to https://docs.unsloth.ai for advanced tips like
# (1) Saving to GGUF / merging to 16bit for vLLM
# (2) Continued training from a saved LoRA adapter
# (3) Adding an evaluation loop / OOMs
# (4) Customized chat templates
```
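
As a follow-up to tip (1) in the comments above, a minimal saving sketch based on Unsloth's saving utilities documented at https://docs.unsloth.ai (the output paths and GGUF quantization choice are placeholders):

```python
# Minimal saving sketch, continuing the example above (see https://docs.unsloth.ai for full options).
# Merge the LoRA adapter into 16-bit weights for vLLM / transformers:
model.save_pretrained_merged("outputs/merged_16bit", tokenizer, save_method = "merged_16bit")

# Export to GGUF for llama.cpp / Ollama (quantization method is a placeholder choice):
model.save_pretrained_gguf("outputs/gguf", tokenizer, quantization_method = "q4_k_m")

# Or just save the LoRA adapter for continued training later:
model.save_pretrained("outputs/lora")
tokenizer.save_pretrained("outputs/lora")
```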

<a name="RL"></a>
## 💡 Reinforcement Learning
[RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) including [GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#training-with-grpo), [GSPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning), DrGRPO, DAPO, PPO, Reward Modelling, Online DPO all work with Unsloth.
Read our [Reinforcement Learning Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or our [advanced RL docs](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation) for batching, generation & training parameters.

List of RL notebooks:
- gpt-oss GSPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb)
- Qwen2.5-VL GSPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb)
- Advanced Qwen3 GRPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
- ORPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-ORPO.ipynb)
- DPO Zephyr notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_(7B)-DPO.ipynb)
- KTO notebook: [Link](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing)
- SimPO notebook: [Link](https://colab.research.google.com/drive/1Hs5oQDovOay4mFA6Y9lQhVJ8TnbFLFh2?usp=sharing)
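
These notebooks pair an Unsloth-patched model with TRL's RL trainers. Below is a minimal GRPO-style sketch; the model, dataset, and reward function are illustrative placeholders, and TRL's `GRPOConfig`/`GRPOTrainer` API may vary across versions:

```python
# Minimal GRPO sketch with an Unsloth-patched model (illustrative, not from the docs verbatim).
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import load_dataset

model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/Qwen3-4B-Base",   # placeholder model choice
    max_seq_length = 2048,
    load_in_4bit = True,
)
model = FastLanguageModel.get_peft_model(model, r = 16, lora_alpha = 16)

# Toy reward: prefer shorter completions (replace with your task's reward).
def length_reward(completions, **kwargs):
    return [-len(c) / 1000.0 for c in completions]

# Placeholder prompt dataset with a "prompt" column.
dataset = load_dataset("trl-lib/tldr", split = "train[:1%]")

trainer = GRPOTrainer(
    model = model,
    reward_funcs = [length_reward],
    args = GRPOConfig(
        output_dir = "grpo_outputs",
        max_steps = 30,
        per_device_train_batch_size = 8,  # must be divisible by num_generations
        num_generations = 8,
    ),
    train_dataset = dataset,
    processing_class = tokenizer,
)
trainer.train()
```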

## 🥇 Performance Benchmarking
- For our most detailed benchmarks, read our [Llama 3.3 Blog](https://unsloth.ai/blog/llama3-3).
- Benchmarking of Unsloth was also conducted by [🤗Hugging Face](https://huggingface.co/blog/unsloth-trl).

We tested using the Alpaca dataset, a batch size of 2, gradient accumulation steps of 4, rank = 32, and applied QLoRA on all linear layers (q, k, v, o, gate, up, down):
  
| Model          | VRAM  | 🦥 Unsloth speed | 🦥 VRAM reduction | 🦥 Longer context | 😊 Hugging Face + FA2 |
|----------------|-------|-----------------|----------------|----------------|--------------------|
| Llama 3.3 (70B)| 80GB  | 2x              | >75%           | 13x longer     | 1x                 |
| Llama 3.1 (8B) | 80GB  | 2x              | >70%           | 12x longer     | 1x                 |

### Context length benchmarks

#### Llama 3.1 (8B) max. context length
We tested Llama 3.1 (8B) Instruct and applied 4-bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 and a batch size of 1. We padded all sequences to a fixed maximum sequence length to mimic long-context finetuning workloads.
| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
|----------|-----------------------|-----------------|
| 8 GB     | 2,972                 | OOM             |
| 12 GB    | 21,848                | 932             |
| 16 GB    | 40,724                | 2,551           |
| 24 GB    | 78,475                | 5,789           |
| 40 GB    | 153,977               | 12,264          |
| 48 GB    | 191,728               | 15,502          |
| 80 GB    | 342,733               | 28,454          |

#### Llama 3.3 (70B) max. context length
We tested Llama 3.3 (70B) Instruct on an 80GB A100 and applied 4-bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 and a batch size of 1. We padded all sequences to a fixed maximum sequence length to mimic long-context finetuning workloads.

| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
|----------|------------------------|------------------|
| 48 GB    | 12,106                | OOM              |
| 80 GB    | 89,389                | 6,916            |

<br>

![](https://i.ibb.co/sJ7RhGG/image-41.png)
<br>

### Citation

You can cite the Unsloth repo as follows:
```bibtex
@software{unsloth,
  author = {Daniel Han and Michael Han and Unsloth team},
  title = {Unsloth},
  url = {http://github.com/unslothai/unsloth},
  year = {2023}
}
```

### Thank You to
- The [llama.cpp library](https://github.com/ggml-org/llama.cpp) that lets users save models with Unsloth
- The Hugging Face team and their libraries: [transformers](https://github.com/huggingface/transformers) and [TRL](https://github.com/huggingface/trl)
- The PyTorch and [Torch AO](https://github.com/unslothai/unsloth/pull/3391) teams for their contributions
- [Erik](https://github.com/erikwijmans) for his help adding [Apple's ML Cross Entropy](https://github.com/apple/ml-cross-entropy) in Unsloth
- [Etherl](https://github.com/Etherll) for adding support for [TTS, diffusion and BERT models](https://github.com/unslothai/notebooks/pull/34)
- And of course for every single person who has contributed or has used Unsloth!


## /images/Assistant.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Assistant.png

## /images/Colab.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Colab.png

## /images/Discord button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Discord button.png

## /images/Discord.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Discord.png

## /images/Documentation Button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Documentation Button.png

## /images/Free version button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Free version button.png

## /images/Kaggle.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Kaggle.png

## /images/Kofi button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Kofi button.png

## /images/LAION 2GPU.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/LAION 2GPU.png

## /images/Merge.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Merge.png

## /images/Run.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Run.png

## /images/Slim Orca 2GPUs.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Slim Orca 2GPUs.png

## /images/Terminal_Type.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Terminal_Type.png

## /images/Where_Terminal.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Where_Terminal.png

## /images/buy me a coffee button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/buy me a coffee button.png

## /images/documentation github button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation github button.png

## /images/documentation green button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation green button.png

## /images/documentation lighter.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation lighter.png

## /images/documentation white button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation white button.png

## /images/made with unsloth.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/made with unsloth.png

## /images/ollama.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/ollama.png

## /images/peft x trl button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/peft x trl button.png

## /images/start free finetune button.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/start free finetune button.png

## /images/unsloth end.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth end.png

## /images/unsloth loading page render.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth loading page render.png

## /images/unsloth logo black text.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo black text.png

## /images/unsloth logo only.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo only.png

## /images/unsloth logo white text.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo white text.png

## /images/unsloth made with love.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth made with love.png

## /images/unsloth new logo.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth new logo.png

## /images/unsloth sticker.png

Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth sticker.png

## /pyproject.toml

```toml path="/pyproject.toml" 
[build-system]
requires = ["setuptools==80.9.0", "setuptools-scm==9.2.0"]
build-backend = "setuptools.build_meta"

[project]
name = "unsloth"
dynamic = ["version"]
description = "2-5X faster training, reinforcement learning & finetuning"
readme = "README.md"
requires-python = ">=3.9,<3.14"
license = "Apache-2.0"
keywords = ["ai", "llm", "reinforcement learning", "machine learning", "artificial intelligence", "pytorch"]
authors = [
    {email = "info@unsloth.ai"},
    {name = "Unsloth AI team"},
]
maintainers = [
    {name = "Daniel Han", email = "danielhanchen@gmail.com"},
    {name = "Michael Han", email = "info@unsloth.ai"},
]
classifiers = [
    "Programming Language :: Python",
    "Environment :: GPU",
    "Environment :: GPU :: NVIDIA CUDA",
    "Topic :: Scientific/Engineering :: Artificial Intelligence",
]

[tool.setuptools.dynamic]
version = {attr = "unsloth.models._utils.__version__"}

[tool.setuptools]
include-package-data = false

[tool.setuptools.packages.find]
exclude = ["images*", "tests*", "kernels/moe*"]

[project.optional-dependencies]
triton = [
    "triton>=3.0.0 ; ('linux' in sys_platform)",
    "triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
huggingfacenotorch = [
    "wheel>=0.42.0",
    "packaging",
    "numpy",
    "tqdm",
    "psutil",
    "tyro",
    "protobuf",
    "sentencepiece>=0.2.0",
    "datasets>=3.4.1,!=4.0.*,!=4.1.0",
    "accelerate>=0.34.1",
    "peft>=0.7.1,!=0.11.0",
    "huggingface_hub>=0.34.0",
    "hf_transfer",
    "diffusers",
    "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,<=4.57.2",
    "trl>=0.18.2,!=0.19.0,<=0.24.0",
]
huggingface = [
    "unsloth[huggingfacenotorch]",
    "unsloth_zoo>=2025.11.1",
    "torchvision",
    "unsloth[triton]",
]
windows = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0 ; (sys_platform == 'win32')",
    "xformers>=0.0.22.post7 ; (sys_platform == 'win32')",
]
base = [
    "unsloth[huggingface]",
]
cu118only = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121only = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch211 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch211 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch212 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch212 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch220 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch220 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch230 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch230 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu118onlytorch240 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch240 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch240 = [
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch250 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch250 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch250 = [
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch251 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch251 = [
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch251 = [
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch260 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch260 = [
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu126onlytorch260 = [
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch270 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu126onlytorch270 = [
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu128onlytorch270 = [
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch271 = [
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu126onlytorch271 = [
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu128onlytorch271 = [
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu118onlytorch280 = [
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu126onlytorch280 = [
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu128onlytorch280 = [
    "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
    "xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu130onlytorch280 = [
]
cu126onlytorch290 = [
]
cu128onlytorch290 = [
]
cu130onlytorch290 = [
]
cu118 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118only]",
]
cu121 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121only]",
]
cu118-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu118onlytorch211]",
]
cu121-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu121onlytorch211]",
]
cu118-torch212 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu118onlytorch212]",
]
cu121-torch212 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu121onlytorch212]",
]
cu118-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch220]",
]
cu121-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch220]",
]
cu118-torch230 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch230]",
]
cu121-torch230 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch230]",
]
cu118-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch240]",
]
cu121-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch240]",
]
cu124-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch240]",
]
cu118-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch250]",
]
cu121-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch250]",
]
cu124-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch250]",
]
cu118-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch251]",
]
cu121-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch251]",
]
cu124-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch251]",
]
cu118-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch260]",
]
cu124-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch260]",
]
cu126-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch260]",
]
cu118-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch270]",
]
cu126-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch270]",
]
cu128-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch270]",
]
cu118-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch271]",
]
cu126-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch271]",
]
cu128-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch271]",
]
cu118-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch280]",
]
cu126-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch280]",
]
cu128-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch280]",
]
cu130-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu130onlytorch280]",
]
cu126-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch290]",
]
cu128-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch290]",
]
cu130-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu130onlytorch290]",
]
kaggle = [
    "unsloth[huggingface]",
]
kaggle-new = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
]
conda = [
    "unsloth[huggingface]",
]
colab-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu121onlytorch211]",
]
colab-ampere-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu121onlytorch211]",
    "packaging",
    "ninja",
    "flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch220]",
]
colab-ampere-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch220]",
    "packaging",
    "ninja",
    "flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-new = [
    "unsloth_zoo>=2025.11.1",
    "packaging",
    "tyro",
    "transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,<=4.57.2",
    "datasets>=3.4.1,!=4.0.*,!=4.1.0",
    "sentencepiece>=0.2.0",
    "tqdm",
    "psutil",
    "wheel>=0.42.0",
    "numpy",
    "protobuf",
    "huggingface_hub>=0.34.0",
    "hf_transfer",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[triton]",
]
colab-no-deps = [
    "accelerate>=0.34.1",
    "trl>=0.18.2,!=0.19.0,<=0.24.0",
    "peft>=0.7.1",
    "xformers ; ('linux' in sys_platform or sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "protobuf",
]
colab = [
    "unsloth[cu121]",
]
flashattention = [
    "packaging ; ('linux' in sys_platform)",
    "ninja ; ('linux' in sys_platform)",
    "flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-ampere = [
    "unsloth[colab-ampere-torch220]",
    "unsloth[flashattention]",
]
cu118-ampere = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118only]",
    "unsloth[flashattention]",
]
cu121-ampere = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121only]",
    "unsloth[flashattention]",
]
cu118-ampere-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu118onlytorch211]",
    "unsloth[flashattention]",
]
cu121-ampere-torch211 = [
    "unsloth[huggingface]",
    "bitsandbytes==0.45.5",
    "unsloth[cu121onlytorch211]",
    "unsloth[flashattention]",
]
cu118-ampere-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch220]",
    "unsloth[flashattention]",
]
cu121-ampere-torch220 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch220]",
    "unsloth[flashattention]",
]
cu118-ampere-torch230 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch230]",
    "unsloth[flashattention]",
]
cu121-ampere-torch230 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch230]",
    "unsloth[flashattention]",
]
cu118-ampere-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch240]",
    "unsloth[flashattention]",
]
cu121-ampere-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch240]",
    "unsloth[flashattention]",
]
cu124-ampere-torch240 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch240]",
    "unsloth[flashattention]",
]
cu118-ampere-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch250]",
    "unsloth[flashattention]",
]
cu121-ampere-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch250]",
    "unsloth[flashattention]",
]
cu124-ampere-torch250 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch250]",
    "unsloth[flashattention]",
]
cu118-ampere-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch251]",
    "unsloth[flashattention]",
]
cu121-ampere-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu121onlytorch251]",
    "unsloth[flashattention]",
]
cu124-ampere-torch251 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch251]",
    "unsloth[flashattention]",
]
cu118-ampere-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch260]",
    "unsloth[flashattention]",
]
cu124-ampere-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu124onlytorch260]",
    "unsloth[flashattention]",
]
cu126-ampere-torch260 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch260]",
    "unsloth[flashattention]",
]
cu118-ampere-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch270]",
    "unsloth[flashattention]",
]
cu126-ampere-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch270]",
    "unsloth[flashattention]",
]
cu128-ampere-torch270 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch270]",
    "unsloth[flashattention]",
]
cu118-ampere-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch271]",
    "unsloth[flashattention]",
]
cu126-ampere-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch271]",
    "unsloth[flashattention]",
]
cu128-ampere-torch271 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch271]",
    "unsloth[flashattention]",
]
cu118-ampere-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu118onlytorch280]",
    "unsloth[flashattention]",
]
cu126-ampere-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch280]",
    "unsloth[flashattention]",
]
cu128-ampere-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch280]",
    "unsloth[flashattention]",
]
cu130-ampere-torch280 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu130onlytorch280]",
    "unsloth[flashattention]",
]
cu126-ampere-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu126onlytorch290]",
]
cu128-ampere-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu128onlytorch290]",
]
cu130-ampere-torch290 = [
    "unsloth[huggingface]",
    "bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
    "unsloth[cu130onlytorch290]",
]
flashattentiontorch260abiFALSEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch260abiTRUEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch250abiFALSEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch250abiTRUEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch240abiFALSEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch240abiTRUEcu12x = [
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
    "flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
intelgputorch260 = [
    "unsloth_zoo[intelgpu]",
    "unsloth[huggingfacenotorch]",

    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp39-cp39-linux_x86_64.whl#sha256=147607f190a7d7aa24ba454def5977fbbfec792fdae18e4ed278cfec29b69271 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp310-cp310-linux_x86_64.whl#sha256=23aa423fa1542afc34f67eb3ba8ef20060f6d1b3a4697eaeab22b11c92b30f2b ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp311-cp311-linux_x86_64.whl#sha256=bcfa995229bbfd9ffd8d6c8d9f6428d393e876fa6e23ee3c20e3c0d73ca75ca5 ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp312-cp312-linux_x86_64.whl#sha256=bd340903d03470708df3442438acb8b7e08087ab9e61fbe349b2872bf9257ab0 ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp313-cp313-linux_x86_64.whl#sha256=814dccc8a07159e6eca74bed70091bc8fea2d9dd87b0d91845f9f38cde62f01c ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",

    "torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp39-cp39-linux_x86_64.whl#sha256=6a8adf6dc4c089406e8b3a7e58ab57a463bddf9b07130d2576e76eced43e92af ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=ff4561cbf07c83bbccaa0f6e9bb0e6dcf721bacd53c9c43c4eb0e7331b4792f9 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=12005f66b810ddd3ab93f86c4522bcfdd412cbd27fc9d189b661ff7509bc5e8a ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c4c5c67625cdacf35765c2b94e61fe166e3c3f4a14521b1212a59ad1b3eb0f2e ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=e6864f7a60a5ecc43d5d38f59a16e5dd132384f73dfd3a697f74944026038f7b ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
intel-gpu-torch260 = [
    "unsloth[intelgputorch260]"
]
intelgputorch270 = [
    "unsloth_zoo[intelgpu]",
    "unsloth[huggingfacenotorch]",

    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=749a7098492c6a27b356c97149a4a62973b953eae60bc1b6259260974f344913 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=44362e80abd752471a08341093321955b066daa2cfb4810e73b8e3b240850f93 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=faa6b8c945a837a080f641bc8ccc77a98fa66980dcd7e62e715fd853737343fd ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=40f6fb65b345dc9a61813abe7ac9a585f2c9808f414d140cc2a5f11f53ee063c ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=9821fe059de58e827ffc6aa10d69369b16c2f8c2a988b86bef9c2c6e396ab3aa ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",

    "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp39-cp39-linux_x86_64.whl#sha256=f8ee75e50fcbb37ed5b498299ca2264da99ab278a93fae2358e921e4a6e28273 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=d6fdc342961d98fdcd9d03dfd491a3208bb5f7fbb435841f8f72ce9fdcd2d026 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=74d07f9357df5cf2bf223ad3c84de16346bfaa0504f988fdd5590d3e177e5e86 ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c806d44aa2ca5d225629f6fbc6c994d5deaac2d2cde449195bc8e3522ddd219a ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=25d8277b7f01d42e2e014ccbab57a2692b6ec4eff8dcf894eda1b297407cf97a ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
intel-gpu-torch270 = [
    "unsloth[intelgputorch270]"
]
intelgputorch280 = [
    "unsloth_zoo[intelgpu]",
    "unsloth[huggingfacenotorch]",

    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=ac4d8e33986b1c3c5e48151640539272b2187e83016985853111b46fb82c3c94 ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=999fef4c1f711092b9d3086525920545df490de476ecebe899ffc777019ae17f ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=57b09c8c492985ff6a27cd3a22b08e8f7b96b407bd8030967b6efbb9f63b80cf ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=df4bb3282bac9a3b90231700077110d8680b338416de03c2b7c6133c9b602649 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=60da63c99ca827bdcb0df28e0298bf7d066dc607454c6d6176783cb4e79d838b ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",

    "torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp39-cp39-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",

    "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'x86_64')",

    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp39-cp39-manylinux_2_28_x86_64.whl#sha256=6e981c192045fc249c008441179ff237bb00174d818b875b0475730b63f0eaca ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=e5ba4805969277175ebfd59cc717093528cc6e3ada89ac2725fc7a3c1fee6169 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=74c39c144104416bc4c5ad8c26ab0c169dc5cc6be58059e01bc3665dd0ef676f ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=0acec355b80c3899841184084f365df336c508602812e34a44007b8b60d53af4 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp313-cp313-manylinux_2_28_x86_64.whl#sha256=e2109ae773dad27b98ca17681044b4f876563c37f2382b75de3a371399edcff8 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
]
intel-gpu-torch280 = [
    "unsloth[intelgputorch280]"
]
intelgputorch290 = [
    "unsloth_zoo[intelgpu]",
    "unsloth[huggingfacenotorch]",

    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=c169a1de14c19673b17c751290d467fa282fc90fa5da4314b2e5cdab1f553146 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=013d9dd5d6479bd22983161f462e61c8dbe1d82e6730624a7a8d5945507eaa61 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=afc8cabfbf7ed51fd278d1e0f88d6afc157b0201bad4b99d681e4d542f9e66d4 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=0d24c1716088f2764d0d24c64227732195b6a42706c3c5fc89eeb4904bfa0818 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",

    "torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=5afbe860ce991825a36b75706a523601087e414b77598ef0d9d3d565741c277d ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=607fe419c32d6e8e0556f745742e7cff1d0babce51f54be890e0c1422359c442 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=376bae584d89980b8e59934d248c38d5fa3b7d4687a4df1a19f4bc1d23dcc8c1 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=98d6a06dd7fb185874367b18bd609f05f16fdce4142a5980ca94461949965cd2 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",

    "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'x86_64')",

    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=cbfae2b79b7549fd368c2462fc8e94f8f26cc450782ee72138e908077c09a519 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=044fa36ef4b6b43edcd490b75c853fa4b3eb033c2bded29f8fbcf27734713c67 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=4b91e4bec1d740a6211f02578a79888550b73f3a4e1383035f8f6d72f587212c ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
    "torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp313-cp313-manylinux_2_28_x86_64.whl#sha256=88239e73ca37254bec84f29cd5887e10ff712de7edbbda3fbb3609cd6190d99e ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
]
intel-gpu-torch290 = [
    "unsloth[intelgputorch290]"
]
intel = [
    "unsloth[intelgputorch280]",
]
amd = [
    "unsloth[huggingfacenotorch]",
    "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
    "bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')",
]

[project.urls]
homepage = "http://www.unsloth.ai"
documentation = "https://github.com/unslothai/unsloth"
repository = "https://github.com/unslothai/unsloth"

```

## /tests/__init__.py

```py path="/tests/__init__.py" 

```

## /tests/qlora/README.md

## QLoRA Train and Merge Tests

### Overview
Tests that performing QLoRA training and then merging the weights to 16 bits post-training preserves the behavior of the trained model.

- `test_unsloth_qlora_train_and_merge.py`: Tests Unsloth QLoRA train and merge using the `FastLanguageModel.from_pretrained`, `FastLanguageModel.get_peft_model`, and `FastLanguageModel.save_pretrained_merged` APIs.
- `test_hf_qlora_train_and_merge.py`: Tests Hugging Face QLoRA train and merge using the `from_pretrained`, `get_peft_model`, and `merge_and_unload` APIs.
   - Demonstrates that `peft`'s `merge_and_unload` loses accuracy because it requantizes the base layer after merging the adapter weights, so the model still contains `Linear4bit` layers post merging.
   - I (@jeromeku) implemented a custom merge function that replaces all `LoraLayer`s with `Linear` layers whose weights are the dequantized base layer weights with the adapter weights merged in (compute done in fp32, then cast back to the original dtype), roughly equivalent to `FastLanguageModel.save_pretrained_merged`; see the sketch after this list.
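
Below is a minimal, hypothetical sketch of that dequantize-and-merge idea, assuming a `peft` `LoraLayer` wrapping a bitsandbytes `Linear4bit` base layer. The actual helper used by the test is `convert_lora_to_linear` in `tests/utils/hf_utils.py`; this sketch only illustrates the approach and its details may differ.

```py
import torch.nn as nn
import bitsandbytes.functional as bnb_F


def merge_lora_layer_to_linear(lora_layer) -> nn.Linear:
    """Illustrative only: dequantize the 4-bit base weight and merge the adapter in fp32."""
    base = lora_layer.base_layer  # bitsandbytes Linear4bit holding the quantized weight
    # Dequantize the 4-bit packed weight back to its original 16-bit dtype.
    W = bnb_F.dequantize_4bit(base.weight.data, quant_state=base.weight.quant_state)
    orig_dtype = W.dtype

    # Merge the active adapter in fp32: W_merged = W + (B @ A) * scaling.
    adapter = lora_layer.active_adapters[0]
    A = lora_layer.lora_A[adapter].weight
    B = lora_layer.lora_B[adapter].weight
    W = W.float() + (B.float() @ A.float()) * lora_layer.scaling[adapter]

    # Replace the whole LoraLayer with a plain Linear carrying the merged weight.
    merged = nn.Linear(A.shape[1], B.shape[0], bias=base.bias is not None)
    merged.weight.data = W.to(orig_dtype)
    if base.bias is not None:
        merged.bias.data = base.bias.data
    return merged
```

Because the replacement is a plain `nn.Linear`, the merged weights are never requantized, which is why this path preserves the trained behavior while `merge_and_unload` on a 4-bit base model does not.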

### Usage
Run the Unsloth test:
```bash
python tests/qlora/test_unsloth_qlora_train_and_merge.py
```
Run the Hugging Face test:
```bash
python tests/qlora/test_hf_qlora_train_and_merge.py
```

### Details
The tests train a QLoRA model on a single-prompt dataset:
```py
QUESTION = "What day was I born?"
ANSWER = "January 1, 2058"
USER_MESSAGE = {"role": "user", "content": QUESTION}
ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER}
```

Since the question is impossible to answer correctly without finetuning, the model can only be expected to produce the answer if it has been trained on the question.

To verify this, we sample the model's responses to the question before training, after training, and after merging, and check that the responses contain the answer after training and after merging but not before training, as sketched below.
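
In pseudo-form (the actual helpers are `sample_responses` and `check_responses` from `tests/utils`; the assertion pattern below is only an illustration):

```py
def answer_in_responses(responses: list[str], answer: str) -> bool:
    # The check is plain substring containment of the expected answer.
    return any(answer in response for response in responses)

# Expected pattern across the three checkpoints:
#   assert not answer_in_responses(responses_before_training, ANSWER)
#   assert answer_in_responses(responses_after_training, ANSWER)
#   assert answer_in_responses(responses_after_merging, ANSWER)
```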

### Results

For the Unsloth test, the model's behavior is as expected:
- before training, the model's response does not contain the answer
- after training, the model's response contains the answer
- after merging, the model's response contains the answer

For the Hugging Face test, the model's behavior is also as expected:
- before training, the model's response does not contain the answer
- after training, the model's response contains the answer
- after using peft's `merge_and_unload`, the model's response does not contain the answer
- after using my custom merge function, the model's response contains the answer

The scripts output the training params, the training logs, and the model responses before training, after training, and after merging (a response is printed in full only when it does not contain the answer).

## /tests/qlora/test_hf_qlora_train_and_merge.py

```py path="/tests/qlora/test_hf_qlora_train_and_merge.py" 
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ruff: noqa
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).parents[2]
sys.path.append(str(REPO_ROOT))

import itertools
from copy import deepcopy

import torch
from datasets import Dataset
from trl import SFTConfig
from tests.utils import header_footer_context
from tests.utils.data_utils import (
    ANSWER,
    DEFAULT_MESSAGES,
    USER_MESSAGE,
    check_responses,
    create_dataset,
    describe_peft_weights,
)
from tests.utils.hf_utils import (
    convert_lora_to_linear,
    fix_llama3_tokenizer,
    get_peft_config,
    sample_responses,
    setup_model,
    setup_tokenizer,
    setup_trainer,
)

if __name__ == "__main__":
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    dtype = torch.bfloat16
    max_steps = 100
    num_examples = 1000
    lora_rank = 64
    output_dir = "sft_test"
    seed = 42
    batch_size = 5
    num_generations = 5
    tokenizer = setup_tokenizer(model_name, fixup_funcs=[fix_llama3_tokenizer])
    temperature = 0.8
    max_new_tokens = 20

    peft_config = get_peft_config(lora_rank=lora_rank, target_modules="all-linear")
    model = setup_model(model_name, quantize=True, dtype=dtype, peft_config=peft_config)

    prompt = tokenizer.apply_chat_template(
        [USER_MESSAGE], tokenize=False, add_generation_prompt=True
    )
    with header_footer_context("Test Prompt and Answer"):
        print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}")

    dataset: Dataset = create_dataset(
        tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES
    )
    with header_footer_context("Dataset"):
        print(f"Dataset: {next(iter(dataset))}")

    training_args = SFTConfig(
        output_dir=output_dir,
        max_steps=max_steps,
        per_device_train_batch_size=batch_size,
        log_level="info",
        report_to="none",
        num_train_epochs=1,
        logging_steps=1,
        seed=seed,
        bf16=dtype == torch.bfloat16,
        fp16=dtype == torch.float16,
        save_strategy="no",
    )

    with header_footer_context("Train Args"):
        print(training_args)
        print(peft_config)

    trainer = setup_trainer(
        model, tokenizer, dataset, training_args, peft_config=peft_config
    )

    with header_footer_context("Model"):
        print(type(model.model))

    generation_args = {
        "num_generations": num_generations,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "skip_special_tokens": False,
        "dtype": dtype,
    }
    responses = sample_responses(
        model,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses before training"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

    with header_footer_context("Peft Weights before training"):
        for name, stats in itertools.islice(describe_peft_weights(model), 2):
            print(f"{name}:\n{stats}")

    output = trainer.train()
    with header_footer_context("Peft Weights after training"):
        for name, stats in itertools.islice(describe_peft_weights(model), 2):
            print(f"{name}:\n{stats}")

    with header_footer_context("Trainer Output"):
        print(output)

    responses = sample_responses(
        model,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses after training"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

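    # Keep an unmerged copy of the PEFT model so `peft`'s `merge_and_unload` can be
    # compared against the custom `convert_lora_to_linear` merge below.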
    model_copy = deepcopy(model)

    merged_model = convert_lora_to_linear(model)

    responses = sample_responses(
        merged_model,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses after custom merging to 16bit"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

    merged_model_peft = model_copy.merge_and_unload()
    responses = sample_responses(
        merged_model_peft,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses after peft merge_and_unload"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

```

## /tests/qlora/test_unsloth_qlora_train_and_merge.py

```py path="/tests/qlora/test_unsloth_qlora_train_and_merge.py" 
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ruff: noqa
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).parents[2]
sys.path.append(str(REPO_ROOT))

import itertools
from unsloth import FastLanguageModel

import torch
from datasets import Dataset
from trl import SFTConfig
from tests.utils import header_footer_context
from tests.utils.data_utils import (
    DEFAULT_MESSAGES,
    USER_MESSAGE,
    ANSWER,
    create_dataset,
    describe_peft_weights,
    check_responses,
)
from tests.utils.hf_utils import (
    sample_responses,
    setup_trainer,
)


def get_unsloth_model_and_tokenizer(
    model_name: str,
    max_seq_length: int,
    load_in_4bit: bool,
    fast_inference: bool,
    max_lora_rank: int | None = None,
    gpu_memory_utilization: float = 0.5,
    dtype: torch.dtype = torch.bfloat16,
):
    return FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        load_in_4bit=load_in_4bit,
        fast_inference=fast_inference,
        max_lora_rank=max_lora_rank,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
    )


def get_unsloth_peft_model(
    model,
    lora_rank: int,
    target_modules: list[str] | str = "all-linear",
    use_gradient_checkpointing: bool | str = False,
    random_state: int = 42,
):
    return FastLanguageModel.get_peft_model(
        model,
        r=lora_rank,
        target_modules=target_modules,
        lora_alpha=lora_rank,
        use_gradient_checkpointing=use_gradient_checkpointing,
        random_state=random_state,
    )


if __name__ == "__main__":
    model_name = "meta-llama/Llama-3.2-1B-Instruct"
    dtype = torch.bfloat16
    max_steps = 100
    num_examples = 1000
    lora_rank = 64
    output_dir = "sft_test"
    seed = 42
    batch_size = 5
    num_generations = 5
    target_modules = [
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ]
    gradient_checkpointing = False
    unsloth_merged_path = "unsloth_merged_16bit"

    model, tokenizer = get_unsloth_model_and_tokenizer(
        model_name,
        max_seq_length=512,
        load_in_4bit=True,
        fast_inference=False,
        max_lora_rank=lora_rank,
        dtype=dtype,
    )
    temperature = 0.8
    max_new_tokens = 20

    model = get_unsloth_peft_model(
        model,
        lora_rank=lora_rank,
        target_modules=target_modules,
        use_gradient_checkpointing=gradient_checkpointing,
        random_state=seed,
    )

    prompt = tokenizer.apply_chat_template(
        [USER_MESSAGE], tokenize=False, add_generation_prompt=True
    )

    with header_footer_context("Test Prompt and Answer"):
        print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}")

    dataset: Dataset = create_dataset(
        tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES
    )
    with header_footer_context("Dataset"):
        print(f"Dataset: {next(iter(dataset))}")

    training_args = SFTConfig(
        output_dir=output_dir,
        max_steps=max_steps,
        per_device_train_batch_size=batch_size,
        log_level="info",
        report_to="none",
        num_train_epochs=1,
        logging_steps=1,
        seed=seed,
        bf16=dtype == torch.bfloat16,
        fp16=dtype == torch.float16,
        save_strategy="no",
    )

    with header_footer_context("Train Args"):
        print(training_args)

    trainer = setup_trainer(model, tokenizer, dataset, training_args)

    with header_footer_context("Model"):
        print(type(model.model))

    generation_args = {
        "num_generations": num_generations,
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "skip_special_tokens": False,
        "dtype": dtype,
    }
    responses = sample_responses(
        model,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses before training"):
        check_responses(responses, answer=ANSWER, prompt=prompt)
    with header_footer_context("Peft Weights before training"):
        for name, stats in itertools.islice(describe_peft_weights(model), 2):
            print(f"{name}:\n{stats}")

    output = trainer.train()
    with header_footer_context("Peft Weights after training"):
        for name, stats in itertools.islice(describe_peft_weights(model), 2):
            print(f"{name}:\n{stats}")

    with header_footer_context("Trainer Output"):
        print(output)

    responses = sample_responses(
        model,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses after training"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

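    # Merge the LoRA adapters into the base weights and save as a 16-bit checkpoint,
    # then reload the merged model from disk to verify the trained behavior survives the merge.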
    model.save_pretrained_merged(
        unsloth_merged_path,
        tokenizer,
        save_method="merged_16bit",
    )
    merged_model_unsloth, tokenizer = get_unsloth_model_and_tokenizer(
        unsloth_merged_path,
        max_seq_length=512,
        load_in_4bit=False,
        fast_inference=False,
        dtype=dtype,
    )
    responses = sample_responses(
        merged_model_unsloth,
        tokenizer,
        prompt=prompt,
        **generation_args,
    )
    with header_footer_context("Responses after unsloth merge to 16bit"):
        check_responses(responses, answer=ANSWER, prompt=prompt)

```

## /tests/saving/gpt-oss-merge/run_test.sh

```sh path="/tests/saving/gpt-oss-merge/run_test.sh" 
#!/bin/bash
set -e

echo "================================================================"
echo "🚀 STEP 1: Running the training and merging script..."
echo "================================================================"
python train_and_merge.py

echo ""
echo "================================================================"
echo "✅ STEP 2: Training complete. Running the inference script..."
echo "================================================================"
python test_merged_model.py

echo ""
echo "================================================================"
echo "🎉 All steps completed successfully!"
echo "================================================================"

```

## /tests/saving/gpt-oss-merge/test_merged_model.py

```py path="/tests/saving/gpt-oss-merge/test_merged_model.py" 
# inference_on_merged.py
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
import gc
import os
import shutil

def safe_remove_directory(path):
    try:
        if os.path.exists(path) and os.path.isdir(path):
            shutil.rmtree(path)
            return True
        else:
            print(f"Path {path} is not a valid directory")
            return False
    except Exception as e:
        print(f"Failed to remove directory {path}: {e}")
        return False
pass

print("🔥 Loading the 16-bit merged model from disk...")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
    model_name="./gpt-oss-finetuned-merged",
    max_seq_length=1024,
    load_in_4bit=True,
    load_in_8bit=False,
)
print("✅ Merged model loaded successfully.")

# --- Run Inference ---
print("\n🚀 Running inference...")
messages = [
    {"role": "user", "content": "Solve x^5 + 3x^4 - 10 = 3."},
]
inputs = merged_tokenizer.apply_chat_template(
    messages,
    add_generation_prompt = True,
    return_tensors = "pt",
    return_dict = True,
    reasoning_effort = "low", # **NEW!** Set reasoning effort to low, medium or high
).to(merged_model.device)

_ = merged_model.generate(**inputs, max_new_tokens = 512, streamer = TextStreamer(merged_tokenizer))
print("\n✅ Inference complete.")

# --- Final Cleanup ---
print("\n🧹 Cleaning up merged model directory and cache...")
del merged_model, merged_tokenizer
torch.cuda.empty_cache()
gc.collect()

safe_remove_directory("./gpt-oss-finetuned-merged")
safe_remove_directory("./unsloth_compiled_cache") # Clean up cache created by this process
print("✅ Final cleanup complete. Exiting inference script.")

```

## /tests/saving/gpt-oss-merge/train_and_merge.py

```py path="/tests/saving/gpt-oss-merge/train_and_merge.py" 
# train_and_merge.py
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
import gc
import os
import shutil

def safe_remove_directory(path):
    try:
        if os.path.exists(path) and os.path.isdir(path):
            shutil.rmtree(path)
            return True
        else:
            print(f"Path {path} is not a valid directory")
            return False
    except Exception as e:
        print(f"Failed to remove directory {path}: {e}")
        return False
pass

# This tokenizer will be used by the mapping function
tokenizer = None
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

# --- Load 4-bit Model and Train ---
print("Loading 4-bit Mxfp4 gpt-oss model for training...")
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
    "unsloth/gpt-oss-20b", max_seq_length=max_seq_length, load_in_4bit=True
)

dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train[:50]").map(
    formatting_prompts_func, batched=True
)

model = FastLanguageModel.get_peft_model(
    model, r=8, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16, use_gradient_checkpointing="unsloth", random_state=3407,
)

trainer = SFTTrainer(
    model=model, tokenizer=tokenizer, train_dataset=dataset,
    args=SFTConfig(
        per_device_train_batch_size=1, gradient_accumulation_steps=4, max_steps=10,
        learning_rate=2e-4, output_dir="outputs", report_to="none",
    ),
)

print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning complete.")

# --- Merge and Save ---
print("\n💾 Merging and saving the 16-bit model to './gpt-oss-finetuned-merged'...")
model.save_pretrained_merged(save_directory="./gpt-oss-finetuned-merged", tokenizer=tokenizer)
print("✅ Model merged and saved.")

# --- Cleanup ---
print("\n🧹 Cleaning up training artifacts...")
del model, trainer, tokenizer, dataset
torch.cuda.empty_cache()
gc.collect()

safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache") # Clean up the cache created by this process
print("✅ Cleanup complete. Exiting training script.")

```

## /tests/saving/language_models/test_merge_4bit_validation.py

```py path="/tests/saving/language_models/test_merge_4bit_validation.py" 
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset
import torch
import sys
from pathlib import Path

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model and Initial Training")
print(f"{'='*80}")

if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B-Instruct",
    max_seq_length=2048,
    dtype=compute_dtype,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    attn_implementation=attn_implementation
)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

# Load small dataset for quick training
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train[:100]")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)

print("✅ Base model loaded successfully!")

print(f"\n{'='*80}")
print("🔍 PHASE 2: First Fine-tuning")
print(f"{'='*80}")

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        max_steps=10,  # Very short training for test
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=5,
        optim="adamw_8bit",
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

trainer_stats = trainer.train()
print("✅ First fine-tuning completed!")

print(f"\n{'='*80}")
print("🔍 PHASE 3: Save with Forced 4bit Merge")
print(f"{'='*80}")

model.save_pretrained_merged(
    save_directory='./test_4bit_model',
    tokenizer=tokenizer,
    save_method="forced_merged_4bit"
)

print("✅ Model saved with forced 4bit merge!")

print(f"\n{'='*80}")
print("🔍 PHASE 4: Loading 4bit Model and Second Fine-tuning")
print(f"{'='*80}")

# Clean up first model
del model
del tokenizer
torch.cuda.empty_cache()

# Load the 4bit merged model
model_4bit, tokenizer_4bit = FastLanguageModel.from_pretrained(
    model_name="./test_4bit_model",
    max_seq_length=2048,
    load_in_4bit=True,
    load_in_8bit=False,
)

tokenizer_4bit = get_chat_template(
    tokenizer_4bit,
    chat_template="llama-3.1",
)

print("✅ 4bit model loaded successfully!")

# Add LoRA adapters to the 4bit model
model_4bit = FastLanguageModel.get_peft_model(
    model_4bit,
    r=16,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

# Second fine-tuning
trainer_4bit = SFTTrainer(
    model=model_4bit,
    tokenizer=tokenizer_4bit,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer_4bit),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        max_steps=10,  # Very short training for test
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=5,
        optim="adamw_8bit",
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs_4bit",
        report_to="none",
    ),
)

trainer_4bit.train()
print("✅ Second fine-tuning on 4bit model completed!")

print(f"\n{'='*80}")
print("🔍 PHASE 5: Testing TypeError on Regular Merge (Should Fail)")
print(f"{'='*80}")

try:
    model_4bit.save_pretrained_merged(
        save_directory='./test_should_fail',
        tokenizer=tokenizer_4bit
        # No save_method specified, should default to regular merge
    )
    assert False, "Expected TypeError but merge succeeded!"
except TypeError as e:
    expected_error = "Base model should be a 16bits or mxfp4 base model for a 16bit model merge. Use `save_method=forced_merged_4bit` instead"
    assert expected_error in str(e), f"Unexpected error message: {str(e)}"
    print("✅ Correct TypeError raised for 4bit base model regular merge attempt!")
    print(f"Error message: {str(e)}")

print(f"\n{'='*80}")
print("🔍 PHASE 6: Successful Save with Forced 4bit Method")
print(f"{'='*80}")

try:
    model_4bit.save_pretrained_merged(
        save_directory='./test_4bit_second',
        tokenizer=tokenizer_4bit,
        save_method="forced_merged_4bit"
    )
    print("✅ Successfully saved 4bit model with forced 4bit method!")
except Exception as e:
    assert False, f"Phase 6 failed unexpectedly: {e}"

print(f"\n{'='*80}")
print("🔍 CLEANUP")
print(f"{'='*80}")

# Cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./outputs_4bit")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./test_4bit_model")
safe_remove_directory("./test_4bit_second")
safe_remove_directory("./test_should_fail")

print("✅ All tests passed successfully!")

```

## /tests/saving/language_models/test_merge_model_perplexity_llama-3.2.py

```py path="/tests/saving/language_models/test_merge_model_perplexity_llama-3.2.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc

# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))


from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison

# Define helper functions outside of main
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
    """Load model and compute perplexity in subprocess"""
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    from tests.utils.perplexity_eval import ppl_model

    # Load model
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # Set up tokenizer
    merged_tokenizer = get_chat_template(
        merged_tokenizer,
        chat_template="llama-3.1",
    )

    # Load dataset fresh in subprocess
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    # Format the dataset
    def formatting_prompts_func(examples):
        convos = examples["messages"]
        texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
        return {"text": texts}

    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    # Compute perplexity using the passed dataset
    ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)


    # IMPORTANT: Convert to Python float if it's a tensor
    if torch.is_tensor(ppl_value):
        ppl_value = ppl_value.cpu().item()  # Move to CPU and convert to Python scalar
    elif hasattr(ppl_value, 'item'):
        ppl_value = ppl_value.item()  # Convert numpy or other array types
    else:
        ppl_value = float(ppl_value)  # Ensure it's a float

    # Return only the perplexity value
    result_queue.put(ppl_value)

    # Clean up
    del merged_model
    del merged_tokenizer
    del dataset_ppl
    torch.cuda.empty_cache()
    gc.collect()

# Main execution code should be wrapped in this guard
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    if torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-Instruct",
        max_seq_length=2048,
        dtype=compute_dtype,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        attn_implementation=attn_implementation
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="llama-3.1",
    )

    from unsloth.chat_templates import standardize_sharegpt
    dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset_train,
        dataset_text_field="text",
        max_seq_length=2048,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            max_steps=10,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            report_to="none",
        ),
    )

    from unsloth.chat_templates import train_on_responses_only
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    )

    # run training
    trainer_stats = trainer.train()

    add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))

    # saving and merging the model to local disk
    print("merge and save to local disk")
    model.save_pretrained_merged(
        save_directory='./unsloth_out/merged_llama_text_model',
        tokenizer=tokenizer
    )

    # print("cleaning")
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    # gc.collect()

    # load model from local disk and test
    print("Loading merged model in 4 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
    )

    add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))


    print("Computing 8-bit model perplexity in subprocess...")
    result_queue = mp.Queue()
    p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
    p.start()
    p.join()

    ppl_8bit = result_queue.get()
    add_to_comparison("merged model loaded 8bits", ppl_8bit)

    print("Loading merged model in 16 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
    )

    add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))

    print_model_comparison()

    # final cleanup
    safe_remove_directory("./outputs")
    safe_remove_directory("./unsloth_compiled_cache")
    safe_remove_directory("./unsloth_out")

```

## /tests/saving/language_models/test_merge_model_perplexity_mistral.py

```py path="/tests/saving/language_models/test_merge_model_perplexity_mistral.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc

# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison






def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
    """Load model and compute perplexity in subprocess"""
    from unsloth import FastLanguageModel
    from tests.utils.perplexity_eval import ppl_model

    # Load model
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_mistral_text_model",
        max_seq_length=2048,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # Set up tokenizer
    # merged_tokenizer = get_chat_template(
    #     merged_tokenizer,
    #     chat_template="llama-3.1",
    # )

    # Load dataset fresh in subprocess
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    EOS_TOKEN = merged_tokenizer.eos_token

    def formatting_prompts_func(examples):
        instructions = []
        inputs = []
        outputs = []
        texts = []

        for conversation in examples["messages"]:
            # Extract user message and assistant response
            user_message = ""
            assistant_message = ""

            for turn in conversation:
                if turn["role"] == "user":
                    user_message = turn["content"]
                elif turn["role"] == "assistant":
                    assistant_message = turn["content"]

            # Store intermediate format
            instruction = "Complete the statement"
            instructions.append(instruction)
            inputs.append(user_message)
            outputs.append(assistant_message)

            # Create formatted text
            text = alpaca_prompt.format(instruction, user_message, assistant_message) + EOS_TOKEN
            texts.append(text)

        return {
            "instruction": instructions,
            "input": inputs,
            "output": outputs,
            "text": texts
        }



    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    # Compute perplexity using the passed dataset
    ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)


    # IMPORTANT: Convert to Python float if it's a tensor
    if torch.is_tensor(ppl_value):
        ppl_value = ppl_value.cpu().item()  # Move to CPU and convert to Python scalar
    elif hasattr(ppl_value, 'item'):
        ppl_value = ppl_value.item()  # Convert numpy or other array types
    else:
        ppl_value = float(ppl_value)  # Ensure it's a float

    # Return only the perplexity value
    result_queue.put(ppl_value)

    # Clean up
    del merged_model
    del merged_tokenizer
    del dataset_ppl
    torch.cuda.empty_cache()
    gc.collect()

# Main execution code should be wrapped in this guard
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    if torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/mistral-7b-v0.3",
        max_seq_length=2048,
        dtype=compute_dtype,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        attn_implementation=attn_implementation
    )


    EOS_TOKEN = tokenizer.eos_token


    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""


    # Define helper functions outside of main
    def formatting_prompts_func(examples):
        instructions = []
        inputs = []
        outputs = []
        texts = []

        for conversation in examples["messages"]:
            # Extract user message and assistant response
            user_message = ""
            assistant_message = ""

            for turn in conversation:
                if turn["role"] == "user":
                    user_message = turn["content"]
                elif turn["role"] == "assistant":
                    assistant_message = turn["content"]

            # Store intermediate format
            instruction = "Complete the statement"
            instructions.append(instruction)
            inputs.append(user_message)
            outputs.append(assistant_message)

            # Create formatted text
            text = alpaca_prompt.format(instruction, user_message, assistant_message) + EOS_TOKEN
            texts.append(text)


        return {
            "instruction": instructions,
            "input": inputs,
            "output": outputs,
            "text": texts
        }



    dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset_train,
        dataset_text_field="text",
        max_seq_length=2048,
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            max_steps=200,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            report_to="none",
        ),
    )

    # run training
    trainer_stats = trainer.train()

    add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))

    # saving and merging the model to local disk
    print("merge and save to local disk")
    model.save_pretrained_merged(
        save_directory='./unsloth_out/merged_mistral_text_model',
        tokenizer=tokenizer
    )

    # print("cleaning")
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    # gc.collect()

    # load model from local disk and test
    print("Loading merged model in 4 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_mistral_text_model",
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
    )

    add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))


    print("Computing 8-bit model perplexity in subprocess...")
    result_queue = mp.Queue()
    p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
    p.start()
    p.join()

    ppl_8bit = result_queue.get()
    add_to_comparison("merged model loaded 8bits", ppl_8bit)

    print("Loading merged model in 16 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_mistral_text_model",
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
    )

    add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))

    print_model_comparison()

    safe_remove_directory("./outputs")
    safe_remove_directory("./unsloth_compiled_cache")
    safe_remove_directory("./unsloth_out")

```

## /tests/saving/language_models/test_merge_model_perplexity_phi_4.py

```py path="/tests/saving/language_models/test_merge_model_perplexity_phi_4.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc

# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))


from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison

# Define helper functions outside of main
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize = False, add_generation_prompt = False
        )
        for convo in convos
    ]
    return { "text" : texts, }

def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
    """Load model and compute perplexity in subprocess"""
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    from tests.utils.perplexity_eval import ppl_model

    # Load model
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_phi4_text_model",
        max_seq_length=2048,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # Set up tokenizer
    merged_tokenizer = get_chat_template(
        merged_tokenizer,
        chat_template="phi-4",
    )

    # Load dataset fresh in subprocess
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    # Format the dataset
    def formatting_prompts_func(examples):
        convos = examples["messages"]
        texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
        return {"text": texts}

    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    # Compute perplexity using the passed dataset
    ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)


    # IMPORTANT: Convert to Python float if it's a tensor
    if torch.is_tensor(ppl_value):
        ppl_value = ppl_value.cpu().item()  # Move to CPU and convert to Python scalar
    elif hasattr(ppl_value, 'item'):
        ppl_value = ppl_value.item()  # Convert numpy or other array types
    else:
        ppl_value = float(ppl_value)  # Ensure it's a float

    # Return only the perplexity value
    result_queue.put(ppl_value)

    # Clean up
    del merged_model
    del merged_tokenizer
    del dataset_ppl
    torch.cuda.empty_cache()
    gc.collect()

# Main execution code should be wrapped in this guard
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    if torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Phi-4",
        max_seq_length=2048,
        dtype=compute_dtype,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        attn_implementation=attn_implementation
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="phi-4",
    )

    dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset_train,
        dataset_text_field="text",
        max_seq_length=2048,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            max_steps=200,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            report_to="none",
        ),
    )

    from unsloth.chat_templates import train_on_responses_only
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|im_start|>user<|im_sep|>\n\n",
        response_part="<|im_start|>assistant<|im_sep|>\n\n",
    )

    # run training
    trainer_stats = trainer.train()

    add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))

    # saving and merging the model to local disk
    print("merge and save to local disk")
    model.save_pretrained_merged(
        save_directory='./unsloth_out/merged_phi4_text_model',
        tokenizer=tokenizer
    )

    # print("cleaning")
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    # gc.collect()

    # load model from local disk and test
    print("Loading merged model in 4 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_phi4_text_model",
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
    )

    add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))


    print("Computing 8-bit model perplexity in subprocess...")
    result_queue = mp.Queue()
    p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
    p.start()
    p.join()

    ppl_8bit = result_queue.get()
    add_to_comparison("merged model loaded 8bits", ppl_8bit)

    print("Loading merged model in 16 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_phi4_text_model",
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
    )

    add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))

    print_model_comparison()

    # final cleanup
    safe_remove_directory("./outputs")
    safe_remove_directory("./unsloth_compiled_cache")
    safe_remove_directory("./unsloth_out")

```

## /tests/saving/language_models/test_merged_model_perplexity_llama-3.1-8b.py

```py path="/tests/saving/language_models/test_merged_model_perplexity_llama-3.1-8b.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc

# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison

# Define helper functions outside of main
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}

def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
    """Load model and compute perplexity in subprocess"""
    from unsloth import FastLanguageModel
    from unsloth.chat_templates import get_chat_template
    from tests.utils.perplexity_eval import ppl_model

    # Load model
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # Set up tokenizer
    merged_tokenizer = get_chat_template(
        merged_tokenizer,
        chat_template="llama-3.1",
    )

    # Load dataset fresh in subprocess
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    # Format the dataset
    def formatting_prompts_func(examples):
        convos = examples["messages"]
        texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
        return {"text": texts}

    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    # Compute perplexity using the passed dataset
    ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)


    # IMPORTANT: Convert to Python float if it's a tensor
    if torch.is_tensor(ppl_value):
        ppl_value = ppl_value.cpu().item()  # Move to CPU and convert to Python scalar
    elif hasattr(ppl_value, 'item'):
        ppl_value = ppl_value.item()  # Convert numpy or other array types
    else:
        ppl_value = float(ppl_value)  # Ensure it's a float

    # Return only the perplexity value
    result_queue.put(ppl_value)

    # Clean up
    del merged_model
    del merged_tokenizer
    del dataset_ppl
    torch.cuda.empty_cache()
    gc.collect()

# Main execution code should be wrapped in this guard
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    if torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.1-8B-Instruct",
        max_seq_length=2048,
        dtype=compute_dtype,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        attn_implementation=attn_implementation
    )

    tokenizer = get_chat_template(
        tokenizer,
        chat_template="llama-3.1",
    )

    from unsloth.chat_templates import standardize_sharegpt
    dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)


    print("\n dataset sample [0]")
    print(dataset_train[0])

    add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset_train,
        dataset_text_field="text",
        max_seq_length=2048,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            max_steps=200,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            report_to="none",
        ),
    )

    from unsloth.chat_templates import train_on_responses_only
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    )

    tokenizer.decode(trainer.train_dataset[0]["input_ids"])

    # run training
    trainer_stats = trainer.train()

    add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))

    # saving and merging the model to local disk
    print("merge and save to local disk")
    model.save_pretrained_merged(
        save_directory='./unsloth_out/merged_llama_text_model',
        tokenizer=tokenizer
    )

    # print("cleaning")
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    # gc.collect()

    # load model from local disk and test
    print("Loading merged model in 4 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
    )

    add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))


    print("Computing 8-bit model perplexity in subprocess...")
    result_queue = mp.Queue()
    p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
    p.start()
    p.join()

    ppl_8bit = result_queue.get()
    add_to_comparison("merged model loaded 8bits", ppl_8bit)

    print("Loading merged model in 16 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_llama_text_model",
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
    )

    add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))

    print_model_comparison()

    # final cleanup
    safe_remove_directory("./outputs")
    safe_remove_directory("./unsloth_compiled_cache")
    safe_remove_directory("./unsloth_out")

```

## /tests/saving/language_models/test_merged_model_perplexity_qwen_2.5.py

```py path="/tests/saving/language_models/test_merged_model_perplexity_qwen_2.5.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc

# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison


alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# Define helper functions outside of main
def formatting_prompts_func(examples):
    instructions = []
    inputs = []
    outputs = []
    texts = []

    for conversation in examples["messages"]:
        # Extract user message and assistant response
        user_message = ""
        assistant_message = ""

        for turn in conversation:
            if turn["role"] == "user":
                user_message = turn["content"]
            elif turn["role"] == "assistant":
                assistant_message = turn["content"]

        # Store intermediate format
        instruction = "Complete the statement"
        instructions.append(instruction)
        inputs.append(user_message)
        outputs.append(assistant_message)

        # Create formatted text
        text = alpaca_prompt.format(instruction, user_message, assistant_message)
        texts.append(text)

    return {
        "instruction": instructions,
        "input": inputs,
        "output": outputs,
        "text": texts
    }


def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
    """Load model and compute perplexity in subprocess"""
    from unsloth import FastLanguageModel
    from tests.utils.perplexity_eval import ppl_model

    # Load model
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_qwen_text_model",
        max_seq_length=2048,
        load_in_4bit=load_in_4bit,
        load_in_8bit=load_in_8bit,
    )
    # Set up tokenizer
    # merged_tokenizer = get_chat_template(
    #     merged_tokenizer,
    #     chat_template="llama-3.1",
    # )

    # Load dataset fresh in subprocess
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

    ### Instruction:
    {}

    ### Input:
    {}

    ### Response:
    {}"""

    def formatting_prompts_func(examples):
        instructions = []
        inputs = []
        outputs = []
        texts = []

        for conversation in examples["messages"]:
            # Extract user message and assistant response
            user_message = ""
            assistant_message = ""

            for turn in conversation:
                if turn["role"] == "user":
                    user_message = turn["content"]
                elif turn["role"] == "assistant":
                    assistant_message = turn["content"]

            # Store intermediate format
            instruction = "Complete the statement"
            instructions.append(instruction)
            inputs.append(user_message)
            outputs.append(assistant_message)

            # Create formatted text
            text = alpaca_prompt.format(instruction, user_message, assistant_message)
            texts.append(text)

        return {
            "instruction": instructions,
            "input": inputs,
            "output": outputs,
            "text": texts
        }



    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    # Compute perplexity using the passed dataset
    ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)


    # IMPORTANT: Convert to Python float if it's a tensor
    if torch.is_tensor(ppl_value):
        ppl_value = ppl_value.cpu().item()  # Move to CPU and convert to Python scalar
    elif hasattr(ppl_value, 'item'):
        ppl_value = ppl_value.item()  # Convert numpy or other array types
    else:
        ppl_value = float(ppl_value)  # Ensure it's a float

    # Return only the perplexity value
    result_queue.put(ppl_value)

    # Clean up
    # del merged_model
    # del merged_tokenizer
    # del dataset_ppl
    # torch.cuda.empty_cache()
    # gc.collect()

# Main execution code should be wrapped in this guard
if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)

    if torch.cuda.is_bf16_supported():
        compute_dtype = torch.bfloat16
        attn_implementation = 'flash_attention_2'
    else:
        compute_dtype = torch.float16
        attn_implementation = 'sdpa'

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen2.5-7B-Instruct",
        max_seq_length=2048,
        dtype=compute_dtype,
        load_in_4bit=True,
        load_in_8bit=False,
        full_finetuning=False,
        attn_implementation=attn_implementation
    )


    dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
    dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

    dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
    dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

    add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
        use_rslora=False,
        loftq_config=None,
    )

    from unsloth import is_bfloat16_supported

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset_train,
        dataset_text_field="text",
        max_seq_length=2048,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
        dataset_num_proc=2,
        packing=False,
        args=TrainingArguments(
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.1,
            max_steps=200,
            learning_rate=2e-4,
            fp16=not is_bfloat16_supported(),
            bf16=is_bfloat16_supported(),
            logging_steps=50,
            optim="adamw_8bit",
            lr_scheduler_type="linear",
            seed=3407,
            output_dir="outputs",
            report_to="none",
        ),
    )

    # run training
    trainer_stats = trainer.train()

    add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))

    # saving and merging the model to local disk
    print("merge and save to local disk")
    model.save_pretrained_merged(
        save_directory='./unsloth_out/merged_qwen_text_model',
        tokenizer=tokenizer
    )

    # print("cleaning")
    # del model
    # del tokenizer
    # torch.cuda.empty_cache()
    # gc.collect()

    # load model from local disk and test
    print("Loading merged model in 4 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_qwen_text_model",
        max_seq_length=2048,
        load_in_4bit=True,
        load_in_8bit=False,
    )

    add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))


    print("Computing 8-bit model perplexity in subprocess...")
    result_queue = mp.Queue()
    p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
    p.start()
    p.join()

    ppl_8bit = result_queue.get()
    add_to_comparison("merged model loaded 8bits", ppl_8bit)

    print("Loading merged model in 16 bit for perplexity test")
    merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
        model_name="./unsloth_out/merged_qwen_text_model",
        max_seq_length=2048,
        load_in_4bit=False,
        load_in_8bit=False,
    )

    add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))

    print_model_comparison()

    safe_remove_directory("./outputs")
    safe_remove_directory("./unsloth_compiled_cache")
    safe_remove_directory("./unsloth_out")

```

## /tests/saving/language_models/test_push_to_hub_merged.py

```py path="/tests/saving/language_models/test_push_to_hub_merged.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
import os
from huggingface_hub import HfFileSystem, hf_hub_download
# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison


# Define helper functions outside of main
def formatting_prompts_func(examples):
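    # Note: relies on the module-level `tokenizer` defined further below; the dataset
    # .map() calls only invoke this after the tokenizer exists.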
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}



if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=2048,
    dtype=compute_dtype,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    attn_implementation=attn_implementation
)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        max_steps=30,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        optim="adamw_8bit",
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# run training
trainer_stats = trainer.train()


# saving and merging the model to local disk
hf_username = os.environ.get("HF_USER", "")
if not hf_username:
    hf_username = input("Please enter your Hugging Face username: ").strip()
    os.environ["HF_USER"] = hf_username

hf_token = os.environ.get("HF_TOKEN", "")
if not hf_token:
    hf_token = input("Please enter your Hugging Face token: ").strip()
    os.environ["HF_TOKEN"] = hf_token


repo_name = f"{hf_username}/merged_llama_text_model"
success = {
    "upload": False,
    "download": False,
}

# Stage 1: Upload model to Hub
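# push_to_hub_merged merges the trained LoRA adapters into the base weights and uploads
# the merged checkpoint (safetensors) to the Hub repo.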
try:
    print("\n" + "=" * 80)
    print("=== UPLOADING MODEL TO HUB ===".center(80))
    print("=" * 80 + "\n")
    model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token)
    success["upload"] = True
    print("✅ Model uploaded successfully!")
except Exception as e:
    print(f"❌ Failed to upload model: {e}")
    raise Exception("Model upload failed.")

# Stage 2: Test downloading the model (even if cached)
safe_remove_directory(f"./{hf_username}")

try:
    print("\n" + "=" * 80)
    print("=== TESTING MODEL DOWNLOAD ===".center(80))
    print("=" * 80 + "\n")
    # Force download even if cached
    model,tokenizer = FastLanguageModel.from_pretrained(f"{hf_username}/merged_llama_text_model")
    success["download"] = True
    print("✅ Model downloaded successfully!")
except Exception as e:
    print(f"❌ Download failed: {e}")
    raise Exception("Model download failed.")

# Final report
print("\n" + "=" * 80)
print("=== VALIDATION REPORT ===".center(80))
print("=" * 80 + "\n")
for stage, passed in success.items():
    status = "✓" if passed else "✗"
    print(f"{status} {stage.replace('_', ' ').title()}")
print("\n" + "=" * 80)

if all(success.values()):
    print("\n🎉 All stages completed successfully!")
else:
    raise Exception("Validation failed for one or more stages.")

# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")

```

## /tests/saving/language_models/test_push_to_hub_merged_sharded_index_file.py

```py path="/tests/saving/language_models/test_push_to_hub_merged_sharded_index_file.py" 
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
import os
from huggingface_hub import HfFileSystem, hf_hub_download
# ruff: noqa
import sys
from pathlib import Path


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison


# Define helper functions outside of main
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
    return {"text": texts}



if torch.cuda.is_bf16_supported():
    compute_dtype = torch.bfloat16
    attn_implementation = 'flash_attention_2'
else:
    compute_dtype = torch.float16
    attn_implementation = 'sdpa'

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.1-8B-Instruct",
    max_seq_length=2048,
    dtype=compute_dtype,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    attn_implementation=attn_implementation
)

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.1",
)

from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")

dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)

add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))

model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset_train,
    dataset_text_field="text",
    max_seq_length=2048,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_ratio=0.1,
        max_steps=30,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=50,
        optim="adamw_8bit",
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# run training
trainer_stats = trainer.train()


# saving and merging the model to local disk
hf_username = os.environ.get("HF_USER", "")
if not hf_username:
    hf_username = input("Please enter your Hugging Face username: ").strip()
    os.environ["HF_USER"] = hf_username

hf_token = os.environ.get("HF_TOKEN", "")
if not hf_token:
    hf_token = input("Please enter your Hugging Face token: ").strip()
    os.environ["HF_TOKEN"] = hf_token


repo_name = f"{hf_username}/merged_llama_text_model"
success = {
    "upload": False,
    "safetensors_check": False,
    "download": False,
}

# Stage 1: Upload model to Hub
try:
    print("\n" + "=" * 80)
    print("=== UPLOADING MODEL TO HUB ===".center(80))
    print("=" * 80 + "\n")
    model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token)
    success["upload"] = True
    print("✅ Model uploaded successfully!")
except Exception as e:
    print(f"❌ Failed to upload model: {e}")
    raise Exception("Model upload failed.")

# Stage 2: Verify safetensors.index.json exists
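# A sharded checkpoint consists of several *.safetensors files plus a
# model.safetensors.index.json that maps every tensor to its shard; its presence
# confirms the merged model was uploaded in sharded form.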
try:
    print("\n" + "=" * 80)
    print("=== VERIFYING REPO CONTENTS ===".center(80))
    print("=" * 80 + "\n")
    fs = HfFileSystem(token=hf_token)
    file_list = fs.ls(repo_name, detail=True)
    safetensors_found = any(
        file["name"].endswith("model.safetensors.index.json") for file in file_list
    )
    if safetensors_found:
        success["safetensors_check"] = True
        print("✅ model.safetensors.index.json found in repo!")
    else:
        raise Exception("model.safetensors.index.json not found in repo.")
except Exception as e:
    print(f"❌ Verification failed: {e}")
    raise Exception("Repo verification failed.")

# Stage 3: Test downloading the model (even if cached)
safe_remove_directory("./RTannous")

try:
    print("\n" + "=" * 80)
    print("=== TESTING MODEL DOWNLOAD ===".center(80))
    print("=" * 80 + "\n")
    # Force download even if cached
    model,tokenizer = FastLanguageModel.from_pretrained(f"{hf_username}/merged_llama_text_model")
    success["download"] = True
    print("✅ Model downloaded successfully!")
except Exception as e:
    print(f"❌ Download failed: {e}")
    raise Exception("Model download failed.")

# Final report
print("\n" + "=" * 80)
print("=== VALIDATION REPORT ===".center(80))
print("=" * 80 + "\n")
for stage, passed in success.items():
    status = "✓" if passed else "✗"
    print(f"{status} {stage.replace('_', ' ').title()}")
print("\n" + "=" * 80)

if all(success.values()):
    print("\n🎉 All stages completed successfully!")
else:
    raise Exception("Validation failed for one or more stages.")

# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")

```

## /tests/saving/language_models/test_save_merged_grpo_model.py

```py path="/tests/saving/language_models/test_save_merged_grpo_model.py" 
# -*- coding: utf-8 -*-
"""test_Llama3_1_(3B)_GRPO_LoRA (1).ipynb

### Unsloth

"""

from unsloth import FastLanguageModel
import torch
import sys
from pathlib import Path
import multiprocessing as mp
import gc
from multiprocessing import Queue

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.aime_eval import evaluate_model_aime, compare_aime_results


max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower


def evaluate_merged_model(result_queue, load_in_4bit=False, load_in_8bit=False):
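    # Reloads the merged checkpoint at the requested precision and runs the AIME eval.
    # Executed in a subprocess so the vLLM / GPU memory is fully released when it exits.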
    from unsloth import FastLanguageModel
    from tests.utils.aime_eval import evaluate_model_aime
    max_seq_length = 2048 # Can increase for longer reasoning traces
    lora_rank = 64 # Larger rank = smarter, but slower

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "./final_merged_model",
        max_seq_length = max_seq_length,
        load_in_4bit = load_in_4bit, # forward the requested precision instead of hardcoding 4 bit
        load_in_8bit = load_in_8bit,
        fast_inference = True, # Enable vLLM fast inference
        max_lora_rank = lora_rank,
        gpu_memory_utilization = 0.8, # Reduce if out of memory
    )

    print(f"\n{'='*60}")
    if load_in_4bit:
        print("🔍 EVALUATION Merged model: 4 bits load")
        model_type="merged_model_4bits"
    elif load_in_8bit:
        print("🔍 EVALUATION Merged model: 8 bits load")
        model_type="merged_model_8bits"
    else:
        print("🔍 EVALUATION Merged model: 16 bits load")
        model_type="merged_model_16bits"
    print(f"{'='*60}")

    results = evaluate_model_aime(
        model=model,
        tokenizer=tokenizer,
        model_type=model_type,
        temperature=0.3,
        n_sampling=8,
        max_tokens=32768,
        top_p=0.95,
        seed=0
    )

    result_queue.put(results)

    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()



# Full training pipeline: SFT on LIMO, then GRPO on GSM8K. Run in a spawned subprocess
# from the __main__ guard below so GPU memory is released when it finishes.
def training_run(result_queue):
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "meta-llama/Llama-3.2-3B-Instruct",
        max_seq_length = max_seq_length,
        load_in_4bit = False, # False for LoRA 16bit
        fast_inference = True, # Enable vLLM fast inference
        max_lora_rank = lora_rank,
        gpu_memory_utilization = 0.8, # Reduce if out of memory
    )

    """### Helper Functions
    <a name="Data"></a>

#### Helper functions - Data Prep
    """

    import re
    import json

    reasoning_start = "<reasoning>"
    reasoning_end = "</reasoning>"
    solution_start = "<answer>"
    solution_end = "</answer>"

    def extract_hash_answer(text):
        """Extract answer from GSM8K format"""
        if "####" not in text:
            return None
        return text.split("####")[1].strip()

    def prepare_gsm8k_dataset(dataset):
        """Format GSM8K dataset for training"""
        reasoning_start = "<reasoning>"
        reasoning_end = "</reasoning>"
        solution_start = "<answer>"
        solution_end = "</answer>"

        system_prompt = (
            f"You are given a problem. Think about the problem and reason step by step. "
            f"Place your thinking process between {reasoning_start} and {reasoning_end}. "
            f"Then, provide your final numerical solution between {solution_start}{solution_end}"
        )

        def format_gsm8k(example):
            return {
                "prompt": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": example["question"]},
                ],
                "answer": extract_hash_answer(example["answer"]),
            }

        return dataset.map(format_gsm8k)

    def prepare_limo_dataset(dataset):
        """Format LIMO dataset for SFT training"""
        if dataset is None:
            return None

        system_prompt = """You are a helpful reasoning assistant. When given a problem, think through it step by step and provide your answer in the following format:

    <reasoning>
    [Your detailed step-by-step reasoning and solution process]
    </reasoning>
    <answer>
    [Your final numerical answer]
    </answer>"""

        def format_limo(example):
            # Create the assistant response
            assistant_response = f"<reasoning>\n{example['solution']}\n</reasoning>\n<answer>\n{example['answer']}\n</answer>"

            # Return a DICTIONARY with the conversation in a field
            return {
                "prompt": [  # ← This is the key change - wrap in a dict
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": example["question"]},
                    {"role": "assistant", "content": assistant_response}
                ]
            }

        return dataset.map(format_limo)

    print("\n✅ Dataset preparation functions defined!")

    """#### Helper functions - Evaluation"""

    def get_max_prompt_length(dataset, tokenizer):
        """Calculate maximum and average prompt length in dataset"""
        print("Analyzing prompt lengths...")

        lengths = dataset.map(
            lambda x: {
                "tokens": tokenizer.apply_chat_template(
                    x["prompt"],
                    add_generation_prompt=True,
                    tokenize=True
                )
            },
            batched=True,
        ).map(lambda x: {"length": len(x["tokens"])})["length"]

        max_length = max(lengths)
        avg_length = sum(lengths) / len(lengths)
        min_length = min(lengths)

        print(f"Prompt lengths - Min: {min_length}, Max: {max_length}, Avg: {avg_length:.1f}")
        return max_length, avg_length

    def extract_unsloth_answer(text, start_tag="<SOLUTION>", end_tag="</SOLUTION>"):
        """Extract answer from Unsloth SOLUTION tags"""
        pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
        matches = re.findall(pattern, text, re.DOTALL)

        if matches:
            answer = matches[-1]  # Get the last match
            answer = re.sub(r"[%$,]", "", answer).strip()
            return answer
        return ""

    def find_number(search_string):
        """Find the last number in a string"""
        numbers = re.compile(
            r"-?[\d,]*\.?\d+",
            re.MULTILINE | re.DOTALL | re.IGNORECASE,
        ).findall(search_string)

        if numbers:
            return numbers[-1].replace(",", "").strip()
        return ""

    def remove_symbols(x: str) -> str:
        """Remove commas, percent and dollar symbols"""
        if not x:
            return ""
        return x.replace(",", "").replace("%", "").replace("{{contextString}}quot;, "").strip()

    def get_num_tokens(text, tokenizer_instance):
        """Count tokens in text"""
        if not text:
            return 0
        encoding = tokenizer_instance(text, return_tensors="pt")
        return len(encoding["input_ids"][0])

    def check_format_compliance(text, format_type="unsloth"):
        """Check if response follows expected format"""
        if format_type == "unsloth":
            reasoning_start = "<start_reasoning>"
            reasoning_end = "<end_reasoning>"
            solution_start = "<SOLUTION>"
            solution_end = "</SOLUTION>"

            pattern = (
                rf"^[\s]*{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"
                rf"{re.escape(solution_start)}.+?{re.escape(solution_end)}[\s]*{{contextString}}quot;
            )
        else:
            return False

        return bool(re.match(pattern, text.strip(), re.DOTALL))

    def normalize_answer(answer):
        """Normalize answer for comparison"""
        if not answer:
            return ""

        normalized = remove_symbols(str(answer))

        try:
            float_val = float(normalized)
            if float_val.is_integer():
                return str(int(float_val))
            else:
                return str(float_val)
        except (ValueError, TypeError):
            return normalized

    def evaluate_answer_correctness(extracted_answer, ground_truth):
        """Evaluate answer correctness with multiple criteria"""
        if not extracted_answer or not ground_truth:
            return False, False, 0.0

        norm_extracted = normalize_answer(extracted_answer)
        norm_ground_truth = normalize_answer(ground_truth)

        if norm_extracted == norm_ground_truth:
            return True, True, 1.0

        try:
            extracted_num = float(norm_extracted)
            ground_truth_num = float(norm_ground_truth)

            if ground_truth_num != 0:
                relative_error = abs(extracted_num - ground_truth_num) / abs(ground_truth_num)

                if relative_error < 0.01:
                    return True, True, 0.9
                elif relative_error < 0.05:
                    return False, True, 0.7
                elif relative_error < 0.10:
                    return False, True, 0.5
            else:
                if extracted_num == 0:
                    return True, True, 1.0
                elif abs(extracted_num) < 0.01:
                    return False, True, 0.7

        except (ValueError, TypeError):
            if norm_extracted.lower() == norm_ground_truth.lower():
                return True, True, 1.0

        return False, False, 0.0

    """#### Reward Functions for GRPO"""

    def match_format_exactly(completions, **kwargs):
        """Reward function for exact format matching"""
        reasoning_start = "<reasoning>"
        reasoning_end = "</reasoning>"
        solution_start = "<answer>"
        solution_end = "</answer>"

        pattern = (
            rf"^[\s]*{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"
            rf"{re.escape(solution_start)}.+?{re.escape(solution_end)}[\s]*{{contextString}}quot;
        )

        responses = [completion[0]["content"] for completion in completions]
        rewards = [3.0 if re.match(pattern, response, re.DOTALL) else 0.0 for response in responses]
        return rewards

    def match_format_approximately(completions, **kwargs):
        """Reward function for approximate format matching"""
        reasoning_start = "<reasoning>"
        reasoning_end = "</reasoning>"
        solution_start = "<answerr>"
        solution_end = "</answer>"

        scores = []
        for completion in completions:
            score = 0
            response = completion[0]["content"]
            score += 0.5 if response.count(reasoning_start) == 1 else -1.0
            score += 0.5 if response.count(reasoning_end) == 1 else -1.0
            score += 0.5 if response.count(solution_start) == 1 else -1.0
            score += 0.5 if response.count(solution_end) == 1 else -1.0
            scores.append(score)
        return scores

    def check_answer_correctness(prompts, completions, answer, **kwargs):
        """Reward function for answer correctness"""
        def extract_solution_answer(text):
            pattern = r"<answer>(.*?)</answer>"
            match = re.search(pattern, text, re.DOTALL)
            if match:
                return re.sub(r"[%$,]", "", match.group(1)).strip()
            return ""

        responses = [completion[0]["content"] for completion in completions]
        extracted_responses = [extract_solution_answer(r) for r in responses]

        scores = []
        for guess, true_answer in zip(extracted_responses, answer):
            score = 0
            if not guess:
                scores.append(0)
                continue

            if guess == true_answer:
                score += 3.0
            elif guess.strip() == true_answer.strip():
                score += 1.5
            else:
                try:
                    ratio = float(guess) / float(true_answer)
                    if 0.9 <= ratio <= 1.1:
                        score += 1.0
                    elif 0.8 <= ratio <= 1.2:
                        score += 0.5
                    else:
                        score -= 1.5
                except:
                    score -= 1.5
            scores.append(score)
        return scores

    print("✅ Reward functions defined!")

    """#### Main Evaluation Function"""

    import gc



    """#### Comparison and Memory Management"""

    def compare_model_results(all_results):
        """Generate comprehensive comparison of multiple model results"""
        print(f"\n{'='*80}")
        print("COMPREHENSIVE MODEL COMPARISON")
        print(f"{'='*80}")

        # Main table
        print(f"{'Model':<15} {'Format %':<10} {'Exact %':<10} {'Plausible %':<12} {'Confidence':<12}")
        print("-" * 80)

        for result in all_results:
            print(f"{result['model_type']:<15} "
                  f"{result['correct_format_pct']:<10.1f} "
                  f"{result['exact_match_pct']:<10.1f} "
                  f"{result['plausible_match_pct']:<12.1f} "
                  f"{result['avg_confidence']:<12.3f}")

        # Improvement analysis
        if len(all_results) > 1:
            print(f"\n{'='*50}")
            print("IMPROVEMENT ANALYSIS")
            print(f"{'='*50}")

            base_result = all_results[0]
            for result in all_results[1:]:
                print(f"\n{result['model_type']} vs {base_result['model_type']}:")
                format_improvement = result['correct_format_pct'] - base_result['correct_format_pct']
                exact_improvement = result['exact_match_pct'] - base_result['exact_match_pct']
                plausible_improvement = result['plausible_match_pct'] - base_result['plausible_match_pct']

                print(f"  Format compliance: {format_improvement:+.1f}%")
                print(f"  Exact matches:     {exact_improvement:+.1f}%")
                print(f"  Plausible matches: {plausible_improvement:+.1f}%")

        # Save comparison
        comparison_data = {
            "summary": all_results,
            "best_model": max(all_results, key=lambda x: x['exact_match_pct']),
        }

        with open("model_comparison_comprehensive.json", "w") as f:
            json.dump(comparison_data, f, indent=4)

        print(f"\nBest performing model: {comparison_data['best_model']['model_type']} "
              f"({comparison_data['best_model']['exact_match_pct']:.1f}% exact matches)")

    def cleanup_memory():
        """Comprehensive memory cleanup"""
        print("🧹 Cleaning up GPU memory...")
        for _ in range(10):
            torch.cuda.empty_cache()
            gc.collect()

        if torch.cuda.is_available():
            allocated = torch.cuda.memory_allocated() / 1024**3
            reserved = torch.cuda.memory_reserved() / 1024**3
            print(f"GPU memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")

    """#### Data Loading and Preparation"""

    from datasets import load_dataset


    # Load GSM8K
    gsm8k_dataset = load_dataset("openai/gsm8k", "main", split="train")

    # Load LIMO (adjust this based on your access method)
    limo_train = load_dataset("GAIR/LIMO", split="train")

    # Prepare datasets
    gsm8k_train = prepare_gsm8k_dataset(gsm8k_dataset)
    limo_train = prepare_limo_dataset(limo_train)

    print(f"  GSM8K train: {len(gsm8k_train)}")
    print(f"  LIMO train:  {len(limo_train) if limo_train else 0}")

    # Store results
    all_results = []

    # Single temperature evaluation on combined dataset
    results = evaluate_model_aime(
        model=model,
        tokenizer=tokenizer,
        model_type="base",
        temperature=0.3,
        n_sampling=8,
        max_tokens=32768,
        top_p=0.95,
        seed=0
    )
    all_results.append(results)

    from unsloth.chat_templates import get_chat_template

    tokenizer = get_chat_template(
        tokenizer,
        chat_template = "llama-3.1",
    )

    def formatting_prompts_func(examples):
        convos = examples["prompt"]
        texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
        return { "text" : texts, }
    pass

    limo_train = limo_train.map(formatting_prompts_func, batched = True,)

    from trl import SFTTrainer
    from transformers import DataCollatorForSeq2Seq, TrainingArguments
    from unsloth import is_bfloat16_supported


    print(f"\n{'*'*60}")
    print("🎯 STAGE 1: Qlora Fine-Tuning on LIMO")
    print(f"{'*'*60}")

    model = FastLanguageModel.get_peft_model(
        model,
        r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
        target_modules = [
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ], # Remove QKVO if out of memory
        lora_alpha = lora_rank,
        use_gradient_checkpointing = "unsloth", # Enable long context finetuning
        random_state = 3407,
    )


    if limo_train is not None:
        trainer = SFTTrainer(
            model = model,
            tokenizer = tokenizer,
            train_dataset = limo_train,
            dataset_text_field = "text",
            max_seq_length = max_seq_length,
            data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
            dataset_num_proc = 2,
            packing = False, # Can make training 5x faster for short sequences.
            args = TrainingArguments(
                per_device_train_batch_size = 2,
                gradient_accumulation_steps = 4,
                warmup_steps = 5,
                num_train_epochs = 1, # Set this for 1 full training run.
                #max_steps = 60,
                learning_rate = 2e-4,
                fp16 = not is_bfloat16_supported(),
                bf16 = is_bfloat16_supported(),
                logging_steps = 1,
                optim = "adamw_8bit",
                weight_decay = 0.01,
                lr_scheduler_type = "linear",
                seed = 3407,
                output_dir = "outputs",
                report_to = "none", # Use this for WandB etc
            ),
        )

        from unsloth.chat_templates import train_on_responses_only
        trainer = train_on_responses_only(
            trainer,
            instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
            response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
        )

        # Train
        print(f"🚂 Starting SFT training on {len(limo_train)} examples...")
        trainer.train()

        # Save checkpoint
        model.save_pretrained("qlora_checkpoint")
        tokenizer.save_pretrained("qlora_checkpoint")
        print("💾 Qlora checkpoint saved!")

        # Cleanup
        del trainer
        cleanup_memory()

        print("✅ Qlora training completed!")
    else:
        print("⚠️ Skipping Qlora training - no LIMO dataset available")

    # Cleanup
    cleanup_memory()

    global PRINTED_TIMES
    PRINTED_TIMES = 0
    global PRINT_EVERY_STEPS
    PRINT_EVERY_STEPS = 5

    match_numbers = re.compile(
        solution_start + r".*?([\d\.\,]{1,})",
        flags = re.MULTILINE | re.DOTALL
    )

    def check_numbers(prompts, completions, answer, **kwargs):
        question = prompts[0][-1]["content"]
        responses = [completion[0]["content"] for completion in completions]

        extracted_responses = [
            guess.group(1)
            if (guess := match_numbers.search(r)) is not None else None \
            for r in responses
        ]

        scores = []
        # Print only every few steps
        global PRINTED_TIMES
        global PRINT_EVERY_STEPS
        if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
            print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
        PRINTED_TIMES += 1

        for guess, true_answer in zip(extracted_responses, answer):
            if guess is None:
                scores.append(0)
                continue
            # Convert to numbers
            try:
                true_answer = float(true_answer.strip())
                # Remove commas like in 123,456
                guess       = float(guess.strip().replace(",", ""))
                scores.append(1.5 if guess == true_answer else -0.5)
            except:
                scores.append(0)
                continue
        return scores

    print(f"\n{'*'*60}")
    print("🎯 STAGE 2: GRPO Fine-Tuning on GSM8K")
    print(f"{'*'*60}")

    # Get max prompt length
    max_prompt_length, _ = get_max_prompt_length(gsm8k_train, tokenizer)
    max_prompt_length = min(max_prompt_length + 10, 512)  # Add buffer, cap at 512

    print(f"Using max_prompt_length: {max_prompt_length}")

    from trl import GRPOConfig, GRPOTrainer
    training_args = GRPOConfig(
        learning_rate = 5e-6,
        weight_decay = 0.1,
        warmup_ratio = 0.1,
        lr_scheduler_type = "cosine",
        optim = "adamw_torch_fused",
        logging_steps = 1,
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 4, # Increase to 4 for smoother training
        num_generations = 8, # Decrease if out of memory
        max_prompt_length = max_prompt_length,
        max_completion_length = max_seq_length - max_prompt_length,
        # num_train_epochs = 1, # Set to 1 for a full training run
        #max_steps = 250,
        max_steps = 1000,
        save_steps = 250,
        max_grad_norm = 0.1,
        report_to = "none", # Can use Weights & Biases
        output_dir = "outputs",
    )

    trainer = GRPOTrainer(
        model = model,
        processing_class = tokenizer,
        reward_funcs = [
            match_format_exactly,
            match_format_approximately,
            check_answer_correctness,
            check_numbers,
        ],
        args = training_args,
        train_dataset = gsm8k_train,
    )


    # Train
    print(f"🚂 Starting GRPO training on {len(gsm8k_train)} examples...")
    trainer.train()

    # Save checkpoint
    model.save_pretrained("grpo_checkpoint")
    tokenizer.save_pretrained("grpo_checkpoint")
    print("💾 GRPO checkpoint saved!")

    # Cleanup
    del trainer
    del training_args
    cleanup_memory()

    print("✅ GRPO training completed!")

    print(f"\n{'='*60}")
    print("🔍 EVALUATION 3: Final GRPO Model")
    print(f"{'='*60}")

    grpo_results = evaluate_model_aime(
        model=model,
        tokenizer=tokenizer,
        model_type="grpo",
        temperature=0.3,
        n_sampling=8,
        max_tokens=32768,
        top_p=0.95,
        seed=0
    )

    all_results.append(grpo_results)
    print("✅ Final model evaluation complete!")

    print(f"\n{'='*60}")
    print("💾 SAVING FINAL MODEL")
    print(f"{'='*60}")

    # Save as merged model
    try:
        model.save_pretrained_merged("final_merged_model", tokenizer, save_method="merged_16bit")
        print("✅ Merged model saved to: final_merged_model/")
    except Exception as e:
        print(f"⚠️ Could not save merged model: {e}")
        print("Final model saved as LoRA adapter only")

    print("💾 Model saving complete!")


    safe_remove_directory("./unsloth_compiled_cache")

    # Hand the accumulated results (base + grpo) back to the parent process
    result_queue.put(all_results)

    # Clean up
    del model
    del tokenizer
    torch.cuda.empty_cache()
    gc.collect()


    # # Merged model load 16 bits model AIME eval
    # result_queue = mp.Queue()
    # p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, False))
    # p.start()
    # p.join()
    #
    # merged_16bits = result_queue.get()
    # all_results.append(merged_16bits)
    #
    # # Clean up
    # del merged_model
    # del merged_tokenizer
    # del dataset_ppl
    # torch.cuda.empty_cache()
    # gc.collect()
    #
    # safe_remove_directory("./unsloth_compiled_cache")
    #
    # # Merged model load 8 bits model AIME eval
    #
    # result_queue = mp.Queue()
    # p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, True))
    # p.start()
    # p.join()
    #
    # merged_16bits = result_queue.get()
    # all_results.append(merged_16bits)


    # Merged model load 4 bits AIME eval
    # result_queue = mp.Queue()
    # p = mp.Process(target=evaluate_merged_model, args=(result_queue, True, False))
    # p.start()
    # p.join()
    #
    # merged_16bits = result_queue.get()
    # all_results.append(merged_16bits)

if __name__ == "__main__":
    mp.set_start_method('spawn', force=True)
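    # Each stage (training, then every merged-model evaluation) runs in its own spawned
    # process so GPU memory is fully released between stages.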
    result_queue = mp.Queue()
    all_results = []


    # run main finetuning and grpo loop
    p = mp.Process(target=training_run, args=(result_queue,))
    p.start()
    p.join()

    results = result_queue.get()
    all_results = results

    # evaluate merged model loaded 16bits
    p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, False))
    p.start()
    p.join()

    merged_load_16bits = result_queue.get()
    all_results.append(merged_load_16bits)
    safe_remove_directory("./unsloth_compiled_cache")

    # Merged model load 8 bits model AIME eval
    p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, True))
    p.start()
    p.join()

    merged_load_8bits = result_queue.get()
    all_results.append(merged_load_8bits)

    safe_remove_directory("./unsloth_compiled_cache")

    # Merged model load 4 bits model AIME eval
    p = mp.Process(target=evaluate_merged_model, args=(result_queue, True, False))
    p.start()
    p.join()

    merged_load_4bits = result_queue.get()
    all_results.append(merged_load_4bits)

    safe_remove_directory("./unsloth_compiled_cache")


    # AIME-specific comparison function

    print(f"\n{'='*80}")
    print("🏆 FINAL TRAINING PIPELINE RESULTS")
    print(f"{'='*80}")

    # Use the AIME-specific comparison
    compare_aime_results(all_results)

```

## /tests/saving/non_peft/test_mistral_non_peft.py

```py path="/tests/saving/non_peft/test_mistral_non_peft.py" 
from unsloth import FastLanguageModel
from transformers import AutoModelForCausalLM
from peft import PeftModel
from pathlib import Path
import sys
import warnings

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory


print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model")
print(f"{'='*80}")

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b-v0.3",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
)


print("✅ Base model loaded successfully!")

### Attempting save merge



print(f"\n{'='*80}")
print("🔍 PHASE 2: Attempting save_pretrained_merged (Should Warn)")
print(f"{'='*80}")

with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        model.save_pretrained_merged("test_output", tokenizer)

        # Verify warning
        assert len(w) >= 1, "Expected warning but none raised"
        warning_msg = str(w[0].message)
        expected_msg = "Model is not a PeftModel (no Lora adapters detected). Skipping Merge. Please use save_pretrained() or push_to_hub() instead!"
        assert expected_msg in warning_msg, f"Unexpected warning: {warning_msg}"

print("✅ Correct warning detected for non-PeftModel merge attempt!")



print(f"\n{'='*80}")
print("🔍 PHASE 3: Using save_pretrained (Should Succeed)")
print(f"{'='*80}")


try:
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # Treat warnings as errors here
        model.save_pretrained("test_output")
        print("✅ Standard save_pretrained completed successfully!")
except Exception as e:
    assert False, f"Phase 3 failed: {e}"

safe_remove_directory("./test_output")
safe_remove_directory("./unsloth_compiled_cache")

```

## /tests/saving/non_peft/test_whisper_non_peft.py

```py path="/tests/saving/non_peft/test_whisper_non_peft.py" 
from unsloth import FastLanguageModel, FastModel
from transformers import AutoModelForCausalLM, WhisperForConditionalGeneration
from peft import PeftModel
from pathlib import Path
import sys
import warnings

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory


print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model")
print(f"{'='*80}")

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/whisper-large-v3",
    dtype = None, # Leave as None for auto detection
    load_in_4bit = False, # Set to True to do 4bit quantization which reduces memory
    auto_model = WhisperForConditionalGeneration,
    whisper_language = "English",
    whisper_task = "transcribe",
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

print("✅ Base model loaded successfully!")

### Attempting save merge



print(f"\n{'='*80}")
print("🔍 PHASE 2: Attempting save_pretrained_merged (Should Warn)")
print(f"{'='*80}")

with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        model.save_pretrained_merged("test_output", tokenizer)

        # Verify warning
        assert len(w) >= 1, "Expected warning but none raised"
        warning_msg = str(w[0].message)
        expected_msg = "Model is not a PeftModel (no Lora adapters detected). Skipping Merge. Please use save_pretrained() or push_to_hub() instead!"
        assert expected_msg in warning_msg, f"Unexpected warning: {warning_msg}"

print("✅ Correct warning detected for non-PeftModel merge attempt!")



print(f"\n{'='*80}")
print("🔍 PHASE 3: Using save_pretrained (Should Succeed)")
print(f"{'='*80}")


try:
    with warnings.catch_warnings():
        warnings.simplefilter("error")  # Treat warnings as errors here
        model.save_pretrained("test_output")
        print("✅ Standard save_pretrained completed successfully!")
except Exception as e:
    assert False, f"Phase 3 failed: {e}"

safe_remove_directory("./test_output")
safe_remove_directory("./unsloth_compiled_cache")

```

## /tests/saving/test_unsloth_save.py

```py path="/tests/saving/test_unsloth_save.py" 
import json
import os
import shutil
import tempfile
import pytest
import importlib

from unsloth import FastLanguageModel, FastModel

model_to_test = [
    # Text Models
    "unsloth/tinyllama",
    "unsloth/tinyllama-bnb-4bit",
    "unsloth/Qwen2.5-0.5B-Instruct",
    "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit",
    "unsloth/Phi-4-mini-instruct",
    "unsloth/Phi-4-mini-instruct-bnb-4bit",
    "unsloth/Qwen2.5-0.5B",
    # Vision Models
    "unsloth/gemma-3-4b-it",
    "unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
    "unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit"
]

torchao_models = [
    "unsloth/tinyllama",
    "unsloth/Qwen2.5-0.5B-Instruct",
    #"unsloth/Phi-4-mini-instruct",
    #"unsloth/Qwen2.5-0.5B",
    # Skip the -bnb-4bit variants since they're already quantized
]


# Variables
save_file_sizes = {}
save_file_sizes["merged_16bit"] = {}
save_file_sizes["merged_4bit"] = {}
save_file_sizes["torchao"] = {}

tokenizer_files = [
    "tokenizer_config.json",
    "special_tokens_map.json",
]

@pytest.fixture(scope="session", params=model_to_test)
def loaded_model_tokenizer(request):
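    # Session-scoped and parametrized: every model in model_to_test is loaded and
    # LoRA-wrapped once, then shared by all tests in the session.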
    model_name = request.param
    print("Loading model and tokenizer...")

    model, tokenizer = FastModel.from_pretrained(
        model_name, # use small model
        max_seq_length=128,
        dtype=None,
        load_in_4bit=True,
    )

    # Apply LoRA
    model = FastModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_alpha=16,
        use_gradient_checkpointing="unsloth",
    )

    return model, tokenizer

@pytest.fixture(scope="session", params=torchao_models)
def fp16_model_tokenizer(request):
    """Load model in FP16 for TorchAO quantization"""
    model_name = request.param
    print(f"Loading model in FP16 for TorchAO: {model_name}")

    model, tokenizer = FastModel.from_pretrained(
        model_name,
        max_seq_length=128,
        dtype=None,
        load_in_4bit=False,  # No BnB quantization
    )

    # Apply LoRA
    model = FastModel.get_peft_model(
        model,
        r=16,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_alpha=16,
        use_gradient_checkpointing="unsloth",
    )

    return model, tokenizer



@pytest.fixture(scope="session")
def model(loaded_model_tokenizer):
    return loaded_model_tokenizer[0]

@pytest.fixture(scope="session")
def tokenizer(loaded_model_tokenizer):
    return loaded_model_tokenizer[1]

@pytest.fixture
def temp_save_dir():
    dir = tempfile.mkdtemp()
    print(f"Temporary directory created at: {dir}")
    yield dir
    print(f"Temporary directory deleted: {dir}")
    shutil.rmtree(dir)


def delete_quantization_config(model):
    # Since merged, edit quantization_config
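    # A merged 16-bit export should not advertise a bitsandbytes quantization_config,
    # so strip it from the outer config and from every nested sub-model config.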
    old_config = model.config
    new_config = model.config.to_dict()
    if "quantization_config" in new_config:
        del new_config["quantization_config"]
    original_model = model
    new_config = type(model.config).from_dict(new_config)
    while hasattr(original_model, "model"):
        original_model = original_model.model
        original_model.config = new_config
    model.config = new_config

def test_save_merged_16bit(model, tokenizer, temp_save_dir: str):
    save_path = os.path.join(temp_save_dir, "unsloth_merged_16bit", model.config._name_or_path.replace("/", "_"))

    model.save_pretrained_merged(
        save_path,
        tokenizer=tokenizer,
        save_method="merged_16bit"
    )

    # Check model files
    assert os.path.isdir(save_path), f"Directory {save_path} does not exist."
    assert os.path.isfile(os.path.join(save_path, "config.json")), "config.json not found."

    weight_files = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
    assert len(weight_files) > 0, "No weight files found in the save directory."

    # Check tokenizer files
    for file in tokenizer_files:
        assert os.path.isfile(os.path.join(save_path, file)), f"{file} not found in the save directory."

    # Check config to see if it is 16bit by checking for quantization config
    config_path = os.path.join(save_path, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    assert "quantization_config" not in config, "Quantization config not found in the model config."

    # Store the size of the model files
    total_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files)
    save_file_sizes["merged_16bit"][model.config._name_or_path] = total_size
    print(f"Total size of merged_16bit files: {total_size} bytes")

    # Test loading the model from the saved path
    loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
        save_path,
        max_seq_length=128,
        dtype=None,
        load_in_4bit=True,
    )

def test_save_merged_4bit(model, tokenizer, temp_save_dir: str):
    save_path = os.path.join(temp_save_dir, "unsloth_merged_4bit", model.config._name_or_path.replace("/", "_"))

    model.save_pretrained_merged(
        save_path,
        tokenizer=tokenizer,
        save_method="merged_4bit_forced"
    )

    # Check model files
    assert os.path.isdir(save_path), f"Directory {save_path} does not exist."
    assert os.path.isfile(os.path.join(save_path, "config.json")), "config.json not found."

    weight_files = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
    assert len(weight_files) > 0, "No weight files found in the save directory."

    # Check tokenizer files
    for file in tokenizer_files:
        assert os.path.isfile(os.path.join(save_path, file)), f"{file} not found in the save directory."

    # Store the size of the model files
    total_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files)
    save_file_sizes["merged_4bit"][model.config._name_or_path] = total_size

    print(f"Total size of merged_4bit files: {total_size} bytes")

    assert total_size < save_file_sizes["merged_16bit"][model.config._name_or_path], "Merged 4bit files are larger than merged 16bit files."

    # Check config to see if it is 4bit
    config_path = os.path.join(save_path, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    assert "quantization_config" in config, "Quantization config not found in the model config."

    # Test loading the model from the saved path
    loaded_model, loaded_tokenizer = FastModel.from_pretrained(
        save_path,
        max_seq_length=128,
        dtype=None,
        load_in_4bit=True,
    )

@pytest.mark.skipif(importlib.util.find_spec("torchao") is None, reason="requires torchao to be installed")
def test_save_torchao(fp16_model_tokenizer, temp_save_dir: str):
    model, tokenizer = fp16_model_tokenizer
    save_path = os.path.join(temp_save_dir, "unsloth_torchao", model.config._name_or_path.replace("/", "_"))

    from torchao.quantization import Int8DynamicActivationInt8WeightConfig
    torchao_config = Int8DynamicActivationInt8WeightConfig()
    model.save_pretrained_torchao(
        save_path,
        tokenizer=tokenizer,
        torchao_config=torchao_config,
        push_to_hub=False,
    )

    weight_files_16bit = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
    total_16bit_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files_16bit)
    save_file_sizes["merged_16bit"][model.config._name_or_path] = total_16bit_size

    torchao_save_path = save_path + "-torchao"

    # Check model files
    assert os.path.isdir(torchao_save_path), f"Directory {torchao_save_path} does not exist."
    assert os.path.isfile(os.path.join(torchao_save_path, "config.json")), "config.json not found."

    weight_files = [f for f in os.listdir(torchao_save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
    assert len(weight_files) > 0, "No weight files found in the save directory."

    # Check tokenizer files
    for file in tokenizer_files:
        assert os.path.isfile(os.path.join(torchao_save_path, file)), f"{file} not found in the save directory."

    # Store the size of the model files
    total_size = sum(os.path.getsize(os.path.join(torchao_save_path, f)) for f in weight_files)
    save_file_sizes["torchao"][model.config._name_or_path] = total_size

    assert total_size < save_file_sizes["merged_16bit"][model.config._name_or_path], "torchao files are larger than merged 16bit files."

    # Check config to see if it is quantized with torchao
    config_path = os.path.join(torchao_save_path, "config.json")
    with open(config_path, "r") as f:
        config = json.load(f)

    assert "quantization_config" in config, "Quantization config not found in the model config."

    # Test loading the model from the saved path
    # can't set `load_in_4bit` to True because the model is torchao quantized
    # can't quantize again with bitsandbytes
    import torch.serialization
    with torch.serialization.safe_globals([getattr]):
        loaded_model, loaded_tokenizer = FastModel.from_pretrained(
            torchao_save_path,
            max_seq_length=128,
            dtype=None,
            load_in_4bit=False,
        )

@pytest.mark.skipif(importlib.util.find_spec("torchao") is None, reason="requires torchao to be installed")
def test_save_and_inference_torchao(fp16_model_tokenizer, temp_save_dir: str):
    model, tokenizer = fp16_model_tokenizer
    model_name = model.config._name_or_path

    print(f"Testing TorchAO save and inference for: {model_name}")

    save_path = os.path.join(temp_save_dir, "torchao_models", model_name.replace("/", "_"))

    from torchao.quantization import Int8DynamicActivationInt8WeightConfig
    torchao_config = Int8DynamicActivationInt8WeightConfig()

    # Save with TorchAO
    model.save_pretrained_torchao(
        save_path,
        tokenizer=tokenizer,
        torchao_config=torchao_config,
        push_to_hub=False,
    )

    torchao_save_path = save_path + "-torchao"

    # Verify files exist
    assert os.path.isdir(torchao_save_path), f"TorchAO directory {torchao_save_path} does not exist."

    # Load with safe globals
    import torch.serialization
    with torch.serialization.safe_globals([getattr]):
        loaded_model, loaded_tokenizer = FastModel.from_pretrained(
            torchao_save_path,
            max_seq_length=128,
            dtype=None,
            load_in_4bit=False,
        )


    FastModel.for_inference(loaded_model) # Enable native 2x faster inference

    messages = [
        {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
    ]
    inputs = loaded_tokenizer.apply_chat_template(
        messages,
        tokenize = True,
        add_generation_prompt = True, # Must add for generation
        return_tensors = "pt",
    ).to("cuda")

    outputs = loaded_model.generate(  # ← Use loaded_model, not model
        input_ids=inputs,
        max_new_tokens=64,
        use_cache=False,  # Avoid cache issues
        temperature=1.5,
        min_p=0.1,
        do_sample=True,
        pad_token_id=loaded_tokenizer.pad_token_id or loaded_tokenizer.eos_token_id,
    )

    # Decode with the LOADED tokenizer
    generated_text = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
    input_text = loaded_tokenizer.decode(inputs[0], skip_special_tokens=True)
    response_part = generated_text[len(input_text):].strip()

    print(f"Input: {input_text}")
    print(f"Full output: {generated_text}")
    print(f"Response only: {response_part}")

```

## /tests/saving/text_to_speech_models/test_csm.py

```py path="/tests/saving/text_to_speech_models/test_csm.py" 
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package

require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")

import soundfile as sf

print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")


model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/csm-1b",
    max_seq_length= 2048, # Choose any for long context!
    dtype = None, # Leave as None for auto-detection
    auto_model = CsmForConditionalGeneration,
    load_in_4bit = False, # Select True for 4bit - reduces memory usage
)


base_model_class = model.__class__.__name__


model = FastModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

print("✅ Model and LoRA adapters loaded successfully!")



print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")

assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")


print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")

def find_lora_base_model(model_to_inspect):
    current = model_to_inspect
    if hasattr(current, "base_model"):
        current = current.base_model
    if hasattr(current, "model"):
        current = current.model
    return current
pass


config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model

assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))



print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")

with warnings.catch_warnings():
    warnings.simplefilter("error")  # Treat warnings as errors
    try:
        model.save_pretrained_merged("csm", tokenizer)
        print("✅ Model saved and merged successfully without warnings!")
    except Exception as e:
        assert False, f"Model saving/merging failed with exception: {e}"

print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")


model, processor = FastModel.from_pretrained(
    model_name = "./csm",
    max_seq_length= 2048, # Choose any for long context!
    dtype = None, # Leave as None for auto-detection
    auto_model = CsmForConditionalGeneration,
    load_in_4bit = False, # Select True for 4bit - reduces memory usage
)

from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("unsloth/csm-1b")

print("✅ Model loaded for inference successfully!")


print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")


from transformers import pipeline
import torch
output_audio_path = "csm_audio.wav"
try:
    text = "We just finished fine tuning a text to speech model... and it's pretty good!"
    speaker_id = 0
    inputs = processor(f"[{speaker_id}]{text}", add_special_tokens=True).to("cuda")
    audio_values = model.generate(
        **inputs,
        max_new_tokens=125, # 125 tokens is 10 seconds of audio, for longer speech increase this
        # play with these parameters to get the best results
        depth_decoder_temperature=0.6,
        depth_decoder_top_k=0,
        depth_decoder_top_p=0.9,
        temperature=0.8,
        top_k=50,
        top_p=1.0,
        #########################################################
        output_audio=True
    )
    audio = audio_values[0].to(torch.float32).cpu().numpy()
    sf.write("example_without_context.wav", audio, 24000)
    print(f"✅ Audio generated and saved to {output_audio_path}!")
except Exception as e:
    assert False, f"Inference failed with exception: {e}"


## TODO: transcribe the generated audio (e.g. with Whisper) and assert the transcription matches the input text.

print("✅ All sections passed successfully!")


safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./csm")

```

## /tests/saving/text_to_speech_models/test_lasa.py

```py path="/tests/saving/text_to_speech_models/test_lasa.py" 
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests


REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))


from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package

require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")
require_python_package("xcodec2")

import soundfile as sf
from xcodec2.modeling_xcodec2 import XCodec2Model
XCODEC2_MODEL_NAME = "HKUST-Audio/xcodec2"
SAMPLE_RATE = 16000
DEVICE = "cuda"

try:
    codec_model = XCodec2Model.from_pretrained(XCODEC2_MODEL_NAME)

except Exception as e:
    raise f"ERROR loading XCodec2 model: {e}."

codec_model.to('cpu')

print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")

max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llasa-1B",
    max_seq_length = max_seq_length,
    dtype = None, # Select None for auto detection
    load_in_4bit = False, # Choose True for 4bit which reduces memory
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

base_model_class = model.__class__.__name__


model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "v_proj"],
    lora_alpha = 128,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

print("✅ Model and LoRA adapters loaded successfully!")



print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")

assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")


print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")

def find_lora_base_model(model_to_inspect):
    current = model_to_inspect
    if hasattr(current, "base_model"):
        current = current.base_model
    if hasattr(current, "model"):
        current = current.model
    return current
pass


config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model

assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))



print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")

with warnings.catch_warnings():
    warnings.simplefilter("error")  # Treat warnings as errors
    try:
        model.save_pretrained_merged("lasa", tokenizer)
        print("✅ Model saved and merged successfully without warnings!")
    except Exception as e:
        assert False, f"Model saving/merging failed with exception: {e}"

print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "./lasa",
    max_seq_length = max_seq_length,
    dtype = None, # Select None for auto detection
    load_in_4bit = False, # Choose True for 4bit which reduces memory
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

#from transformers import AutoProcessor
#processor = AutoProcessor.from_pretrained("unsloth/csm-1b")

print("✅ Model loaded for inference successfully!")


print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")


from transformers import pipeline
import torch
output_audio_path = "lasa_audio.wav"
input_text = "Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person."

FastLanguageModel.for_inference(model)

def ids_to_speech_tokens(speech_ids):

    speech_tokens_str = []
    for speech_id in speech_ids:
        speech_tokens_str.append(f"<|s_{speech_id}|>")
    return speech_tokens_str

def extract_speech_ids(speech_tokens_str):

    speech_ids = []
    for token_str in speech_tokens_str:
        if token_str.startswith('<|s_') and token_str.endswith('|>'):
            num_str = token_str[4:-2]

            num = int(num_str)
            speech_ids.append(num)
        else:
            print(f"Unexpected token: {token_str}")
    return speech_ids

#TTS start!
with torch.inference_mode():
    with torch.amp.autocast('cuda',dtype=model.dtype):
        formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"

        # Tokenize the text
        chat = [
            {"role": "user", "content": "Convert the text to speech:" + formatted_text},
            {"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
        ]

        input_ids = tokenizer.apply_chat_template(
            chat,
            tokenize=True,
            return_tensors='pt',
            continue_final_message=True
        )
        input_ids = input_ids.to('cuda')

        speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')

        # Generate the speech autoregressively
        outputs = model.generate(
            input_ids,
            max_length=2048,  # We trained our model with a max length of 2048
            eos_token_id= speech_end_id ,
            do_sample=True,
            top_p=1.0,           #  Nucleus sampling cutoff; must be <= 1.0
            temperature=1.2,   #  Controls randomness in output
        )
    # Extract the speech tokens
    generated_ids = outputs[0][input_ids.shape[1]:-1]

    speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Convert  token <|s_23456|> to int 23456
    speech_tokens = extract_speech_ids(speech_tokens)

    speech_tokens = torch.tensor(speech_tokens).cpu().unsqueeze(0).unsqueeze(0)

    # Decode the speech tokens to speech waveform
    gen_wav = codec_model.decode_code(speech_tokens)
try:
    sf.write(output_audio_path, gen_wav[0, 0, :].cpu().numpy(), 16000)
except Exception as e:
    assert False, f"Inference failed with exception: {e}"


## TODO: transcribe the generated audio (e.g. with Whisper) and assert the transcription matches the input text.

print("✅ All sections passed successfully!")


safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./lasa")

```

## /tests/saving/text_to_speech_models/test_orpheus.py

```py path="/tests/saving/text_to_speech_models/test_orpheus.py" 
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests

REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))

from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package

require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")
require_python_package("snac")

import soundfile as sf
from snac import SNAC
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to("cuda")
print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/orpheus-3b-0.1-ft",
    max_seq_length= 2048, # Choose any for long context!
    dtype = None, # Select None for auto detection
    load_in_4bit = False, # Select True for 4bit which reduces memory usage
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

base_model_class = model.__class__.__name__


model = FastLanguageModel.get_peft_model(
    model,
    r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 64,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)
print("✅ Model and LoRA adapters loaded successfully!")



print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")

assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")


print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")

def find_lora_base_model(model_to_inspect):
    current = model_to_inspect
    if hasattr(current, "base_model"):
        current = current.base_model
    if hasattr(current, "model"):
        current = current.model
    return current
pass


config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model

assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))



print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")

with warnings.catch_warnings():
    warnings.simplefilter("error")  # Treat warnings as errors
    try:
        model.save_pretrained_merged("orpheus", tokenizer)
        print("✅ Model saved and merged successfully without warnings!")
    except Exception as e:
        assert False, f"Model saving/merging failed with exception: {e}"

print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/orpheus-3b-0.1-ft",
    max_seq_length= 2048, # Choose any for long context!
    dtype = None, # Select None for auto detection
    load_in_4bit = False, # Select True for 4bit which reduces memory usage
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

#from transformers import AutoProcessor
#processor = AutoProcessor.from_pretrained("unsloth/csm-1b")

print("✅ Model loaded for inference successfully!")


print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")


#@title Run Inference


FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Moving snac_model cuda to cpu
snac_model.to("cpu")
prompts = [
    "Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person.",
]

chosen_voice = None # None for single-speaker

prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]

all_input_ids = []

for prompt in prompts_:
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids
  all_input_ids.append(input_ids)

start_token = torch.tensor([[ 128259]], dtype=torch.int64) # Start of human
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64) # End of text, End of human

all_modified_input_ids = []
for input_ids in all_input_ids:
  modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1) # SOH SOT Text EOT EOH
  all_modified_input_ids.append(modified_input_ids)

all_padded_tensors = []
all_attention_masks = []
max_length = max([modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids])
for modified_input_ids in all_modified_input_ids:
  padding = max_length - modified_input_ids.shape[1]
  padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)
  attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)
  all_padded_tensors.append(padded_tensor)
  all_attention_masks.append(attention_mask)

all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)

input_ids = all_padded_tensors.to("cuda")
attention_mask = all_attention_masks.to("cuda")
generated_ids = model.generate(
      input_ids=input_ids,
      attention_mask=attention_mask,
      max_new_tokens=1200,
      do_sample=True,
      temperature=0.6,
      top_p=0.95,
      repetition_penalty=1.1,
      num_return_sequences=1,
      eos_token_id=128258,
     use_cache = True
  )
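# Post-process the generated ids: token 128257 marks the start of the audio
# code stream (keep only what follows its last occurrence), and token 128258
# is the EOS id passed to generate above, which is stripped from the codes.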
token_to_find = 128257
token_to_remove = 128258

token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
    cropped_tensor = generated_ids

processed_rows = []

for row in cropped_tensor:
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

code_lists = []

for row in processed_rows:
    row_length = row.size(0)
    new_length = (row_length // 7) * 7
    trimmed_row = row[:new_length]
    trimmed_row = [t - 128266 for t in trimmed_row]
    code_lists.append(trimmed_row)


def redistribute_codes(code_list):
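  # SNAC decodes from 3 hierarchical codebooks; Orpheus emits 7 codes per frame,
  # split 1/2/4 across the coarse/medium/fine layers. Each flattened position
  # carries an offset of n*4096, which is undone here before decoding.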
  layer_1 = []
  layer_2 = []
  layer_3 = []
  for i in range((len(code_list)+1)//7):
    layer_1.append(code_list[7*i])
    layer_2.append(code_list[7*i+1]-4096)
    layer_3.append(code_list[7*i+2]-(2*4096))
    layer_3.append(code_list[7*i+3]-(3*4096))
    layer_2.append(code_list[7*i+4]-(4*4096))
    layer_3.append(code_list[7*i+5]-(5*4096))
    layer_3.append(code_list[7*i+6]-(6*4096))
  codes = [torch.tensor(layer_1).unsqueeze(0),
         torch.tensor(layer_2).unsqueeze(0),
         torch.tensor(layer_3).unsqueeze(0)]

  # codes = [c.to("cuda") for c in codes]
  audio_hat = snac_model.decode(codes)
  return audio_hat

my_samples = []
for code_list in code_lists:
  samples = redistribute_codes(code_list)
  my_samples.append(samples)
output_path = "orpheus_audio.wav"
try:
    for i, samples in enumerate(my_samples):
        audio_data = samples.detach().squeeze().cpu().numpy()
        sf.write(output_path, audio_data, 24000)  # Explicitly pass the 24 kHz sample rate
        print(f"✅ Audio saved to {output_path}!")
except Exception as e:
    assert False, f"Inference failed with exception: {e}"

# Verify the file exists
import os
assert os.path.exists(output_path), f"Audio file not found at {output_path}"
print("✅ Audio file exists on disk!")
del my_samples, samples
## TODO: transcribe the generated audio (e.g. with Whisper) and assert the transcription matches the input text.

print("✅ All sections passed successfully!")


safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./orpheus")

```

## /tests/utils/__init__.py

```py path="/tests/utils/__init__.py" 
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import time
from contextlib import contextmanager


@contextmanager
def timer(name):
    start = time.time()
    yield
    end = time.time()
    print(f"{name} took {end - start:.2f} seconds")


@contextmanager
def header_footer_context(title: str, char="-"):
    print()
    print(f"{char}" * 50 + f" {title} " + f"{char}" * 50)
    yield
    print(f"{char}" * (100 + len(title) + 2))
    print()

```

## /tests/utils/perplexity_eval.md

# Language Model Perplexity Evaluator

A Python module for evaluating language models with perplexity metrics, using a sliding-window approach for long sequences. The evaluator computes perplexity scores efficiently across datasets and includes utilities for comparing models.

## Basic Usage

```python
from perplexity_eval import ppl_model, add_to_comparison, print_model_comparison

# Simple perplexity evaluation
dataset = {"text": ["Your text samples here...", "Another text sample..."]}
perplexity = ppl_model(model, tokenizer, dataset)

print(f"Model Perplexity: {perplexity:.4f}")

# Add to comparison tracker
add_to_comparison("My Model", perplexity)
print_model_comparison()
```
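
For reference, below is a minimal sketch of the sliding-window perplexity computation the evaluator performs. It assumes a standard Hugging Face causal LM and tokenizer; the function name `sliding_window_perplexity` and the `max_length`/`stride` defaults are illustrative, and the repo's `ppl_model` may differ in its stride, masking, and batching details.

```python
import torch

def sliding_window_perplexity(model, tokenizer, text, max_length=512, stride=256):
    """Hypothetical sliding-window perplexity sketch (not the repo's ppl_model)."""
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    seq_len = input_ids.size(1)

    nlls, n_scored, prev_end = [], 0, 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        target_len = end - prev_end          # tokens not scored by the previous window
        ids = input_ids[:, begin:end]
        labels = ids.clone()
        labels[:, :-target_len] = -100       # mask the overlapping context tokens
        with torch.no_grad():
            loss = model(ids, labels=labels).loss  # mean NLL over unmasked labels
        nlls.append(loss * target_len)       # convert back to a summed NLL estimate
        n_scored += target_len
        prev_end = end
        if end == seq_len:
            break
    return torch.exp(torch.stack(nlls).sum() / n_scored).item()
```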



## /unsloth/kernels/moe/__init__.py

```py path="/unsloth/kernels/moe/__init__.py" 

```

## /unsloth/kernels/moe/grouped_gemm/__init__.py

```py path="/unsloth/kernels/moe/grouped_gemm/__init__.py" 

```

