```
├── .gitattributes (omitted)
├── .github/
├── FUNDING.yml (200 tokens)
├── ISSUE_TEMPLATE/
├── bug---issue.md (100 tokens)
├── feature-request.md (100 tokens)
├── workflows/
├── stale.yml (300 tokens)
├── .gitignore (700 tokens)
├── CODE_OF_CONDUCT.md (1100 tokens)
├── CONTRIBUTING.md (400 tokens)
├── LICENSE (omitted)
├── README.md (5.6k tokens)
├── images/
├── Assistant.png
├── Colab.png
├── Discord button.png
├── Discord.png
├── Documentation Button.png
├── Free version button.png
├── Kaggle.png
├── Kofi button.png
├── LAION 2GPU.png
├── Merge.png
├── Run.png
├── Slim Orca 2GPUs.png
├── Terminal_Type.png
├── Where_Terminal.png
├── buy me a coffee button.png
├── documentation github button.png
├── documentation green button.png
├── documentation lighter.png
├── documentation white button.png
├── made with unsloth.png
├── ollama.png
├── peft x trl button.png
├── start free finetune button.png
├── unsloth end.png
├── unsloth loading page render.png
├── unsloth logo black text.png
├── unsloth logo only.png
├── unsloth logo white text.png
├── unsloth made with love.png
├── unsloth new logo.png
├── unsloth sticker.png
├── pyproject.toml (11.9k tokens)
├── tests/
├── __init__.py
├── qlora/
├── README.md (500 tokens)
├── test_hf_qlora_train_and_merge.py (1000 tokens)
├── test_unsloth_qlora_train_and_merge.py (1200 tokens)
├── saving/
├── gpt-oss-merge/
├── run_test.sh (100 tokens)
├── test_merged_model.py (300 tokens)
├── train_and_merge.py (500 tokens)
├── language_models/
├── test_merge_4bit_validation.py (1300 tokens)
├── test_merge_model_perplexity_llama-3.2.py (1500 tokens)
├── test_merge_model_perplexity_mistral.py (1800 tokens)
├── test_merge_model_perplexity_phi_4.py (1500 tokens)
├── test_merged_model_perplexity_llama-3.1-8b.py (1500 tokens)
├── test_merged_model_perplexity_qwen_2.5.py (1800 tokens)
├── test_push_to_hub_merged.py (1100 tokens)
├── test_push_to_hub_merged_sharded_index_file.py (1200 tokens)
├── test_save_merged_grpo_model.py (5.1k tokens)
├── non_peft/
├── test_mistral_non_peft.py (400 tokens)
├── test_whisper_non_peft.py (400 tokens)
├── test_unsloth_save.py (2.3k tokens)
├── text_to_speech_models/
├── test_csm.py (1000 tokens)
├── test_lasa.py (1400 tokens)
├── test_orpheus.py (1700 tokens)
├── test_whisper.py (1300 tokens)
├── vision_models/
├── test_index_file_sharded_model.py (1900 tokens)
├── test_push_to_hub_merged.py (1800 tokens)
├── test_save_merge_qwen2.5vl32B_model_ocr_benchmark.py (1900 tokens)
├── test_save_merge_vision_model_ocr_benchmark.py (1900 tokens)
├── test_model_registry.py (600 tokens)
├── utils/
├── __init__.py (200 tokens)
├── aime_eval.md (1300 tokens)
├── aime_eval.py (3.8k tokens)
├── cleanup_utils.py (1400 tokens)
├── data_utils.py (1100 tokens)
├── hf_utils.py (1600 tokens)
├── ocr_eval.md (600 tokens)
├── ocr_eval.py (2.4k tokens)
├── os_utils.py (800 tokens)
├── perplexity_eval.md (100 tokens)
├── perplexity_eval.py (600 tokens)
├── test_qat.py (1300 tokens)
├── unsloth-cli.py (2.4k tokens)
├── unsloth/
├── __init__.py (1900 tokens)
├── _auto_install.py (400 tokens)
├── chat_templates.py (26.3k tokens)
├── dataprep/
├── __init__.py (100 tokens)
├── synthetic.py (3.1k tokens)
├── synthetic_configs.py (800 tokens)
├── device_type.py (600 tokens)
├── import_fixes.py (1700 tokens)
├── kernels/
├── __init__.py (500 tokens)
├── cross_entropy_loss.py (2.9k tokens)
├── fast_lora.py (3.7k tokens)
├── flex_attention.py (1400 tokens)
├── fp8.py (4k tokens)
├── geglu.py (1400 tokens)
├── layernorm.py (1400 tokens)
├── moe/
├── LICENSE (6.9k tokens)
├── README.md (1200 tokens)
├── __init__.py
├── benchmark/
├── benchmark_fused_moe.py (2.8k tokens)
├── utils.py (1400 tokens)
├── grouped_gemm/
├── LICENSE (6.9k tokens)
├── __init__.py
├── interface.py (7.7k tokens)
├── kernels/
├── __init__.py
├── autotuning.py (2000 tokens)
├── backward.py (4.7k tokens)
├── forward.py (2.2k tokens)
├── tuning.py (1700 tokens)
├── reference/
├── __init__.py
├── layers/
├── llama4_moe.py (3.4k tokens)
├── qwen3_moe.py (2.6k tokens)
├── moe_block.py (1200 tokens)
├── moe_ops.py (800 tokens)
├── requirements.txt
├── tests/
├── __init__.py
├── common.py (2k tokens)
├── moe_utils.py (3.7k tokens)
├── run_qwen3_moe_tests.sh (200 tokens)
├── test_grouped_gemm.py (8.7k tokens)
├── test_llama4_moe.py (1600 tokens)
├── test_qwen3_moe.py (2k tokens)
├── rms_layernorm.py (2000 tokens)
├── rope_embedding.py (1400 tokens)
├── swiglu.py (700 tokens)
├── utils.py (5.9k tokens)
├── models/
├── __init__.py (300 tokens)
├── _utils.py (15.1k tokens)
├── cohere.py (3.9k tokens)
├── dpo.py (200 tokens)
├── falcon_h1.py (5.9k tokens)
├── gemma.py (3.5k tokens)
├── gemma2.py (4.4k tokens)
├── granite.py (4.5k tokens)
├── llama.py (26.5k tokens)
├── llama4.py (100 tokens)
├── loader.py (10k tokens)
├── loader_utils.py (1300 tokens)
├── mapper.py (7.9k tokens)
├── mistral.py (3.8k tokens)
├── qwen2.py (800 tokens)
├── qwen3.py (3.7k tokens)
├── qwen3_moe.py (1900 tokens)
├── rl.py (10.1k tokens)
├── rl_replacements.py (7.6k tokens)
├── vision.py (9.8k tokens)
├── ollama_template_mappers.py (16.6k tokens)
├── registry/
├── REGISTRY.md (700 tokens)
├── __init__.py (400 tokens)
├── _deepseek.py (1300 tokens)
├── _gemma.py (500 tokens)
├── _llama.py (800 tokens)
├── _mistral.py (600 tokens)
├── _phi.py (500 tokens)
├── _qwen.py (800 tokens)
├── registry.py (1100 tokens)
├── save.py (21.5k tokens)
├── tokenizer_utils.py (8.4k tokens)
├── trainer.py (1700 tokens)
├── utils/
├── __init__.py
├── hf_hub.py (400 tokens)
```
## /.github/FUNDING.yml
```yml path="/.github/FUNDING.yml"
# These are supported funding model platforms
github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: unsloth
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
```
## /.github/ISSUE_TEMPLATE/bug---issue.md
---
name: Bug / Issue
about: Bug / Issue
title: "[Bug] Please fill in your issue title here."
labels: bug
assignees: ''
---
1. Did you update? `pip install --upgrade unsloth unsloth_zoo`
2. `Colab` or `Kaggle` or local / cloud
3. Number of GPUs used (check with `nvidia-smi`)
4. Which notebook? Please link!
5. Which Unsloth version, TRL version, transformers version, PyTorch version?
6. Which trainer? `SFTTrainer`, `GRPOTrainer`, etc.
```python
# Put minimal code to reproduce the error here ### Remove your Hugging Face token! ###
```
🦥 You can also ask via our Reddit page: https://www.reddit.com/r/unsloth/
## /.github/ISSUE_TEMPLATE/feature-request.md
---
name: Feature Request
about: New features, model support, ideas
title: "[Feature]"
labels: feature request
assignees: ''
---
For new models, have you tried:
```python
from unsloth import FastModel
model, tokenizer = FastModel.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct",
    trust_remote_code = True,
)
from transformers import AutoModelForSequenceClassification
model, tokenizer = FastModel.from_pretrained(
    "microsoft/Phi-4-multimodal-instruct", # example model; use your own model name
    auto_model = AutoModelForSequenceClassification,
)
```
## /.github/workflows/stale.yml
```yml path="/.github/workflows/stale.yml"
name: 'Inactive Issue Pinger'
on:
  schedule:
    - cron: '30 5 * * *' # Runs at 5:30 UTC every day
jobs:
  stale:
    runs-on: ubuntu-latest
    permissions:
      issues: write
    steps:
      - uses: actions/stale@v9
        with:
          # The message to post on stale issues.
          # This message will ping the issue author.
          # Note: The stale bot action does not currently support a direct placeholder for the last commenter.
          # As a workaround, this message encourages any participant to reply.
          stale-issue-message: >
            Is this issue still important to you?
            Apologies in advance if we have missed this issue.
            For faster response times, please post on our Reddit - https://www.reddit.com/r/unsloth - or our Discord - https://discord.com/invite/unsloth
          # The number of days of inactivity before an issue is considered stale.
          days-before-issue-stale: 9999
          # Set to -1 to never close stale issues.
          days-before-issue-close: -1
          # A label to apply to stale issues.
          stale-issue-label: 'inactive'
          # The number of operations to perform per run to avoid rate limiting.
          operations-per-run: 500
          enable-statistics: false
```
## /.gitignore
```gitignore path="/.gitignore"
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*.class
unsloth_compiled_cache/
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
.vscode
```
## /CODE_OF_CONDUCT.md
# Contributor Covenant Code of Conduct
## Our Pledge
We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, caste, color, religion, or sexual
identity and orientation.
We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.
## Our Standards
Examples of behavior that contributes to a positive environment for our
community include:
* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
and learning from the experience
* Focusing on what is best not just for us as individuals, but for the overall
community
Examples of unacceptable behavior include:
* The use of sexualized language or imagery, and sexual attention or advances of
any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email address,
without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
professional setting
## Enforcement Responsibilities
Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.
Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.
## Scope
This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.
## Enforcement
Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement at support@unsloth.ai.
All complaints will be reviewed and investigated promptly and fairly.
All community leaders are obligated to respect the privacy and security of the
reporter of any incident.
## Enforcement Guidelines
Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:
### 1. Correction
**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.
**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.
### 2. Warning
**Community Impact**: A violation through a single incident or series of
actions.
**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or permanent
ban.
### 3. Temporary Ban
**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.
**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.
### 4. Permanent Ban
**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior, harassment of an
individual, or aggression toward or disparagement of classes of individuals.
**Consequence**: A permanent ban from any sort of public interaction within the
community.
## Attribution
This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.1, available at
[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
Community Impact Guidelines were inspired by
[Mozilla's code of conduct enforcement ladder][Mozilla CoC].
For answers to common questions about this code of conduct, see the FAQ at
[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
[https://www.contributor-covenant.org/translations][translations].
[homepage]: https://www.contributor-covenant.org
[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
[Mozilla CoC]: https://github.com/mozilla/diversity
[FAQ]: https://www.contributor-covenant.org/faq
[translations]: https://www.contributor-covenant.org/translations
## /CONTRIBUTING.md
# 🦥 Contributing to Unsloth
Thank you not only for using Unsloth but also for being interested in helping out! We value all contributions, whether they come as code, ideas, support for others, or simply spreading the word about Unsloth! 💕
- **[Support the Community](https://github.com/unslothai/unsloth/issues)**: Answer questions, review pull requests, or assist others in discussions.
- **Fix Bugs**: Identify and resolve issues with the existing codebase.
- **Submit Ideas**: Request new features or share enhancements you'd like to see.
- **Develop Features**: Implement new functionality or improve existing tools via pull requests (PRs).
- **[Improve Documentation](https://docs.unsloth.ai/)**: Help by creating guides, FAQs, or enhancing clarity.
One of the best ways to support us is by spreading the word about Unsloth! Share how it’s powering your amazing projects in blog posts or social media, and inspire others to explore its potential. Even a simple star on our repo goes a long way in showing your support and helping the community grow. 🌟
## Submitting Issues
If you find a bug or have a feature idea, we’d love to hear from you! Here’s how to make your submission stand out:
### Reporting Bugs
1. **Search First**: Check if the issue has already been reported using GitHub’s search bar under Issues.
2. **Details Matter**: Is this on Google Colab, Kaggle, or on another platform service? Are you using Unsloth's official notebook? Include your OS, Python version, and other relevant details. For bugs, a concise code snippet that reproduces the issue is incredibly helpful.
3. **Be Thorough**: Attach screenshots, traceback logs, or any additional information that might speed up resolution.
## Spread the Word
Your support extends beyond code:
- Spread the word by writing about Unsloth in blogs or social media.
- Share how Unsloth powers your projects.
- Star our repository to show your appreciation.
Finally, please be mindful of our [Code of Conduct](https://github.com/unslothai/unsloth/blob/main/CODE_OF_CONDUCT.md) to ensure a welcoming and inclusive environment for everyone.
Thank you so much for reading and we hope you have lots of fun using Unsloth! 🦥
## /README.md
<div align="center">
<a href="https://docs.unsloth.ai"><picture>
<source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20white%20text.png">
<source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20black%20text.png">
<img alt="unsloth logo" src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20logo%20black%20text.png" height="110" style="max-width: 100%;">
</picture></a>
<a href="https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-Fine-tuning.ipynb"><img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/start free finetune button.png" width="154"></a>
<a href="https://discord.com/invite/unsloth"><img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/Discord button.png" width="165"></a>
<a href="https://docs.unsloth.ai"><img src="https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Documentation%20Button.png" width="137"></a>
### Train gpt-oss, DeepSeek, Gemma, Qwen & Llama 2x faster with 70% less VRAM!

</div>
## ✨ Train for Free
Notebooks are beginner-friendly. Read our [guide](https://docs.unsloth.ai/get-started/fine-tuning-guide). Add your dataset, run the notebook, then export your trained model to GGUF, Ollama, vLLM or Hugging Face.
| Model | Free Notebooks | Performance | Memory use |
|-----------|---------|--------|----------|
| **gpt-oss (20B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-Fine-tuning.ipynb) | 1.5x faster | 70% less |
| **Qwen3 (14B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(14B)-Reasoning-Conversational.ipynb) | 2x faster | 70% less |
| **DeepSeek-OCR (3B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_(3B).ipynb) | 1.5x faster | 30% less |
| **gpt-oss (20B): GRPO** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb) | 2x faster | 80% less |
| **Qwen3-VL (8B): GSPO** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_(8B)-Vision-GRPO.ipynb) | 1.5x faster | 80% less |
| **Qwen3-VL (8B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_VL_(8B)-Vision.ipynb) | 2x faster | 50% less |
| **Gemma 3 (270M)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3_(270M).ipynb) | 1.7x faster | 60% less |
| **Gemma 3n (4B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Gemma3N_(4B)-Conversational.ipynb) | 1.5x faster | 50% less |
| **Llama 3.1 (8B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.1_(8B)-Alpaca.ipynb) | 2x faster | 70% less |
| **Orpheus-TTS (3B)** | [▶️ Start for free](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Orpheus_(3B)-TTS.ipynb) | 1.5x faster | 50% less |
- See all our notebooks for: [Kaggle](https://github.com/unslothai/notebooks?tab=readme-ov-file#-kaggle-notebooks), [GRPO](https://docs.unsloth.ai/get-started/unsloth-notebooks#grpo-reasoning-rl-notebooks), **[TTS](https://docs.unsloth.ai/get-started/unsloth-notebooks#text-to-speech-tts-notebooks)** & [Vision](https://docs.unsloth.ai/get-started/unsloth-notebooks#vision-multimodal-notebooks)
- See [all our models](https://docs.unsloth.ai/get-started/all-our-models) and [all our notebooks](https://docs.unsloth.ai/get-started/unsloth-notebooks)
- See detailed documentation for Unsloth [here](https://docs.unsloth.ai/)
## ⚡ Quickstart
### Linux or WSL
```bash
pip install unsloth
```
### Windows
For Windows, `pip install unsloth` works only if you have PyTorch installed. Read our [Windows Guide](https://docs.unsloth.ai/get-started/installing-+-updating/windows-installation).
### Docker
Use our official [Unsloth Docker image](https://hub.docker.com/r/unsloth/unsloth), `unsloth/unsloth`. Read our [Docker Guide](https://docs.unsloth.ai/get-started/install-and-update/docker).
### Blackwell & DGX Spark
For RTX 50 series, B200 and RTX 6000 GPUs: `pip install unsloth`. Read our [Blackwell Guide](https://docs.unsloth.ai/basics/training-llms-with-blackwell-rtx-50-series-and-unsloth) and [DGX Spark Guide](https://docs.unsloth.ai/new/fine-tuning-llms-with-nvidia-dgx-spark-and-unsloth) for more details.
## 🦥 Unsloth News
- **DeepSeek-OCR**: Fine-tune to improve DeepSeek-OCR's language understanding by 89%. [Guide](https://docs.unsloth.ai/new/deepseek-ocr-run-and-fine-tune) • [Notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Deepseek_OCR_(3B).ipynb)
- **Docker**: Use Unsloth with no setup & environment issues with our new image. [Guide](https://docs.unsloth.ai/new/how-to-train-llms-with-unsloth-and-docker) • [Docker image](https://hub.docker.com/r/unsloth/unsloth)
- **gpt-oss RL**: Introducing the fastest possible inference for gpt-oss RL! [Read blog](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning)
- **Vision RL**: You can now train VLMs with GRPO or GSPO in Unsloth! [Read guide](https://docs.unsloth.ai/new/vision-reinforcement-learning-vlm-rl)
- **Quantization-Aware Training**: We collaborated with PyTorch on QAT, recovering as much as 70% accuracy. [Read blog](https://docs.unsloth.ai/new/quantization-aware-training-qat)
- **Memory-efficient RL**: We're introducing even better RL. Our new kernels & algorithms allow faster RL with 50% less VRAM & 10× more context. [Read blog](https://docs.unsloth.ai/new/memory-efficient-rl)
- **gpt-oss** by OpenAI: For details on [Unsloth Flex Attention](https://docs.unsloth.ai/new/long-context-gpt-oss-training), long-context training and bug fixes, [read our guide](https://docs.unsloth.ai/basics/gpt-oss). 20B works on a 14GB GPU and 120B on 65GB VRAM. [gpt-oss uploads](https://huggingface.co/collections/unsloth/gpt-oss-6892433695ce0dee42f31681).
- **Gemma 3n** by Google: [Read Blog](https://docs.unsloth.ai/basics/gemma-3n-how-to-run-and-fine-tune). We [uploaded GGUFs, 4-bit models](https://huggingface.co/collections/unsloth/gemma-3n-685d3874830e49e1c93f9339).
- **[Text-to-Speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning)** is now supported, including `sesame/csm-1b` and STT `openai/whisper-large-v3`.
- **[Qwen3](https://docs.unsloth.ai/basics/qwen3-how-to-run-and-fine-tune)** is now supported. Qwen3-30B-A3B fits on 17.5GB VRAM.
- Introducing **[Dynamic 2.0](https://docs.unsloth.ai/basics/unsloth-dynamic-2.0-ggufs)** quants that set new benchmarks on 5-shot MMLU & Aider Polyglot.
- [**EVERYTHING** is now supported](https://unsloth.ai/blog/gemma3#everything) - all models (TTS, BERT, Mamba), FFT, etc. [MultiGPU](https://docs.unsloth.ai/basics/multi-gpu-training-with-unsloth) coming soon. Enable FFT with `full_finetuning = True`, 8-bit with `load_in_8bit = True`.
<details>
<summary>Click for more news</summary>
- 📣 [DeepSeek-R1](https://unsloth.ai/blog/deepseek-r1) - run or fine-tune them [with our guide](https://unsloth.ai/blog/deepseek-r1). All model uploads: [here](https://huggingface.co/collections/unsloth/deepseek-r1-all-versions-678e1c48f5d2fce87892ace5).
- 📣 Introducing Long-context [Reasoning (GRPO)](https://unsloth.ai/blog/grpo) in Unsloth. Train your own reasoning model with just 5GB VRAM. Transform Llama, Phi, Mistral etc. into reasoning LLMs!
- 📣 Introducing Unsloth [Dynamic 4-bit Quantization](https://unsloth.ai/blog/dynamic-4bit)! We dynamically opt not to quantize certain parameters and this greatly increases accuracy while only using <10% more VRAM than BnB 4-bit. See our collection on [Hugging Face here.](https://huggingface.co/collections/unsloth/unsloth-4-bit-dynamic-quants-67503bb873f89e15276c44e7)
- 📣 **[Llama 4](https://unsloth.ai/blog/llama4)** by Meta, including Scout & Maverick are now supported.
- 📣 [Phi-4](https://unsloth.ai/blog/phi4) by Microsoft: We also [fixed bugs](https://unsloth.ai/blog/phi4) in Phi-4 and [uploaded GGUFs, 4-bit](https://huggingface.co/collections/unsloth/phi-4-all-versions-677eecf93784e61afe762afa).
- 📣 [Vision models](https://unsloth.ai/blog/vision) now supported! [Llama 3.2 Vision (11B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3.2_(11B)-Vision.ipynb), [Qwen 2.5 VL (7B)](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_VL_(7B)-Vision.ipynb) and [Pixtral (12B) 2409](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Pixtral_(12B)-Vision.ipynb)
- 📣 [Llama 3.3 (70B)](https://huggingface.co/collections/unsloth/llama-33-all-versions-67535d7d994794b9d7cf5e9f), Meta's latest model is supported.
- 📣 We worked with Apple to add [Cut Cross Entropy](https://arxiv.org/abs/2411.09009). Unsloth now supports 89K context for Meta's Llama 3.3 (70B) on an 80GB GPU - 13x longer than HF+FA2. For Llama 3.1 (8B), Unsloth enables 342K context, surpassing its native 128K support.
- 📣 We found and helped fix a [gradient accumulation bug](https://unsloth.ai/blog/gradient)! Please update Unsloth and transformers.
- 📣 We cut memory usage by a [further 30%](https://unsloth.ai/blog/long-context) and now support [4x longer context windows](https://unsloth.ai/blog/long-context)!
</details>
## 🔗 Links and Resources
| Type | Links |
| ------------------------------- | --------------------------------------- |
| <img width="15" src="https://redditinc.com/hs-fs/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" /> **r/unsloth Reddit** | [Join Reddit community](https://reddit.com/r/unsloth)|
| 📚 **Documentation & Wiki** | [Read Our Docs](https://docs.unsloth.ai) |
| <img width="16" src="https://upload.wikimedia.org/wikipedia/commons/6/6f/Logo_of_Twitter.svg" /> **Twitter (aka X)** | [Follow us on X](https://twitter.com/unslothai)|
| 💾 **Installation** | [Pip & Docker Install](https://docs.unsloth.ai/get-started/installing-+-updating)|
| 🔮 **Our Models** | [Unsloth Catalog](https://docs.unsloth.ai/get-started/all-our-models)|
| ✍️ **Blog** | [Read our Blogs](https://unsloth.ai/blog)|
## ⭐ Key Features
- Supports **full-finetuning**, pretraining, 4-bit, 16-bit and **8-bit** training
- Supports **all models** including [TTS](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), multimodal, [BERT](https://docs.unsloth.ai/get-started/unsloth-notebooks#other-important-notebooks) and more! Any model that works in transformers, works in Unsloth.
- The most efficient library for [Reinforcement Learning (RL)](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), using 80% less VRAM. Supports GRPO, GSPO, DrGRPO, DAPO etc.
- **0% loss in accuracy** - no approximation methods - all exact.
- Supports NVIDIA (since 2018), [AMD](https://docs.unsloth.ai/get-started/install-and-update/amd) and Intel GPUs. Minimum CUDA Capability 7.0 (V100, T4, Titan V, RTX 20, 30, 40x, A100, H100, L40 etc)
- Works on **Linux**, WSL and **Windows**
- All kernels written in [OpenAI's Triton](https://openai.com/index/triton/) language. Manual backprop engine.
- If you trained a model with 🦥Unsloth, you can use this cool sticker! <img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/made with unsloth.png" width="200" align="center" />
## 💾 Install Unsloth
You can also see our docs for more detailed installation and updating instructions [here](https://docs.unsloth.ai/get-started/installing-+-updating).
Unsloth supports Python 3.13 or lower.
### Pip Installation
**Install with pip (recommended) for Linux devices:**
```
pip install unsloth
```
**To update Unsloth:**
```
pip install --upgrade --force-reinstall --no-cache-dir unsloth unsloth_zoo
```
See [here](https://github.com/unslothai/unsloth#advanced-pip-installation) for advanced pip install instructions.
### Windows Installation
1. **Install NVIDIA Video Driver:**
You should install the latest version of your GPU's driver. Download drivers here: [NVIDIA GPU Drivers](https://www.nvidia.com/Download/index.aspx).
2. **Install Visual Studio C++:**
You will need Visual Studio with C++ installed. By default, C++ is not installed with [Visual Studio](https://visualstudio.microsoft.com/vs/community/), so make sure you select all of the C++ options. Also select the options for the Windows 10/11 SDK. For detailed instructions with options, see [here](https://docs.unsloth.ai/get-started/installing-+-updating).
3. **Install CUDA Toolkit:**
Follow the instructions to install the [CUDA Toolkit](https://developer.nvidia.com/cuda-toolkit-archive).
4. **Install PyTorch:**
You will need the correct version of PyTorch that is compatible with your CUDA drivers, so make sure to select them carefully.
[Install PyTorch](https://pytorch.org/get-started/locally/).
5. **Install Unsloth:**
```bash
pip install unsloth
```
#### Notes
To run Unsloth directly on Windows:
- Install Triton from this Windows fork and follow the instructions [here](https://github.com/woct0rdho/triton-windows) (be aware that the Windows fork requires PyTorch >= 2.4 and CUDA 12)
- In the `SFTConfig`, set `dataset_num_proc=1` to avoid a crashing issue:
```python
SFTConfig(
    dataset_num_proc = 1,
    ...
)
```
#### Advanced/Troubleshooting
For **advanced installation instructions** or if you see weird errors during installations:
First, try using an isolated environment, then run `pip install unsloth`:
```bash
python -m venv unsloth
source unsloth/bin/activate
pip install unsloth
```
1. Install `torch` and `triton`. Go to https://pytorch.org to install them, e.g. `pip install torch torchvision torchaudio triton`
2. Confirm if CUDA is installed correctly. Try `nvcc`. If that fails, you need to install `cudatoolkit` or CUDA drivers.
3. Install `xformers` manually via:
```bash
pip install ninja
pip install -v --no-build-isolation -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
```
Check that `xformers` succeeded with `python -m xformers.info`. See https://github.com/facebookresearch/xformers for details. Another option is to install `flash-attn` for Ampere GPUs and skip `xformers`.
4. For GRPO runs, try installing `vllm` and check that `pip install vllm` succeeds.
5. Double check that your versions of Python, CUDA, CUDNN, `torch`, `triton`, and `xformers` are compatible with one another. The [PyTorch Compatibility Matrix](https://github.com/pytorch/pytorch/blob/main/RELEASE.md#release-compatibility-matrix) may be useful.
6. Finally, install `bitsandbytes` and check it with `python -m bitsandbytes`.
### Conda Installation (Optional)
`⚠️Only use Conda if you have it. If not, use Pip`. Select either `pytorch-cuda=11.8,12.1` for CUDA 11.8 or CUDA 12.1. We support `python=3.10,3.11,3.12`.
```bash
conda create --name unsloth_env \
python=3.11 \
pytorch-cuda=12.1 \
pytorch cudatoolkit xformers -c pytorch -c nvidia -c xformers \
-y
conda activate unsloth_env
pip install unsloth
```
<details>
<summary>If you're looking to install Conda in a Linux environment, <a href="https://docs.anaconda.com/miniconda/">read here</a>, or run the below 🔽</summary>
```bash
mkdir -p ~/miniconda3
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda3/miniconda.sh
bash ~/miniconda3/miniconda.sh -b -u -p ~/miniconda3
rm -rf ~/miniconda3/miniconda.sh
~/miniconda3/bin/conda init bash
~/miniconda3/bin/conda init zsh
```
</details>
### Advanced Pip Installation
`⚠️Do **NOT** use this if you have Conda.` Pip is a bit more complex since there are dependency issues. The pip command is different for `torch 2.2,2.3,2.4,2.5` and CUDA versions.
For other torch versions, we support `torch211`, `torch212`, `torch220`, `torch230`, `torch240` and for CUDA versions, we support `cu118` and `cu121` and `cu124`. For Ampere devices (A100, H100, RTX3090) and above, use `cu118-ampere` or `cu121-ampere` or `cu124-ampere`.
For example, if you have `torch 2.4` and `CUDA 12.1`, use:
```bash
pip install --upgrade pip
pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
```
Another example, if you have `torch 2.5` and `CUDA 12.4`, use:
```bash
pip install --upgrade pip
pip install "unsloth[cu124-torch250] @ git+https://github.com/unslothai/unsloth.git"
```
And other examples:
```bash
pip install "unsloth[cu121-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118-ampere-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu118-torch240] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-ampere-torch230] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu121-torch250] @ git+https://github.com/unslothai/unsloth.git"
pip install "unsloth[cu124-ampere-torch250] @ git+https://github.com/unslothai/unsloth.git"
```
Or, run the below in a terminal to get the **optimal** pip installation command:
```bash
wget -qO- https://raw.githubusercontent.com/unslothai/unsloth/main/unsloth/_auto_install.py | python -
```
Or, run the below manually in a Python REPL:
```python
try: import torch
except: raise ImportError('Install torch via `pip install torch`')
from packaging.version import Version as V
import re
v = V(re.match(r"[0-9\.]{3,}", torch.__version__).group(0))
cuda = str(torch.version.cuda)
is_ampere = torch.cuda.get_device_capability()[0] >= 8
USE_ABI = torch._C._GLIBCXX_USE_CXX11_ABI
if cuda not in ("11.8", "12.1", "12.4", "12.6", "12.8"): raise RuntimeError(f"CUDA = {cuda} not supported!")
if v <= V('2.1.0'): raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'): x = 'cu{}{}-torch211'
elif v <= V('2.1.2'): x = 'cu{}{}-torch212'
elif v < V('2.3.0'): x = 'cu{}{}-torch220'
elif v < V('2.4.0'): x = 'cu{}{}-torch230'
elif v < V('2.5.0'): x = 'cu{}{}-torch240'
elif v < V('2.5.1'): x = 'cu{}{}-torch250'
elif v <= V('2.5.1'): x = 'cu{}{}-torch251'
elif v < V('2.7.0'): x = 'cu{}{}-torch260'
elif v < V('2.7.9'): x = 'cu{}{}-torch270'
elif v < V('2.8.0'): x = 'cu{}{}-torch271'
elif v < V('2.8.9'): x = 'cu{}{}-torch280'
else: raise RuntimeError(f"Torch = {v} too new!")
if v > V('2.6.9') and cuda not in ("11.8", "12.6", "12.8"): raise RuntimeError(f"CUDA = {cuda} not supported!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')
```
### Docker Installation
You can use our pre-built Docker container with all dependencies to use Unsloth instantly with no setup required.
[Read our guide](https://docs.unsloth.ai/get-started/install-and-update/docker).
This container requires installing [NVIDIA's Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
```bash
docker run -d -e JUPYTER_PASSWORD="mypassword" \
-p 8888:8888 -p 2222:22 \
-v $(pwd)/work:/workspace/work \
--gpus all \
unsloth/unsloth
```
Access Jupyter Lab at `http://localhost:8888` and start fine-tuning!
## 📜 Documentation
- Go to our official [Documentation](https://docs.unsloth.ai) for [running models](https://docs.unsloth.ai/basics/running-and-saving-models), [saving to GGUF](https://docs.unsloth.ai/basics/running-and-saving-models/saving-to-gguf), [checkpointing](https://docs.unsloth.ai/basics/finetuning-from-last-checkpoint), [evaluation](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide#evaluation) and more!
- Read our Guides for: [Fine-tuning](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide), [Reinforcement Learning](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide), [Text-to-Speech (TTS)](https://docs.unsloth.ai/basics/text-to-speech-tts-fine-tuning), [Vision](https://docs.unsloth.ai/basics/vision-fine-tuning) and [any model](https://docs.unsloth.ai/models/tutorials-how-to-fine-tune-and-run-llms).
- We support Hugging Face's transformers, TRL, Trainer, Seq2SeqTrainer and plain PyTorch code.
Unsloth example code to fine-tune gpt-oss-20b:
```python
from unsloth import FastLanguageModel, FastModel
import torch
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
max_seq_length = 2048 # Supports RoPE Scaling internally, so choose any!
# Get LAION dataset
url = "https://huggingface.co/datasets/laion/OIG/resolve/main/unified_chip2.jsonl"
dataset = load_dataset("json", data_files = {"train" : url}, split = "train")
# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # or choose any model
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",
    max_seq_length = 2048,   # Choose any for long context!
    load_in_4bit = True,     # 4-bit quantization. False = 16-bit LoRA.
    load_in_8bit = False,    # 8-bit quantization
    load_in_16bit = False,   # [NEW!] 16-bit LoRA
    full_finetuning = False, # Use for full fine-tuning.
    # token = "hf_...", # use one if using gated models
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    max_seq_length = max_seq_length,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

trainer = SFTTrainer(
    model = model,
    train_dataset = dataset,
    tokenizer = tokenizer,
    args = SFTConfig(
        max_seq_length = max_seq_length,
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 10,
        max_steps = 60,
        logging_steps = 1,
        output_dir = "outputs",
        optim = "adamw_8bit",
        seed = 3407,
    ),
)
trainer.train()
# Go to https://docs.unsloth.ai for advanced tips like
# (1) Saving to GGUF / merging to 16bit for vLLM
# (2) Continued training from a saved LoRA adapter
# (3) Adding an evaluation loop / OOMs
# (4) Customized chat templates
```
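As a follow-up to the comments above, here is a minimal sketch of exporting the trained model, assuming the `save_pretrained_merged` and `save_pretrained_gguf` helpers described in the Unsloth saving docs (exact method names and quantization options may vary by version):
```python
# Hedged sketch: output paths and quantization choices below are illustrative only.

# Save only the LoRA adapters (small, quick to share)
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Merge LoRA into the base weights in 16-bit for vLLM / transformers serving
model.save_pretrained_merged("merged_16bit_model", tokenizer, save_method = "merged_16bit")

# Export a GGUF file for llama.cpp / Ollama ("q8_0" chosen as an example)
model.save_pretrained_gguf("gguf_model", tokenizer, quantization_method = "q8_0")
```
See the [saving to GGUF](https://docs.unsloth.ai/basics/running-and-saving-models/saving-to-gguf) docs for the full list of supported export targets.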
<a name="RL"></a>
## 💡 Reinforcement Learning
[RL](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) including [GRPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide#training-with-grpo), [GSPO](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/gspo-reinforcement-learning), DrGRPO, DAPO, PPO, Reward Modelling, Online DPO all work with Unsloth.
Read our [Reinforcement Learning Guide](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) or our [advanced RL docs](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide/advanced-rl-documentation) for batching, generation & training parameters.
List of RL notebooks:
- gpt-oss GSPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/gpt-oss-(20B)-GRPO.ipynb)
- Qwen2.5-VL GSPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen2_5_7B_VL_GRPO.ipynb)
- Advanced Qwen3 GRPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Qwen3_(4B)-GRPO.ipynb)
- ORPO notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Llama3_(8B)-ORPO.ipynb)
- DPO Zephyr notebook: [Link](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/Zephyr_(7B)-DPO.ipynb)
- KTO notebook: [Link](https://colab.research.google.com/drive/1MRgGtLWuZX4ypSfGguFgC-IblTvO2ivM?usp=sharing)
- SimPO notebook: [Link](https://colab.research.google.com/drive/1Hs5oQDovOay4mFA6Y9lQhVJ8TnbFLFh2?usp=sharing)
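As a rough sketch of how GRPO plugs into Unsloth (this assumes TRL's `GRPOConfig`/`GRPOTrainer` API plus a toy prompt dataset and toy reward function; the notebooks above are the tested, end-to-end recipes):
```python
# Minimal GRPO sketch: the model name, dataset and reward below are illustrative.
from unsloth import FastLanguageModel
from trl import GRPOConfig, GRPOTrainer
from datasets import Dataset

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-4B-Base", # example model
    max_seq_length = 1024,
    load_in_4bit = True,
    fast_inference = True, # vLLM-backed generation for RL rollouts (requires vllm)
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    lora_alpha = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"],
)

# Toy prompt-only dataset and a toy reward that prefers short completions
dataset = Dataset.from_dict({"prompt": ["Explain LoRA in one sentence."] * 64})
def short_answer_reward(completions, **kwargs):
    return [1.0 if len(c) < 200 else 0.0 for c in completions]

trainer = GRPOTrainer(
    model = model,
    reward_funcs = [short_answer_reward],
    args = GRPOConfig(
        per_device_train_batch_size = 8,
        num_generations = 4, # completions sampled per prompt for the group advantage
        max_steps = 10,
        output_dir = "grpo_outputs",
    ),
    train_dataset = dataset,
)
trainer.train()
```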
## 🥇 Performance Benchmarking
- For our most detailed benchmarks, read our [Llama 3.3 Blog](https://unsloth.ai/blog/llama3-3).
- Benchmarking of Unsloth was also conducted by [🤗Hugging Face](https://huggingface.co/blog/unsloth-trl).
We tested using the Alpaca Dataset, a batch size of 2, gradient accumulation steps of 4, rank = 32, and applied QLoRA on all linear layers (q, k, v, o, gate, up, down):
| Model | VRAM | 🦥 Unsloth speed | 🦥 VRAM reduction | 🦥 Longer context | 😊 Hugging Face + FA2 |
|----------------|-------|-----------------|----------------|----------------|--------------------|
| Llama 3.3 (70B)| 80GB | 2x | >75% | 13x longer | 1x |
| Llama 3.1 (8B) | 80GB | 2x | >70% | 12x longer | 1x |
### Context length benchmarks
#### Llama 3.1 (8B) max. context length
We tested Llama 3.1 (8B) Instruct and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
|----------|-----------------------|-----------------|
| 8 GB | 2,972 | OOM |
| 12 GB | 21,848 | 932 |
| 16 GB | 40,724 | 2,551 |
| 24 GB | 78,475 | 5,789 |
| 40 GB | 153,977 | 12,264 |
| 48 GB | 191,728 | 15,502 |
| 80 GB | 342,733 | 28,454 |
#### Llama 3.3 (70B) max. context length
We tested Llama 3.3 (70B) Instruct on a 80GB A100 and did 4bit QLoRA on all linear layers (Q, K, V, O, gate, up and down) with rank = 32 with a batch size of 1. We padded all sequences to a certain maximum sequence length to mimic long context finetuning workloads.
| GPU VRAM | 🦥Unsloth context length | Hugging Face + FA2 |
|----------|------------------------|------------------|
| 48 GB | 12,106 | OOM |
| 80 GB | 89,389 | 6,916 |
<br>

<br>
### Citation
You can cite the Unsloth repo as follows:
```bibtex
@software{unsloth,
  author = {Daniel Han and Michael Han and Unsloth team},
  title = {Unsloth},
  url = {http://github.com/unslothai/unsloth},
  year = {2023}
}
```
### Thank You to
- The [llama.cpp library](https://github.com/ggml-org/llama.cpp) that lets users save models with Unsloth
- The Hugging Face team and their libraries: [transformers](https://github.com/huggingface/transformers) and [TRL](https://github.com/huggingface/trl)
- The PyTorch and [Torch AO](https://github.com/unslothai/unsloth/pull/3391) teams for their contributions
- [Erik](https://github.com/erikwijmans) for his help adding [Apple's ML Cross Entropy](https://github.com/apple/ml-cross-entropy) in Unsloth
- [Etherl](https://github.com/Etherll) for adding support for [TTS, diffusion and BERT models](https://github.com/unslothai/notebooks/pull/34)
- And of course for every single person who has contributed or has used Unsloth!
## /images/Assistant.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Assistant.png
## /images/Colab.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Colab.png
## /images/Discord button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Discord button.png
## /images/Discord.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Discord.png
## /images/Documentation Button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Documentation Button.png
## /images/Free version button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Free version button.png
## /images/Kaggle.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Kaggle.png
## /images/Kofi button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Kofi button.png
## /images/LAION 2GPU.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/LAION 2GPU.png
## /images/Merge.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Merge.png
## /images/Run.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Run.png
## /images/Slim Orca 2GPUs.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Slim Orca 2GPUs.png
## /images/Terminal_Type.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Terminal_Type.png
## /images/Where_Terminal.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/Where_Terminal.png
## /images/buy me a coffee button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/buy me a coffee button.png
## /images/documentation github button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation github button.png
## /images/documentation green button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation green button.png
## /images/documentation lighter.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation lighter.png
## /images/documentation white button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/documentation white button.png
## /images/made with unsloth.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/made with unsloth.png
## /images/ollama.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/ollama.png
## /images/peft x trl button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/peft x trl button.png
## /images/start free finetune button.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/start free finetune button.png
## /images/unsloth end.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth end.png
## /images/unsloth loading page render.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth loading page render.png
## /images/unsloth logo black text.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo black text.png
## /images/unsloth logo only.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo only.png
## /images/unsloth logo white text.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth logo white text.png
## /images/unsloth made with love.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth made with love.png
## /images/unsloth new logo.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth new logo.png
## /images/unsloth sticker.png
Binary file available at https://raw.githubusercontent.com/unslothai/unsloth/refs/heads/main/images/unsloth sticker.png
## /pyproject.toml
```toml path="/pyproject.toml"
[build-system]
requires = ["setuptools==80.9.0", "setuptools-scm==9.2.0"]
build-backend = "setuptools.build_meta"
[project]
name = "unsloth"
dynamic = ["version"]
description = "2-5X faster training, reinforcement learning & finetuning"
readme = "README.md"
requires-python = ">=3.9,<3.14"
license = "Apache-2.0"
keywords = ["ai", "llm", "reinforcement learning", "machine learning", "artificial intelligence", "pytorch"]
authors = [
{email = "info@unsloth.ai"},
{name = "Unsloth AI team"},
]
maintainers = [
{name = "Daniel Han", email = "danielhanchen@gmail.com"},
{name = "Michael Han", email = "info@unsloth.ai"},
]
classifiers = [
"Programming Language :: Python",
"Environment :: GPU",
"Environment :: GPU :: NVIDIA CUDA",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
]
[tool.setuptools.dynamic]
version = {attr = "unsloth.models._utils.__version__"}
[tool.setuptools]
include-package-data = false
[tool.setuptools.packages.find]
exclude = ["images*", "tests*", "kernels/moe*"]
[project.optional-dependencies]
triton = [
"triton>=3.0.0 ; ('linux' in sys_platform)",
"triton-windows ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
huggingfacenotorch = [
"wheel>=0.42.0",
"packaging",
"numpy",
"tqdm",
"psutil",
"tyro",
"protobuf",
"sentencepiece>=0.2.0",
"datasets>=3.4.1,!=4.0.*,!=4.1.0",
"accelerate>=0.34.1",
"peft>=0.7.1,!=0.11.0",
"huggingface_hub>=0.34.0",
"hf_transfer",
"diffusers",
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,<=4.57.2",
"trl>=0.18.2,!=0.19.0,<=0.24.0",
]
huggingface = [
"unsloth[huggingfacenotorch]",
"unsloth_zoo>=2025.11.1",
"torchvision",
"unsloth[triton]",
]
windows = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0 ; (sys_platform == 'win32')",
"xformers>=0.0.22.post7 ; (sys_platform == 'win32')",
]
base = [
"unsloth[huggingface]",
]
cu118only = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.22.post7%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121only = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.22.post7-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch211 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.23.post1%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch212 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.23.post1-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.24%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu121onlytorch220 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.24-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
]
cu118onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch230 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.27-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu118onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp39-cp39-manylinux2014_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp310-cp310-manylinux2014_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp311-cp311-manylinux2014_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.27.post2%2Bcu118-cp312-cp312-manylinux2014_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch240 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post1-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch250 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.28.post2-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch251 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu121onlytorch251 = [
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu121/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch251 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post1-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch260 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
]
cu124onlytorch260 = [
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu124/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu126onlytorch260 = [
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.29.post3-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch270 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu126onlytorch270 = [
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu128onlytorch270 = [
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp39-cp39-manylinux_2_28_x86_64.whl ; python_version=='3.9' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp310-cp310-manylinux_2_28_x86_64.whl ; python_version=='3.10' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-manylinux_2_28_x86_64.whl ; python_version=='3.11' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-manylinux_2_28_x86_64.whl ; python_version=='3.12' and ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp39-cp39-win_amd64.whl ; python_version=='3.9' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp310-cp310-win_amd64.whl ; python_version=='3.10' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp311-cp311-win_amd64.whl ; python_version=='3.11' and (sys_platform == 'win32')",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.30-cp312-cp312-win_amd64.whl ; python_version=='3.12' and (sys_platform == 'win32')",
]
cu118onlytorch271 = [
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu118/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu126onlytorch271 = [
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu128onlytorch271 = [
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.31.post1-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu118onlytorch280 = [
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu126/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu126onlytorch280 = [
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu128/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu128onlytorch280 = [
"xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-manylinux_2_28_x86_64.whl ; ('linux' in sys_platform)",
"xformers @ https://download.pytorch.org/whl/cu129/xformers-0.0.32.post2-cp39-abi3-win_amd64.whl ; (sys_platform == 'win32')",
]
cu130onlytorch280 = [
]
cu126onlytorch290 = [
]
cu128onlytorch290 = [
]
cu130onlytorch290 = [
]
cu118 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118only]",
]
cu121 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121only]",
]
cu118-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu118onlytorch211]",
]
cu121-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu121onlytorch211]",
]
cu118-torch212 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu118onlytorch212]",
]
cu121-torch212 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu121onlytorch212]",
]
cu118-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch220]",
]
cu121-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch220]",
]
cu118-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch230]",
]
cu121-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch230]",
]
cu118-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch240]",
]
cu121-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch240]",
]
cu124-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch240]",
]
cu118-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch250]",
]
cu121-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch250]",
]
cu124-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch250]",
]
cu118-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch251]",
]
cu121-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch251]",
]
cu124-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch251]",
]
cu118-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch260]",
]
cu124-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch260]",
]
cu126-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch260]",
]
cu118-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch270]",
]
cu126-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch270]",
]
cu128-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch270]",
]
cu118-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch271]",
]
cu126-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch271]",
]
cu128-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch271]",
]
cu118-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch280]",
]
cu126-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch280]",
]
cu128-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch280]",
]
cu130-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu130onlytorch280]",
]
cu126-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch290]",
]
cu128-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch290]",
]
cu130-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu130onlytorch290]",
]
kaggle = [
"unsloth[huggingface]",
]
kaggle-new = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
]
conda = [
"unsloth[huggingface]",
]
colab-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu121onlytorch211]",
]
colab-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu121onlytorch211]",
"packaging",
"ninja",
"flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch220]",
]
colab-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch220]",
"packaging",
"ninja",
"flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-new = [
"unsloth_zoo>=2025.11.1",
"packaging",
"tyro",
"transformers>=4.51.3,!=4.52.0,!=4.52.1,!=4.52.2,!=4.52.3,!=4.53.0,!=4.54.0,!=4.55.0,!=4.55.1,!=4.57.0,<=4.57.2",
"datasets>=3.4.1,!=4.0.*,!=4.1.0",
"sentencepiece>=0.2.0",
"tqdm",
"psutil",
"wheel>=0.42.0",
"numpy",
"protobuf",
"huggingface_hub>=0.34.0",
"hf_transfer",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[triton]",
]
colab-no-deps = [
"accelerate>=0.34.1",
"trl>=0.18.2,!=0.19.0,<=0.24.0",
"peft>=0.7.1",
"xformers ; ('linux' in sys_platform or sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"protobuf",
]
colab = [
"unsloth[cu121]",
]
flashattention = [
"packaging ; ('linux' in sys_platform)",
"ninja ; ('linux' in sys_platform)",
"flash-attn>=2.6.3 ; ('linux' in sys_platform)",
]
colab-ampere = [
"unsloth[colab-ampere-torch220]",
"unsloth[flashattention]",
]
cu118-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118only]",
"unsloth[flashattention]",
]
cu121-ampere = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121only]",
"unsloth[flashattention]",
]
cu118-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu118onlytorch211]",
"unsloth[flashattention]",
]
cu121-ampere-torch211 = [
"unsloth[huggingface]",
"bitsandbytes==0.45.5",
"unsloth[cu121onlytorch211]",
"unsloth[flashattention]",
]
cu118-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch220]",
"unsloth[flashattention]",
]
cu121-ampere-torch220 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch220]",
"unsloth[flashattention]",
]
cu118-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch230]",
"unsloth[flashattention]",
]
cu121-ampere-torch230 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch230]",
"unsloth[flashattention]",
]
cu118-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch240]",
"unsloth[flashattention]",
]
cu121-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch240]",
"unsloth[flashattention]",
]
cu124-ampere-torch240 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch240]",
"unsloth[flashattention]",
]
cu118-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch250]",
"unsloth[flashattention]",
]
cu121-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch250]",
"unsloth[flashattention]",
]
cu124-ampere-torch250 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch250]",
"unsloth[flashattention]",
]
cu118-ampere-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch251]",
"unsloth[flashattention]",
]
cu121-ampere-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu121onlytorch251]",
"unsloth[flashattention]",
]
cu124-ampere-torch251 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch251]",
"unsloth[flashattention]",
]
cu118-ampere-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch260]",
"unsloth[flashattention]",
]
cu124-ampere-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu124onlytorch260]",
"unsloth[flashattention]",
]
cu126-ampere-torch260 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch260]",
"unsloth[flashattention]",
]
cu118-ampere-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch270]",
"unsloth[flashattention]",
]
cu126-ampere-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch270]",
"unsloth[flashattention]",
]
cu128-ampere-torch270 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch270]",
"unsloth[flashattention]",
]
cu118-ampere-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch271]",
"unsloth[flashattention]",
]
cu126-ampere-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch271]",
"unsloth[flashattention]",
]
cu128-ampere-torch271 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch271]",
"unsloth[flashattention]",
]
cu118-ampere-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu118onlytorch280]",
"unsloth[flashattention]",
]
cu126-ampere-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch280]",
"unsloth[flashattention]",
]
cu128-ampere-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch280]",
"unsloth[flashattention]",
]
cu130-ampere-torch280 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu130onlytorch280]",
"unsloth[flashattention]",
]
cu126-ampere-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu126onlytorch290]",
]
cu128-ampere-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu128onlytorch290]",
]
cu130-ampere-torch290 = [
"unsloth[huggingface]",
"bitsandbytes>=0.45.5,!=0.46.0,!=0.48.0",
"unsloth[cu130onlytorch290]",
]
flashattentiontorch260abiFALSEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch260abiTRUEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch250abiFALSEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch250abiTRUEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.5cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch240abiFALSEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiFALSE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
flashattentiontorch240abiTRUEcu12x = [
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp39-cp39-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.9'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp310-cp310-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.10'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp311-cp311-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.11'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp312-cp312-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.12'",
"flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.4cxx11abiTRUE-cp313-cp313-linux_x86_64.whl ; ('linux' in sys_platform) and python_version == '3.13'",
]
intelgputorch260 = [
"unsloth_zoo[intelgpu]",
"unsloth[huggingfacenotorch]",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp39-cp39-linux_x86_64.whl#sha256=147607f190a7d7aa24ba454def5977fbbfec792fdae18e4ed278cfec29b69271 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp310-cp310-linux_x86_64.whl#sha256=23aa423fa1542afc34f67eb3ba8ef20060f6d1b3a4697eaeab22b11c92b30f2b ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp311-cp311-linux_x86_64.whl#sha256=bcfa995229bbfd9ffd8d6c8d9f6428d393e876fa6e23ee3c20e3c0d73ca75ca5 ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp312-cp312-linux_x86_64.whl#sha256=bd340903d03470708df3442438acb8b7e08087ab9e61fbe349b2872bf9257ab0 ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.2.0-cp313-cp313-linux_x86_64.whl#sha256=814dccc8a07159e6eca74bed70091bc8fea2d9dd87b0d91845f9f38cde62f01c ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp39-cp39-linux_x86_64.whl#sha256=6a8adf6dc4c089406e8b3a7e58ab57a463bddf9b07130d2576e76eced43e92af ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=ff4561cbf07c83bbccaa0f6e9bb0e6dcf721bacd53c9c43c4eb0e7331b4792f9 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=12005f66b810ddd3ab93f86c4522bcfdd412cbd27fc9d189b661ff7509bc5e8a ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c4c5c67625cdacf35765c2b94e61fe166e3c3f4a14521b1212a59ad1b3eb0f2e ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.6.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=e6864f7a60a5ecc43d5d38f59a16e5dd132384f73dfd3a697f74944026038f7b ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
intel-gpu-torch260 = [
"unsloth[intelgputorch260]"
]
intelgputorch270 = [
"unsloth_zoo[intelgpu]",
"unsloth[huggingfacenotorch]",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=749a7098492c6a27b356c97149a4a62973b953eae60bc1b6259260974f344913 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=44362e80abd752471a08341093321955b066daa2cfb4810e73b8e3b240850f93 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=faa6b8c945a837a080f641bc8ccc77a98fa66980dcd7e62e715fd853737343fd ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=40f6fb65b345dc9a61813abe7ac9a585f2c9808f414d140cc2a5f11f53ee063c ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.3.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=9821fe059de58e827ffc6aa10d69369b16c2f8c2a988b86bef9c2c6e396ab3aa ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp39-cp39-linux_x86_64.whl#sha256=f8ee75e50fcbb37ed5b498299ca2264da99ab278a93fae2358e921e4a6e28273 ; ('linux' in sys_platform) and python_version == '3.9' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=d6fdc342961d98fdcd9d03dfd491a3208bb5f7fbb435841f8f72ce9fdcd2d026 ; ('linux' in sys_platform) and python_version == '3.10' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=74d07f9357df5cf2bf223ad3c84de16346bfaa0504f988fdd5590d3e177e5e86 ; ('linux' in sys_platform) and python_version == '3.11' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=c806d44aa2ca5d225629f6fbc6c994d5deaac2d2cde449195bc8e3522ddd219a ; ('linux' in sys_platform) and python_version == '3.12' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.7.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=25d8277b7f01d42e2e014ccbab57a2692b6ec4eff8dcf894eda1b297407cf97a ; ('linux' in sys_platform) and python_version == '3.13' and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
]
intel-gpu-torch270 = [
"unsloth[intelgputorch270]"
]
intelgputorch280 = [
"unsloth_zoo[intelgpu]",
"unsloth[huggingfacenotorch]",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=ac4d8e33986b1c3c5e48151640539272b2187e83016985853111b46fb82c3c94 ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=999fef4c1f711092b9d3086525920545df490de476ecebe899ffc777019ae17f ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=57b09c8c492985ff6a27cd3a22b08e8f7b96b407bd8030967b6efbb9f63b80cf ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=df4bb3282bac9a3b90231700077110d8680b338416de03c2b7c6133c9b602649 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.4.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=60da63c99ca827bdcb0df28e0298bf7d066dc607454c6d6176783cb4e79d838b ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp39-cp39-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp310-cp310-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp311-cp311-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp312-cp312-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.8.0%2Bxpu-cp313-cp313-linux_x86_64.whl ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'x86_64')",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp39-cp39-manylinux_2_28_x86_64.whl#sha256=6e981c192045fc249c008441179ff237bb00174d818b875b0475730b63f0eaca ; platform_system == 'Linux' and python_version == '3.9' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=e5ba4805969277175ebfd59cc717093528cc6e3ada89ac2725fc7a3c1fee6169 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=74c39c144104416bc4c5ad8c26ab0c169dc5cc6be58059e01bc3665dd0ef676f ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=0acec355b80c3899841184084f365df336c508602812e34a44007b8b60d53af4 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.23.0%2Bxpu-cp313-cp313-manylinux_2_28_x86_64.whl#sha256=e2109ae773dad27b98ca17681044b4f876563c37f2382b75de3a371399edcff8 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
]
intel-gpu-torch280 = [
"unsloth[intelgputorch280]"
]
intelgputorch290 = [
"unsloth_zoo[intelgpu]",
"unsloth[huggingfacenotorch]",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=c169a1de14c19673b17c751290d467fa282fc90fa5da4314b2e5cdab1f553146 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=013d9dd5d6479bd22983161f462e61c8dbe1d82e6730624a7a8d5945507eaa61 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=afc8cabfbf7ed51fd278d1e0f88d6afc157b0201bad4b99d681e4d542f9e66d4 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"pytorch_triton_xpu @ https://download.pytorch.org/whl/pytorch_triton_xpu-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl#sha256=0d24c1716088f2764d0d24c64227732195b6a42706c3c5fc89eeb4904bfa0818 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp310-cp310-linux_x86_64.whl#sha256=5afbe860ce991825a36b75706a523601087e414b77598ef0d9d3d565741c277d ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp311-cp311-linux_x86_64.whl#sha256=607fe419c32d6e8e0556f745742e7cff1d0babce51f54be890e0c1422359c442 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp312-cp312-linux_x86_64.whl#sha256=376bae584d89980b8e59934d248c38d5fa3b7d4687a4df1a19f4bc1d23dcc8c1 ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"torch @ https://download.pytorch.org/whl/xpu/torch-2.9.0%2Bxpu-cp313-cp313-linux_x86_64.whl#sha256=98d6a06dd7fb185874367b18bd609f05f16fdce4142a5980ca94461949965cd2 ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'x86_64')",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp310-cp310-manylinux_2_28_x86_64.whl#sha256=cbfae2b79b7549fd368c2462fc8e94f8f26cc450782ee72138e908077c09a519 ; platform_system == 'Linux' and python_version == '3.10' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp311-cp311-manylinux_2_28_x86_64.whl#sha256=044fa36ef4b6b43edcd490b75c853fa4b3eb033c2bded29f8fbcf27734713c67 ; platform_system == 'Linux' and python_version == '3.11' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=4b91e4bec1d740a6211f02578a79888550b73f3a4e1383035f8f6d72f587212c ; platform_system == 'Linux' and python_version == '3.12' and platform_machine == 'x86_64'",
"torchvision @ https://download.pytorch.org/whl/xpu/torchvision-0.24.0%2Bxpu-cp313-cp313-manylinux_2_28_x86_64.whl#sha256=88239e73ca37254bec84f29cd5887e10ff712de7edbbda3fbb3609cd6190d99e ; platform_system == 'Linux' and python_version == '3.13' and platform_machine == 'x86_64'",
]
intel-gpu-torch290 = [
"unsloth[intelgputorch290]"
]
intel = [
"unsloth[intelgputorch280]",
]
amd = [
"unsloth[huggingfacenotorch]",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_x86_64.whl ; ('linux' in sys_platform) and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-win_amd64.whl ; (sys_platform == 'win32') and (platform_machine == 'AMD64' or platform_machine == 'x86_64')",
"bitsandbytes @ https://github.com/bitsandbytes-foundation/bitsandbytes/releases/download/continuous-release_main/bitsandbytes-1.33.7.preview-py3-none-manylinux_2_24_aarch64.whl ; ('linux' in sys_platform) and (platform_machine == 'aarch64')",
]
[project.urls]
homepage = "http://www.unsloth.ai"
documentation = "https://github.com/unslothai/unsloth"
repository = "https://github.com/unslothai/unsloth"
```
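The optional-dependency extras above pair a CUDA toolkit build with a specific torch minor version, so the right install target follows directly from the local environment. The snippet below is a hypothetical helper (not part of the repository) that sketches how such an extra name could be derived from `torch.__version__` and `torch.version.cuda`; it assumes torch is already installed and that the resulting name actually appears in the table above.
```py
# Hypothetical sketch: derive an "unsloth[cuXYZ-torchABC]" extra name from the
# locally installed torch build. Purely illustrative -- the resulting extra must
# exist in the optional-dependency table above for the install to work.
import torch

def guess_unsloth_extra() -> str:
    cuda = torch.version.cuda  # e.g. "12.8"; None on CPU/XPU-only builds
    if cuda is None:
        raise RuntimeError("No CUDA build of torch detected; use the base extras instead.")
    cu_major, cu_minor = cuda.split(".")[:2]
    torch_tag = "".join(torch.__version__.split("+")[0].split(".")[:3])  # "2.8.0" -> "280"
    return f"cu{cu_major}{cu_minor}-torch{torch_tag}"

if __name__ == "__main__":
    print(f'pip install "unsloth[{guess_unsloth_extra()}]"')  # e.g. unsloth[cu128-torch280]
```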
## /tests/__init__.py
```py path="/tests/__init__.py"
```
## /tests/qlora/README.md
## QLoRA Train and Merge Tests
### Overview
Tests that performing QLoRA training and then merging the weights to 16 bits post-training preserves the behavior of the trained model.
- `test_unsloth_qlora_train_and_merge.py`: tests the Unsloth QLoRA train-and-merge flow using the `FastLanguageModel.from_pretrained`, `FastLanguageModel.get_peft_model`, and `FastLanguageModel.save_pretrained_merged` APIs.
- `test_hf_qlora_train_and_merge.py`: tests the Hugging Face QLoRA train-and-merge flow using the `from_pretrained`, `get_peft_model`, and `merge_and_unload` APIs.
- Demonstrates that `peft`'s `merge_and_unload` loses accuracy, because it requantizes the base layer after merging the adapter weights, so the model still contains `Linear4bit` layers after merging.
- I (@jeromeku) implemented a custom merge function that replaces each `LoraLayer` with a plain `Linear` layer whose weights are the dequantized base-layer weights with the adapter weights merged in (the computation is done in fp32 and cast back to the original dtype afterwards); this is roughly equivalent to `FastLanguageModel.save_pretrained_merged`. A sketch of the idea follows this list.
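The following is a simplified, hypothetical sketch of that custom merge idea, not the repository's `convert_lora_to_linear` helper from `tests/utils/hf_utils.py`. It assumes a `peft` model whose LoRA modules wrap bitsandbytes `Linear4bit` base layers and that the adapter is named `"default"`: dequantize each 4-bit base weight, add the LoRA update in fp32, cast back to the target dtype, and swap the whole LoRA wrapper for a plain `nn.Linear`.
```py
# Simplified, hypothetical sketch of merging QLoRA adapters into plain Linear layers.
# Assumes peft LoRA modules wrapping bitsandbytes Linear4bit base layers.
import torch
import torch.nn as nn
import bitsandbytes.functional as F
from peft.tuners.lora import LoraLayer

def merge_lora_to_linear(model: nn.Module, adapter: str = "default",
                         dtype: torch.dtype = torch.bfloat16) -> nn.Module:
    for name, module in list(model.named_modules()):
        if not isinstance(module, LoraLayer):
            continue
        base = module.get_base_layer()  # the bitsandbytes Linear4bit layer
        # Dequantize the 4-bit base weight and merge the adapter update in fp32.
        W = F.dequantize_4bit(base.weight.data, base.weight.quant_state).float()
        A = module.lora_A[adapter].weight.float()
        B = module.lora_B[adapter].weight.float()
        W += module.scaling[adapter] * (B @ A)
        merged = nn.Linear(base.in_features, base.out_features, bias=base.bias is not None)
        merged.weight.data = W.to(dtype)  # cast back to the training dtype
        if base.bias is not None:
            merged.bias.data = base.bias.data.clone()
        # Replace the whole LoRA wrapper with the merged dense layer.
        parent_name, _, child_name = name.rpartition(".")
        parent = model.get_submodule(parent_name) if parent_name else model
        setattr(parent, child_name, merged)
    return model
```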
### Usage
Run the Unsloth test:
```bash
python tests/qlora/test_unsloth_qlora_train_and_merge.py
```
Run the Hugging Face test:
```bash
python tests/qlora/test_hf_qlora_train_and_merge.py
```
### Details
The tests train a QLoRA model on a single-prompt dataset:
```
QUESTION = "What day was I born?"
ANSWER = "January 1, 2058"
USER_MESSAGE = {"role": "user", "content": QUESTION}
ASSISTANT_MESSAGE = {"role": "assistant", "content": ANSWER}
```
Since the question cannot be answered correctly without finetuning, the model should only produce the correct answer once it has been trained on it.
To verify this, we compare the model's responses to the question before training, after training, and after merging, and check that the response contains the answer after training and after merging but not before training.
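As a rough illustration of that check (hypothetical helper name; the actual logic lives in `check_responses` in `tests/utils/data_utils.py`), the pass/fail condition reduces to a containment test on the sampled responses:
```py
# Hypothetical illustration of the pass/fail condition used around training.
ANSWER = "January 1, 2058"

def answer_in_responses(responses: list[str], answer: str = ANSWER) -> bool:
    return any(answer in response for response in responses)

# Expected pattern:
#   before training           -> answer_in_responses(...) is False
#   after training / merging  -> answer_in_responses(...) is True
```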
### Results
For the Unsloth test, the model behaves as expected:
- before training, the model's response does not contain the answer
- after training, the model's response contains the answer
- after merging, the model's response contains the answer
For the Hugging Face test, the model also behaves as expected, including the accuracy loss from `merge_and_unload`:
- before training, the model's response does not contain the answer
- after training, the model's response contains the answer
- after using peft's `merge_and_unload`, the model's response does not contain the answer
- after using my custom merge function, the model's response contains the answer
The scripts print the training parameters, the training logs, and the model responses before training, after training, and after merging (model responses are only printed when the answer is not contained in the response).
## /tests/qlora/test_hf_qlora_train_and_merge.py
```py path="/tests/qlora/test_hf_qlora_train_and_merge.py"
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[2]
sys.path.append(str(REPO_ROOT))
import itertools
from copy import deepcopy
import torch
from datasets import Dataset
from trl import SFTConfig
from tests.utils import header_footer_context
from tests.utils.data_utils import (
ANSWER,
DEFAULT_MESSAGES,
USER_MESSAGE,
check_responses,
create_dataset,
describe_peft_weights,
)
from tests.utils.hf_utils import (
convert_lora_to_linear,
fix_llama3_tokenizer,
get_peft_config,
sample_responses,
setup_model,
setup_tokenizer,
setup_trainer,
)
if __name__ == "__main__":
model_name = "meta-llama/Llama-3.2-1B-Instruct"
dtype = torch.bfloat16
max_steps = 100
num_examples = 1000
lora_rank = 64
output_dir = "sft_test"
seed = 42
batch_size = 5
num_generations = 5
tokenizer = setup_tokenizer(model_name, fixup_funcs=[fix_llama3_tokenizer])
temperature = 0.8
max_new_tokens = 20
peft_config = get_peft_config(lora_rank=lora_rank, target_modules="all-linear")
model = setup_model(model_name, quantize=True, dtype=dtype, peft_config=peft_config)
prompt = tokenizer.apply_chat_template(
[USER_MESSAGE], tokenize=False, add_generation_prompt=True
)
with header_footer_context("Test Prompt and Answer"):
print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}")
dataset: Dataset = create_dataset(
tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES
)
with header_footer_context("Dataset"):
print(f"Dataset: {next(iter(dataset))}")
training_args = SFTConfig(
output_dir=output_dir,
max_steps=max_steps,
per_device_train_batch_size=batch_size,
log_level="info",
report_to="none",
num_train_epochs=1,
logging_steps=1,
seed=seed,
bf16=dtype == torch.bfloat16,
fp16=dtype == torch.float16,
save_strategy="no",
)
with header_footer_context("Train Args"):
print(training_args)
print(peft_config)
trainer = setup_trainer(
model, tokenizer, dataset, training_args, peft_config=peft_config
)
with header_footer_context("Model"):
print(type(model.model))
generation_args = {
"num_generations": num_generations,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"skip_special_tokens": False,
"dtype": dtype,
}
responses = sample_responses(
model,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses before training"):
check_responses(responses, answer=ANSWER, prompt=prompt)
with header_footer_context("Peft Weights before training"):
for name, stats in itertools.islice(describe_peft_weights(model), 2):
print(f"{name}:\n{stats}")
output = trainer.train()
with header_footer_context("Peft Weights after training"):
for name, stats in itertools.islice(describe_peft_weights(model), 2):
print(f"{name}:\n{stats}")
with header_footer_context("Trainer Output"):
print(output)
responses = sample_responses(
model,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses after training"):
check_responses(responses, answer=ANSWER, prompt=prompt)
model_copy = deepcopy(model)
merged_model = convert_lora_to_linear(model)
responses = sample_responses(
merged_model,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses after custom merging to 16bit"):
check_responses(responses, answer=ANSWER, prompt=prompt)
merged_model_peft = model_copy.merge_and_unload()
responses = sample_responses(
merged_model_peft,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses after peft merge_and_unload"):
check_responses(responses, answer=ANSWER, prompt=prompt)
```
## /tests/qlora/test_unsloth_qlora_train_and_merge.py
```py path="/tests/qlora/test_unsloth_qlora_train_and_merge.py"
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[2]
sys.path.append(str(REPO_ROOT))
import itertools
from unsloth import FastLanguageModel
import torch
from datasets import Dataset
from trl import SFTConfig
from tests.utils import header_footer_context
from tests.utils.data_utils import (
DEFAULT_MESSAGES,
USER_MESSAGE,
ANSWER,
create_dataset,
describe_peft_weights,
check_responses,
)
from tests.utils.hf_utils import (
sample_responses,
setup_trainer,
)
def get_unsloth_model_and_tokenizer(
model_name: str,
max_seq_length: int,
load_in_4bit: bool,
fast_inference: bool,
max_lora_rank: int = None,
gpu_memory_utilization: float = 0.5,
dtype: torch.dtype = torch.bfloat16,
):
return FastLanguageModel.from_pretrained(
model_name=model_name,
max_seq_length=max_seq_length,
load_in_4bit=load_in_4bit,
fast_inference=fast_inference,
max_lora_rank=max_lora_rank,
gpu_memory_utilization=gpu_memory_utilization,
dtype=dtype,
)
def get_unsloth_peft_model(
model,
lora_rank: int,
target_modules: list[str] = "all-linear",
use_gradient_checkpointing: str = False,
random_state: int = 42,
):
return FastLanguageModel.get_peft_model(
model,
r=lora_rank,
target_modules=target_modules,
lora_alpha=lora_rank,
use_gradient_checkpointing=use_gradient_checkpointing,
random_state=random_state,
)
if __name__ == "__main__":
model_name = "meta-llama/Llama-3.2-1B-Instruct"
dtype = torch.bfloat16
max_steps = 100
num_examples = 1000
lora_rank = 64
output_dir = "sft_test"
seed = 42
batch_size = 5
num_generations = 5
target_modules = [
"q_proj",
"k_proj",
"v_proj",
"o_proj",
"gate_proj",
"up_proj",
"down_proj",
]
gradient_checkpointing = False
unsloth_merged_path = "unsloth_merged_16bit"
model, tokenizer = get_unsloth_model_and_tokenizer(
model_name,
max_seq_length=512,
load_in_4bit=True,
fast_inference=False,
max_lora_rank=lora_rank,
dtype=dtype,
)
temperature = 0.8
max_new_tokens = 20
model = get_unsloth_peft_model(
model,
lora_rank=lora_rank,
target_modules=target_modules,
use_gradient_checkpointing=gradient_checkpointing,
random_state=seed,
)
prompt = tokenizer.apply_chat_template(
[USER_MESSAGE], tokenize=False, add_generation_prompt=True
)
with header_footer_context("Test Prompt and Answer"):
print(f"Test Prompt:\n{prompt}\nExpected Answer:\n{ANSWER}")
dataset: Dataset = create_dataset(
tokenizer, num_examples=num_examples, messages=DEFAULT_MESSAGES
)
with header_footer_context("Dataset"):
print(f"Dataset: {next(iter(dataset))}")
training_args = SFTConfig(
output_dir=output_dir,
max_steps=max_steps,
per_device_train_batch_size=batch_size,
log_level="info",
report_to="none",
num_train_epochs=1,
logging_steps=1,
seed=seed,
bf16=dtype == torch.bfloat16,
fp16=dtype == torch.float16,
save_strategy="no",
)
with header_footer_context("Train Args"):
print(training_args)
trainer = setup_trainer(model, tokenizer, dataset, training_args)
with header_footer_context("Model"):
print(type(model.model))
generation_args = {
"num_generations": num_generations,
"max_new_tokens": max_new_tokens,
"temperature": temperature,
"skip_special_tokens": False,
"dtype": dtype,
}
responses = sample_responses(
model,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses before training"):
check_responses(responses, answer=ANSWER, prompt=prompt)
with header_footer_context("Peft Weights before training"):
for name, stats in itertools.islice(describe_peft_weights(model), 2):
print(f"{name}:\n{stats}")
output = trainer.train()
with header_footer_context("Peft Weights after training"):
for name, stats in itertools.islice(describe_peft_weights(model), 2):
print(f"{name}:\n{stats}")
with header_footer_context("Trainer Output"):
print(output)
responses = sample_responses(
model,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses after training"):
check_responses(responses, answer=ANSWER, prompt=prompt)
model.save_pretrained_merged(
unsloth_merged_path,
tokenizer,
save_method="merged_16bit",
)
merged_model_unsloth, tokenizer = get_unsloth_model_and_tokenizer(
unsloth_merged_path,
max_seq_length=512,
load_in_4bit=False,
fast_inference=False,
dtype=dtype,
)
responses = sample_responses(
merged_model_unsloth,
tokenizer,
prompt=prompt,
**generation_args,
)
with header_footer_context("Responses after unsloth merge to 16bit"):
check_responses(responses, answer=ANSWER, prompt=prompt)
```
## /tests/saving/gpt-oss-merge/run_test.sh
```sh path="/tests/saving/gpt-oss-merge/run_test.sh"
#!/bin/bash
set -e
echo "================================================================"
echo "🚀 STEP 1: Running the training and merging script..."
echo "================================================================"
python train_and_merge.py
echo ""
echo "================================================================"
echo "✅ STEP 2: Training complete. Running the inference script..."
echo "================================================================"
python test_merged_model.py
echo ""
echo "================================================================"
echo "🎉 All steps completed successfully!"
echo "================================================================"
```
## /tests/saving/gpt-oss-merge/test_merged_model.py
```py path="/tests/saving/gpt-oss-merge/test_merged_model.py"
# inference_on_merged.py
from unsloth import FastLanguageModel
from transformers import TextStreamer
import torch
import gc
import os
import shutil
def safe_remove_directory(path):
try:
if os.path.exists(path) and os.path.isdir(path):
shutil.rmtree(path)
return True
else:
print(f"Path {path} is not a valid directory")
return False
except Exception as e:
print(f"Failed to remove directory {path}: {e}")
return False
pass
print("🔥 Loading the 16-bit merged model from disk...")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./gpt-oss-finetuned-merged",
max_seq_length=1024,
load_in_4bit=True,
load_in_8bit=False,
)
print("✅ Merged model loaded successfully.")
# --- Run Inference ---
print("\n🚀 Running inference...")
messages = [
{"role": "user", "content": "Solve x^5 + 3x^4 - 10 = 3."},
]
inputs = merged_tokenizer.apply_chat_template(
messages,
add_generation_prompt = True,
return_tensors = "pt",
return_dict = True,
reasoning_effort = "low", # **NEW!** Set reasoning effort to low, medium or high
).to(merged_model.device)
_ = merged_model.generate(**inputs, max_new_tokens = 512, streamer = TextStreamer(merged_tokenizer))
print("\n✅ Inference complete.")
# --- Final Cleanup ---
print("\n🧹 Cleaning up merged model directory and cache...")
del merged_model, merged_tokenizer
torch.cuda.empty_cache()
gc.collect()
safe_remove_directory("./gpt-oss-finetuned-merged")
safe_remove_directory("./unsloth_compiled_cache") # Clean up cache created by this process
print("✅ Final cleanup complete. Exiting inference script.")
```
## /tests/saving/gpt-oss-merge/train_and_merge.py
```py path="/tests/saving/gpt-oss-merge/train_and_merge.py"
# train_and_merge.py
from unsloth import FastLanguageModel
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
import torch
import gc
import os
import shutil
def safe_remove_directory(path):
try:
if os.path.exists(path) and os.path.isdir(path):
shutil.rmtree(path)
return True
else:
print(f"Path {path} is not a valid directory")
return False
except Exception as e:
print(f"Failed to remove directory {path}: {e}")
return False
pass
# This tokenizer will be used by the mapping function
tokenizer = None
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
# --- Load 4-bit Model and Train ---
print("Loading 4-bit Mxfp4 gpt-oss model for training...")
max_seq_length = 1024
model, tokenizer = FastLanguageModel.from_pretrained(
"unsloth/gpt-oss-20b", max_seq_length=max_seq_length, load_in_4bit=True
)
dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train[:50]").map(
formatting_prompts_func, batched=True
)
model = FastLanguageModel.get_peft_model(
model, r=8, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
lora_alpha=16, use_gradient_checkpointing="unsloth", random_state=3407,
)
trainer = SFTTrainer(
model=model, tokenizer=tokenizer, train_dataset=dataset,
args=SFTConfig(
per_device_train_batch_size=1, gradient_accumulation_steps=4, max_steps=10,
learning_rate=2e-4, output_dir="outputs", report_to="none",
),
)
print("Starting fine-tuning...")
trainer.train()
print("Fine-tuning complete.")
# --- Merge and Save ---
print("\n💾 Merging and saving the 16-bit model to './gpt-oss-finetuned-merged'...")
model.save_pretrained_merged(save_directory="./gpt-oss-finetuned-merged", tokenizer=tokenizer)
print("✅ Model merged and saved.")
# --- Cleanup ---
print("\n🧹 Cleaning up training artifacts...")
del model, trainer, tokenizer, dataset
torch.cuda.empty_cache()
gc.collect()
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache") # Clean up the cache created by this process
print("✅ Cleanup complete. Exiting training script.")
```
## /tests/saving/language_models/test_merge_4bit_validation.py
```py path="/tests/saving/language_models/test_merge_4bit_validation.py"
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset
import torch
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model and Initial Training")
print(f"{'='*80}")
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Llama-3.1-8B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="llama-3.1",
)
# Load small dataset for quick training
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train[:100]")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
print("✅ Base model loaded successfully!")
print(f"\n{'='*80}")
print("🔍 PHASE 2: First Fine-tuning")
print(f"{'='*80}")
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=10, # Very short training for test
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=5,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
trainer_stats = trainer.train()
print("✅ First fine-tuning completed!")
print(f"\n{'='*80}")
print("🔍 PHASE 3: Save with Forced 4bit Merge")
print(f"{'='*80}")
model.save_pretrained_merged(
save_directory='./test_4bit_model',
tokenizer=tokenizer,
save_method="forced_merged_4bit"
)
print("✅ Model saved with forced 4bit merge!")
print(f"\n{'='*80}")
print("🔍 PHASE 4: Loading 4bit Model and Second Fine-tuning")
print(f"{'='*80}")
# Clean up first model
del model
del tokenizer
torch.cuda.empty_cache()
# Load the 4bit merged model
model_4bit, tokenizer_4bit = FastLanguageModel.from_pretrained(
model_name="./test_4bit_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
tokenizer_4bit = get_chat_template(
tokenizer_4bit,
chat_template="llama-3.1",
)
print("✅ 4bit model loaded successfully!")
# Add LoRA adapters to the 4bit model
model_4bit = FastLanguageModel.get_peft_model(
model_4bit,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
# Second fine-tuning
trainer_4bit = SFTTrainer(
model=model_4bit,
tokenizer=tokenizer_4bit,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer_4bit),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=10, # Very short training for test
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=5,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs_4bit",
report_to="none",
),
)
trainer_4bit.train()
print("✅ Second fine-tuning on 4bit model completed!")
print(f"\n{'='*80}")
print("🔍 PHASE 5: Testing TypeError on Regular Merge (Should Fail)")
print(f"{'='*80}")
try:
model_4bit.save_pretrained_merged(
save_directory='./test_should_fail',
tokenizer=tokenizer_4bit
# No save_method specified, should default to regular merge
)
assert False, "Expected TypeError but merge succeeded!"
except TypeError as e:
expected_error = "Base model should be a 16bits or mxfp4 base model for a 16bit model merge. Use `save_method=forced_merged_4bit` instead"
assert expected_error in str(e), f"Unexpected error message: {str(e)}"
print("✅ Correct TypeError raised for 4bit base model regular merge attempt!")
print(f"Error message: {str(e)}")
print(f"\n{'='*80}")
print("🔍 PHASE 6: Successful Save with Forced 4bit Method")
print(f"{'='*80}")
try:
model_4bit.save_pretrained_merged(
save_directory='./test_4bit_second',
tokenizer=tokenizer_4bit,
save_method="forced_merged_4bit"
)
print("✅ Successfully saved 4bit model with forced 4bit method!")
except Exception as e:
assert False, f"Phase 6 failed unexpectedly: {e}"
print(f"\n{'='*80}")
print("🔍 CLEANUP")
print(f"{'='*80}")
# Cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./outputs_4bit")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./test_4bit_model")
safe_remove_directory("./test_4bit_second")
safe_remove_directory("./test_should_fail")
print("✅ All tests passed successfully!")
```
## /tests/saving/language_models/test_merge_model_perplexity_llama-3.2.py
```py path="/tests/saving/language_models/test_merge_model_perplexity_llama-3.2.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Define helper functions outside of main
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
"""Load model and compute perplexity in subprocess"""
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from tests.utils.perplexity_eval import ppl_model
# Load model
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=load_in_4bit,
load_in_8bit=load_in_8bit,
)
# Set up tokenizer
merged_tokenizer = get_chat_template(
merged_tokenizer,
chat_template="llama-3.1",
)
# Load dataset fresh in subprocess
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
# Format the dataset
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
# Compute perplexity using the passed dataset
ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)
# IMPORTANT: Convert to Python float if it's a tensor
if torch.is_tensor(ppl_value):
ppl_value = ppl_value.cpu().item() # Move to CPU and convert to Python scalar
elif hasattr(ppl_value, 'item'):
ppl_value = ppl_value.item() # Convert numpy or other array types
else:
ppl_value = float(ppl_value) # Ensure it's a float
# Return only the perplexity value
result_queue.put(ppl_value)
# Clean up
del merged_model
del merged_tokenizer
del dataset_ppl
torch.cuda.empty_cache()
gc.collect()
# Main execution code should be wrapped in this guard
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Llama-3.2-3B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="llama-3.1",
)
from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=10,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)
# run training
trainer_stats = trainer.train()
add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))
# saving and merging the model to local disk
print("merge and save to local disk")
model.save_pretrained_merged(
save_directory='./unsloth_out/merged_llama_text_model',
tokenizer=tokenizer
)
# print("cleaning")
# del model
# del tokenizer
# torch.cuda.empty_cache()
# gc.collect()
# load model from local disk and test
print("Loading merged model in 4 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print("Computing 8-bit model perplexity in subprocess...")
result_queue = mp.Queue()
p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
p.start()
p.join()
ppl_8bit = result_queue.get()
add_to_comparison("merged model loaded 8bits", ppl_8bit)
print("Loading merged model in 16 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=False,
load_in_8bit=False,
)
add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print_model_comparison()
# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./unsloth_out")
```
## /tests/saving/language_models/test_merge_model_perplexity_mistral.py
```py path="/tests/saving/language_models/test_merge_model_perplexity_mistral.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
"""Load model and compute perplexity in subprocess"""
from unsloth import FastLanguageModel
from tests.utils.perplexity_eval import ppl_model
# Load model
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_mistral_text_model",
max_seq_length=2048,
load_in_4bit=load_in_4bit,
load_in_8bit=load_in_8bit,
)
# Set up tokenizer
# merged_tokenizer = get_chat_template(
# merged_tokenizer,
# chat_template="llama-3.1",
# )
# Load dataset fresh in subprocess
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
EOS_TOKEN = merged_tokenizer.eos_token
def formatting_prompts_func(examples):
instructions = []
inputs = []
outputs = []
texts = []
for conversation in examples["messages"]:
# Extract user message and assistant response
user_message = ""
assistant_message = ""
for turn in conversation:
if turn["role"] == "user":
user_message = turn["content"]
elif turn["role"] == "assistant":
assistant_message = turn["content"]
# Store intermediate format
instruction = "Complete the statement"
instructions.append(instruction)
inputs.append(user_message)
outputs.append(assistant_message)
# Create formatted text
text = alpaca_prompt.format(instruction, user_message, assistant_message) + EOS_TOKEN
texts.append(text)
return {
"instruction": instructions,
"input": inputs,
"output": outputs,
"text": texts
}
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
# Compute perplexity using the passed dataset
ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)
# IMPORTANT: Convert to Python float if it's a tensor
if torch.is_tensor(ppl_value):
ppl_value = ppl_value.cpu().item() # Move to CPU and convert to Python scalar
elif hasattr(ppl_value, 'item'):
ppl_value = ppl_value.item() # Convert numpy or other array types
else:
ppl_value = float(ppl_value) # Ensure it's a float
# Return only the perplexity value
result_queue.put(ppl_value)
# Clean up
del merged_model
del merged_tokenizer
del dataset_ppl
torch.cuda.empty_cache()
gc.collect()
# Main execution code should be wrapped in this guard
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/mistral-7b-v0.3",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
EOS_TOKEN = tokenizer.eos_token
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
# Define helper functions outside of main
def formatting_prompts_func(examples):
instructions = []
inputs = []
outputs = []
texts = []
for conversation in examples["messages"]:
# Extract user message and assistant response
user_message = ""
assistant_message = ""
for turn in conversation:
if turn["role"] == "user":
user_message = turn["content"]
elif turn["role"] == "assistant":
assistant_message = turn["content"]
# Store intermediate format
instruction = "Complete the statement"
instructions.append(instruction)
inputs.append(user_message)
outputs.append(assistant_message)
# Create formatted text
text = alpaca_prompt.format(instruction, user_message, assistant_message) + EOS_TOKEN
texts.append(text)
return {
"instruction": instructions,
"input": inputs,
"output": outputs,
"text": texts
}
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=200,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
# run training
trainer_stats = trainer.train()
add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))
# saving and merging the model to local disk
print("merge and save to local disk")
model.save_pretrained_merged(
save_directory='./unsloth_out/merged_mistral_text_model',
tokenizer=tokenizer
)
# print("cleaning")
# del model
# del tokenizer
# torch.cuda.empty_cache()
# gc.collect()
# load model from local disk and test
print("Loading merged model in 4 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_mistral_text_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print("Computing 8-bit model perplexity in subprocess...")
result_queue = mp.Queue()
p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
p.start()
p.join()
ppl_8bit = result_queue.get()
add_to_comparison("merged model loaded 8bits", ppl_8bit)
print("Loading merged model in 16 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_mistral_text_model",
max_seq_length=2048,
load_in_4bit=False,
load_in_8bit=False,
)
add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print_model_comparison()
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./unsloth_out")
```
## /tests/saving/language_models/test_merge_model_perplexity_phi_4.py
```py path="/tests/saving/language_models/test_merge_model_perplexity_phi_4.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Define helper functions outside of main
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [
tokenizer.apply_chat_template(
convo, tokenize = False, add_generation_prompt = False
)
for convo in convos
]
return { "text" : texts, }
def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
"""Load model and compute perplexity in subprocess"""
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from tests.utils.perplexity_eval import ppl_model
# Load model
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_phi4_text_model",
max_seq_length=2048,
load_in_4bit=load_in_4bit,
load_in_8bit=load_in_8bit,
)
# Set up tokenizer
merged_tokenizer = get_chat_template(
merged_tokenizer,
chat_template="phi-4",
)
# Load dataset fresh in subprocess
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
# Format the dataset
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
# Compute perplexity using the passed dataset
ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)
# IMPORTANT: Convert to Python float if it's a tensor
if torch.is_tensor(ppl_value):
ppl_value = ppl_value.cpu().item() # Move to CPU and convert to Python scalar
elif hasattr(ppl_value, 'item'):
ppl_value = ppl_value.item() # Convert numpy or other array types
else:
ppl_value = float(ppl_value) # Ensure it's a float
# Return only the perplexity value
result_queue.put(ppl_value)
# Clean up
del merged_model
del merged_tokenizer
del dataset_ppl
torch.cuda.empty_cache()
gc.collect()
# Main execution code should be wrapped in this guard
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Phi-4",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="phi-4",
)
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=200,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part="<|im_start|>user<|im_sep|>\n\n",
response_part="<|im_start|>assistant<|im_sep|>\n\n",
)
# run training
trainer_stats = trainer.train()
add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))
# saving and merging the model to local disk
print("merge and save to local disk")
model.save_pretrained_merged(
save_directory='./unsloth_out/merged_phi4_text_model',
tokenizer=tokenizer
)
# print("cleaning")
# del model
# del tokenizer
# torch.cuda.empty_cache()
# gc.collect()
# load model from local disk and test
print("Loading merged model in 4 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_phi4_text_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print("Computing 8-bit model perplexity in subprocess...")
result_queue = mp.Queue()
p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
p.start()
p.join()
ppl_8bit = result_queue.get()
add_to_comparison("merged model loaded 8bits", ppl_8bit)
print("Loading merged model in 16 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_phi4_text_model",
max_seq_length=2048,
load_in_4bit=False,
load_in_8bit=False,
)
add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print_model_comparison()
# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./unsloth_out")
```
## /tests/saving/language_models/test_merged_model_perplexity_llama-3.1-8b.py
```py path="/tests/saving/language_models/test_merged_model_perplexity_llama-3.1-8b.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Define helper functions outside of main
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
"""Load model and compute perplexity in subprocess"""
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from tests.utils.perplexity_eval import ppl_model
# Load model
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=load_in_4bit,
load_in_8bit=load_in_8bit,
)
# Set up tokenizer
merged_tokenizer = get_chat_template(
merged_tokenizer,
chat_template="llama-3.1",
)
# Load dataset fresh in subprocess
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
# Format the dataset
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [merged_tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
# Compute perplexity using the passed dataset
ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)
# IMPORTANT: Convert to Python float if it's a tensor
if torch.is_tensor(ppl_value):
ppl_value = ppl_value.cpu().item() # Move to CPU and convert to Python scalar
elif hasattr(ppl_value, 'item'):
ppl_value = ppl_value.item() # Convert numpy or other array types
else:
ppl_value = float(ppl_value) # Ensure it's a float
# Return only the perplexity value
result_queue.put(ppl_value)
# Clean up
del merged_model
del merged_tokenizer
del dataset_ppl
torch.cuda.empty_cache()
gc.collect()
# Main execution code should be wrapped in this guard
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Llama-3.1-8B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="llama-3.1",
)
from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
print("\n dataset sample [0]")
print(dataset_train[0])
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=200,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)
tokenizer.decode(trainer.train_dataset[0]["input_ids"])
# run training
trainer_stats = trainer.train()
add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))
# saving and merging the model to local disk
print("merge and save to local disk")
model.save_pretrained_merged(
save_directory='./unsloth_out/merged_llama_text_model',
tokenizer=tokenizer
)
# print("cleaning")
# del model
# del tokenizer
# torch.cuda.empty_cache()
# gc.collect()
# load model from local disk and test
print("Loading merged model in 4 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print("Computing 8-bit model perplexity in subprocess...")
result_queue = mp.Queue()
p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
p.start()
p.join()
ppl_8bit = result_queue.get()
add_to_comparison("merged model loaded 8bits", ppl_8bit)
print("Loading merged model in 16 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_llama_text_model",
max_seq_length=2048,
load_in_4bit=False,
load_in_8bit=False,
)
add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print_model_comparison()
# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./unsloth_out")
```
## /tests/saving/language_models/test_merged_model_perplexity_qwen_2.5.py
```py path="/tests/saving/language_models/test_merged_model_perplexity_qwen_2.5.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
# Define helper functions outside of main
def formatting_prompts_func(examples):
instructions = []
inputs = []
outputs = []
texts = []
for conversation in examples["messages"]:
# Extract user message and assistant response
user_message = ""
assistant_message = ""
for turn in conversation:
if turn["role"] == "user":
user_message = turn["content"]
elif turn["role"] == "assistant":
assistant_message = turn["content"]
# Store intermediate format
instruction = "Complete the statement"
instructions.append(instruction)
inputs.append(user_message)
outputs.append(assistant_message)
# Create formatted text
text = alpaca_prompt.format(instruction, user_message, assistant_message)
texts.append(text)
return {
"instruction": instructions,
"input": inputs,
"output": outputs,
"text": texts
}
def load_and_compute_8bit_ppl(result_queue, load_in_4bit=False, load_in_8bit=False):
"""Load model and compute perplexity in subprocess"""
from unsloth import FastLanguageModel
from tests.utils.perplexity_eval import ppl_model
# Load model
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_qwen_text_model",
max_seq_length=2048,
load_in_4bit=load_in_4bit,
load_in_8bit=load_in_8bit,
)
# Set up tokenizer
# merged_tokenizer = get_chat_template(
# merged_tokenizer,
# chat_template="llama-3.1",
# )
# Load dataset fresh in subprocess
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
### Instruction:
{}
### Input:
{}
### Response:
{}"""
def formatting_prompts_func(examples):
instructions = []
inputs = []
outputs = []
texts = []
for conversation in examples["messages"]:
# Extract user message and assistant response
user_message = ""
assistant_message = ""
for turn in conversation:
if turn["role"] == "user":
user_message = turn["content"]
elif turn["role"] == "assistant":
assistant_message = turn["content"]
# Store intermediate format
instruction = "Complete the statement"
instructions.append(instruction)
inputs.append(user_message)
outputs.append(assistant_message)
# Create formatted text
text = alpaca_prompt.format(instruction, user_message, assistant_message)
texts.append(text)
return {
"instruction": instructions,
"input": inputs,
"output": outputs,
"text": texts
}
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
# Compute perplexity using the passed dataset
ppl_value = ppl_model(merged_model, merged_tokenizer, dataset_ppl)
# IMPORTANT: Convert to Python float if it's a tensor
if torch.is_tensor(ppl_value):
ppl_value = ppl_value.cpu().item() # Move to CPU and convert to Python scalar
elif hasattr(ppl_value, 'item'):
ppl_value = ppl_value.item() # Convert numpy or other array types
else:
ppl_value = float(ppl_value) # Ensure it's a float
# Return only the perplexity value
result_queue.put(ppl_value)
# Clean up
# del merged_model
# del merged_tokenizer
# del dataset_ppl
# torch.cuda.empty_cache()
# gc.collect()
# Main execution code should be wrapped in this guard
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Qwen2.5-7B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=200,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
# run training
trainer_stats = trainer.train()
add_to_comparison("Qlora model", ppl_model(model, tokenizer, dataset_ppl))
# saving and merging the model to local disk
print("merge and save to local disk")
model.save_pretrained_merged(
save_directory='./unsloth_out/merged_qwen_text_model',
tokenizer=tokenizer
)
# print("cleaning")
# del model
# del tokenizer
# torch.cuda.empty_cache()
# gc.collect()
# load model from local disk and test
print("Loading merged model in 4 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_qwen_text_model",
max_seq_length=2048,
load_in_4bit=True,
load_in_8bit=False,
)
add_to_comparison("merged model load 4bit", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print("Computing 8-bit model perplexity in subprocess...")
result_queue = mp.Queue()
p = mp.Process(target=load_and_compute_8bit_ppl, args=(result_queue, False, True))
p.start()
p.join()
ppl_8bit = result_queue.get()
add_to_comparison("merged model loaded 8bits", ppl_8bit)
print("Loading merged model in 16 bit for perplexity test")
merged_model, merged_tokenizer = FastLanguageModel.from_pretrained(
model_name="./unsloth_out/merged_qwen_text_model",
max_seq_length=2048,
load_in_4bit=False,
load_in_8bit=False,
)
add_to_comparison("merged model loaded 16bits", ppl_model(merged_model, merged_tokenizer, dataset_ppl))
print_model_comparison()
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./unsloth_out")
```
## /tests/saving/language_models/test_push_to_hub_merged.py
```py path="/tests/saving/language_models/test_push_to_hub_merged.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
import os
from huggingface_hub import HfFileSystem, hf_hub_download
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Define helper functions outside of main
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Llama-3.2-1B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="llama-3.1",
)
from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=30,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)
# run training
trainer_stats = trainer.train()
# saving and merging the model to local disk
hf_username = os.environ.get("HF_USER", "")
if not hf_username:
hf_username = input("Please enter your Hugging Face username: ").strip()
os.environ["HF_USER"] = hf_username
hf_token = os.environ.get("HF_TOKEN", "")
if not hf_token:
hf_token = input("Please enter your Hugging Face token: ").strip()
os.environ["HF_TOKEN"] = hf_token
repo_name = f"{hf_username}/merged_llama_text_model"
success = {
"upload": False,
"download": False,
}
# Stage 1: Upload model to Hub
try:
print("\n" + "=" * 80)
print("=== UPLOADING MODEL TO HUB ===".center(80))
print("=" * 80 + "\n")
model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token)
success["upload"] = True
print("✅ Model uploaded successfully!")
except Exception as e:
print(f"❌ Failed to upload model: {e}")
raise Exception("Model upload failed.")
# Stage 2: Test downloading the model (even if cached)
safe_remove_directory(f"./{hf_username}")
try:
print("\n" + "=" * 80)
print("=== TESTING MODEL DOWNLOAD ===".center(80))
print("=" * 80 + "\n")
# Force download even if cached
model,tokenizer = FastLanguageModel.from_pretrained(f"{hf_username}/merged_llama_text_model")
success["download"] = True
print("✅ Model downloaded successfully!")
except Exception as e:
print(f"❌ Download failed: {e}")
raise Exception("Model download failed.")
# Final report
print("\n" + "=" * 80)
print("=== VALIDATION REPORT ===".center(80))
print("=" * 80 + "\n")
for stage, passed in success.items():
status = "✓" if passed else "✗"
print(f"{status} {stage.replace('_', ' ').title()}")
print("\n" + "=" * 80)
if all(success.values()):
print("\n🎉 All stages completed successfully!")
else:
raise Exception("Validation failed for one or more stages.")
# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
```
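The test above boils down to one round trip: merge the LoRA adapters, push the merged checkpoint to the Hub, then pull it back down to prove the uploaded artifact is loadable. A condensed sketch of that flow, assuming `HF_USER` and `HF_TOKEN` are set in the environment and using a hypothetical repo name:

```python
import os
from unsloth import FastLanguageModel

# Load a small 4-bit base model and attach LoRA adapters (mirrors the test above).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=2048,
    load_in_4bit=True,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
)

# Merge the adapters into the base weights and upload to a (hypothetical) Hub repo.
repo_name = f"{os.environ['HF_USER']}/merged_llama_text_model"
model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=os.environ["HF_TOKEN"])

# Re-download the merged checkpoint to confirm it loads end to end.
model, tokenizer = FastLanguageModel.from_pretrained(repo_name)
```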
## /tests/saving/language_models/test_push_to_hub_merged_sharded_index_file.py
```py path="/tests/saving/language_models/test_push_to_hub_merged_sharded_index_file.py"
from unsloth import FastLanguageModel, FastVisionModel, UnslothVisionDataCollator
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer, SFTConfig
from transformers import DataCollatorForLanguageModeling, DataCollatorForSeq2Seq, TrainingArguments
from datasets import load_dataset, Dataset
import torch
from tqdm import tqdm
import pandas as pd
import multiprocessing as mp
from multiprocessing import Process, Queue
import gc
import os
from huggingface_hub import HfFileSystem, hf_hub_download
# ruff: noqa
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Define helper functions outside of main
def formatting_prompts_func(examples):
convos = examples["messages"]
texts = [tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False) for convo in convos]
return {"text": texts}
if torch.cuda.is_bf16_supported():
compute_dtype = torch.bfloat16
attn_implementation = 'flash_attention_2'
else:
compute_dtype = torch.float16
attn_implementation = 'sdpa'
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/Llama-3.1-8B-Instruct",
max_seq_length=2048,
dtype=compute_dtype,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
attn_implementation=attn_implementation
)
tokenizer = get_chat_template(
tokenizer,
chat_template="llama-3.1",
)
from unsloth.chat_templates import standardize_sharegpt
dataset_train = load_dataset("allenai/openassistant-guanaco-reformatted", split="train")
dataset_ppl = load_dataset("allenai/openassistant-guanaco-reformatted", split="eval")
dataset_train = dataset_train.map(formatting_prompts_func, batched=True)
dataset_ppl = dataset_ppl.map(formatting_prompts_func, batched=True)
add_to_comparison("Base model 4 bits", ppl_model(model, tokenizer, dataset_ppl))
model = FastLanguageModel.get_peft_model(
model,
r=16,
target_modules=['k_proj', 'q_proj', 'v_proj', 'o_proj', "gate_proj", "down_proj", "up_proj"],
lora_alpha=16,
lora_dropout=0,
bias="none",
use_gradient_checkpointing="unsloth",
random_state=3407,
use_rslora=False,
loftq_config=None,
)
from unsloth import is_bfloat16_supported
trainer = SFTTrainer(
model=model,
tokenizer=tokenizer,
train_dataset=dataset_train,
dataset_text_field="text",
max_seq_length=2048,
data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
dataset_num_proc=2,
packing=False,
args=TrainingArguments(
per_device_train_batch_size=2,
gradient_accumulation_steps=4,
warmup_ratio=0.1,
max_steps=30,
learning_rate=2e-4,
fp16=not is_bfloat16_supported(),
bf16=is_bfloat16_supported(),
logging_steps=50,
optim="adamw_8bit",
lr_scheduler_type="linear",
seed=3407,
output_dir="outputs",
report_to="none",
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)
# run training
trainer_stats = trainer.train()
# saving and merging the model to local disk
hf_username = os.environ.get("HF_USER", "")
if not hf_username:
hf_username = input("Please enter your Hugging Face username: ").strip()
os.environ["HF_USER"] = hf_username
hf_token = os.environ.get("HF_TOKEN", "")
if not hf_token:
hf_token = input("Please enter your Hugging Face token: ").strip()
os.environ["HF_TOKEN"] = hf_token
repo_name = f"{hf_username}/merged_llama_text_model"
success = {
"upload": False,
"safetensors_check": False,
"download": False,
}
# Stage 1: Upload model to Hub
try:
print("\n" + "=" * 80)
print("=== UPLOADING MODEL TO HUB ===".center(80))
print("=" * 80 + "\n")
model.push_to_hub_merged(repo_name, tokenizer=tokenizer, token=hf_token)
success["upload"] = True
print("✅ Model uploaded successfully!")
except Exception as e:
print(f"❌ Failed to upload model: {e}")
raise Exception("Model upload failed.")
# Stage 2: Verify safetensors.index.json exists
try:
print("\n" + "=" * 80)
print("=== VERIFYING REPO CONTENTS ===".center(80))
print("=" * 80 + "\n")
fs = HfFileSystem(token=hf_token)
file_list = fs.ls(repo_name, detail=True)
safetensors_found = any(
file["name"].endswith("model.safetensors.index.json") for file in file_list
)
if safetensors_found:
success["safetensors_check"] = True
print("✅ model.safetensors.index.json found in repo!")
else:
raise Exception("model.safetensors.index.json not found in repo.")
except Exception as e:
print(f"❌ Verification failed: {e}")
raise Exception("Repo verification failed.")
# Stage 3: Test downloading the model (even if cached)
safe_remove_directory("./RTannous")
try:
print("\n" + "=" * 80)
print("=== TESTING MODEL DOWNLOAD ===".center(80))
print("=" * 80 + "\n")
# Force download even if cached
model,tokenizer = FastLanguageModel.from_pretrained(f"{hf_username}/merged_llama_text_model")
success["download"] = True
print("✅ Model downloaded successfully!")
except Exception as e:
print(f"❌ Download failed: {e}")
raise Exception("Model download failed.")
# Final report
print("\n" + "=" * 80)
print("=== VALIDATION REPORT ===".center(80))
print("=" * 80 + "\n")
for stage, passed in success.items():
status = "✓" if passed else "✗"
print(f"{status} {stage.replace('_', ' ').title()}")
print("\n" + "=" * 80)
if all(success.values()):
print("\n🎉 All stages completed successfully!")
else:
raise Exception("Validation failed for one or more stages.")
# final cleanup
safe_remove_directory("./outputs")
safe_remove_directory("./unsloth_compiled_cache")
```
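What distinguishes this variant from the previous one is the extra stage that checks the merged 8B upload is sharded, i.e. that the repo contains a `model.safetensors.index.json`. A minimal sketch of that check, factored into a hypothetical helper around the same `HfFileSystem` call the test uses:

```python
from typing import Optional
from huggingface_hub import HfFileSystem

def has_sharded_index(repo_id: str, token: Optional[str] = None) -> bool:
    """Return True if the Hub repo contains a model.safetensors.index.json file."""
    fs = HfFileSystem(token=token)
    files = fs.ls(repo_id, detail=True)
    return any(f["name"].endswith("model.safetensors.index.json") for f in files)
```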
## /tests/saving/language_models/test_save_merged_grpo_model.py
```py path="/tests/saving/language_models/test_save_merged_grpo_model.py"
# -*- coding: utf-8 -*-
"""test_Llama3_1_(3B)_GRPO_LoRA (1).ipynb
### Unsloth
"""
from unsloth import FastLanguageModel
import torch
import sys
from pathlib import Path
import multiprocessing as mp
import gc
from multiprocessing import Queue
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.aime_eval import evaluate_model_aime, compare_aime_results
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower
def evaluate_merged_model(result_queue, load_in_4bit=False, load_in_8bit=False):
from unsloth import FastLanguageModel
from tests.utils.aime_eval import evaluate_model_aime
max_seq_length = 2048 # Can increase for longer reasoning traces
lora_rank = 64 # Larger rank = smarter, but slower
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "./final_merged_model",
max_seq_length = max_seq_length,
        load_in_4bit = load_in_4bit, # honor the caller's flag; False for LoRA 16bit
        load_in_8bit = load_in_8bit,
fast_inference = True, # Enable vLLM fast inference
max_lora_rank = lora_rank,
gpu_memory_utilization = 0.8, # Reduce if out of memory
)
print(f"\n{'='*60}")
if load_in_4bit:
print("🔍 EVALUATION Merged model: 4 bits load")
model_type="merged_model_4bits"
elif load_in_8bit:
print("🔍 EVALUATION Merged model: 8 bits load")
model_type="merged_model_8bits"
else:
print("🔍 EVALUATION Merged model: 16 bits load")
model_type="merged_model_16bits"
print(f"{'='*60}")
    results = evaluate_model_aime(
model=model,
tokenizer=tokenizer,
model_type=model_type,
temperature=0.3,
n_sampling=8,
max_tokens=32768,
top_p=0.95,
seed=0
)
result_queue.put(results)
del model
del tokenizer
torch.cuda.empty_cache()
gc.collect()
# Main execution code should be wrapped in this guard
def training_run(result_queue):
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "meta-llama/Llama-3.2-3B-Instruct",
max_seq_length = max_seq_length,
load_in_4bit = False, # False for LoRA 16bit
fast_inference = True, # Enable vLLM fast inference
max_lora_rank = lora_rank,
gpu_memory_utilization = 0.8, # Reduce if out of memory
)
"""### Helper Functions
<a name="Data"></a>
#### Helper functions - Data Prep
"""
import re
import json
reasoning_start = "<reasoning>"
reasoning_end = "</reasoning>"
solution_start = "<answer>"
solution_end = "</answer>"
def extract_hash_answer(text):
"""Extract answer from GSM8K format"""
if "####" not in text:
return None
return text.split("####")[1].strip()
def prepare_gsm8k_dataset(dataset):
"""Format GSM8K dataset for training"""
reasoning_start = "<reasoning>"
reasoning_end = "</reasoning>"
solution_start = "<answer>"
solution_end = "</answer>"
system_prompt = (
f"You are given a problem. Think about the problem and reason step by step. "
f"Place your thinking process between {reasoning_start} and {reasoning_end}. "
f"Then, provide your final numerical solution between {solution_start}{solution_end}"
)
def format_gsm8k(example):
return {
"prompt": [
{"role": "system", "content": system_prompt},
{"role": "user", "content": example["question"]},
],
"answer": extract_hash_answer(example["answer"]),
}
return dataset.map(format_gsm8k)
def prepare_limo_dataset(dataset):
"""Format LIMO dataset for SFT training"""
if dataset is None:
return None
system_prompt = """You are a helpful reasoning assistant. When given a problem, think through it step by step and provide your answer in the following format:
<reasoning>
[Your detailed step-by-step reasoning and solution process]
</reasoning>
<answer>
[Your final numerical answer]
</answer>"""
def format_limo(example):
# Create the assistant response
assistant_response = f"<reasoning>\n{example['solution']}\n</reasoning>\n<answer>\n{example['answer']}\n</answer>"
# Return a DICTIONARY with the conversation in a field
return {
"prompt": [ # ← This is the key change - wrap in a dict
{"role": "system", "content": system_prompt},
{"role": "user", "content": example["question"]},
{"role": "assistant", "content": assistant_response}
]
}
return dataset.map(format_limo)
print("\n✅ Dataset preparation functions defined!")
"""#### Helper functions - Evaluation"""
def get_max_prompt_length(dataset, tokenizer):
"""Calculate maximum and average prompt length in dataset"""
print("Analyzing prompt lengths...")
lengths = dataset.map(
lambda x: {
"tokens": tokenizer.apply_chat_template(
x["prompt"],
add_generation_prompt=True,
tokenize=True
)
},
batched=True,
).map(lambda x: {"length": len(x["tokens"])})["length"]
max_length = max(lengths)
avg_length = sum(lengths) / len(lengths)
min_length = min(lengths)
print(f"Prompt lengths - Min: {min_length}, Max: {max_length}, Avg: {avg_length:.1f}")
return max_length, avg_length
def extract_unsloth_answer(text, start_tag="<SOLUTION>", end_tag="</SOLUTION>"):
"""Extract answer from Unsloth SOLUTION tags"""
pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
matches = re.findall(pattern, text, re.DOTALL)
if matches:
answer = matches[-1] # Get the last match
answer = re.sub(r"[%$,]", "", answer).strip()
return answer
return ""
def find_number(search_string):
"""Find the last number in a string"""
numbers = re.compile(
r"-?[\d,]*\.?\d+",
re.MULTILINE | re.DOTALL | re.IGNORECASE,
).findall(search_string)
if numbers:
return numbers[-1].replace(",", "").strip()
return ""
def remove_symbols(x: str) -> str:
"""Remove commas, percent and dollar symbols"""
if not x:
return ""
return x.replace(",", "").replace("%", "").replace("{{contextString}}quot;, "").strip()
def get_num_tokens(text, tokenizer_instance):
"""Count tokens in text"""
if not text:
return 0
encoding = tokenizer_instance(text, return_tensors="pt")
return len(encoding["input_ids"][0])
def check_format_compliance(text, format_type="unsloth"):
"""Check if response follows expected format"""
if format_type == "unsloth":
reasoning_start = "<start_reasoning>"
reasoning_end = "<end_reasoning>"
solution_start = "<SOLUTION>"
solution_end = "</SOLUTION>"
pattern = (
rf"^[\s]*{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"
rf"{re.escape(solution_start)}.+?{re.escape(solution_end)}[\s]*{{contextString}}quot;
)
else:
return False
return bool(re.match(pattern, text.strip(), re.DOTALL))
def normalize_answer(answer):
"""Normalize answer for comparison"""
if not answer:
return ""
normalized = remove_symbols(str(answer))
try:
float_val = float(normalized)
if float_val.is_integer():
return str(int(float_val))
else:
return str(float_val)
except (ValueError, TypeError):
return normalized
def evaluate_answer_correctness(extracted_answer, ground_truth):
"""Evaluate answer correctness with multiple criteria"""
if not extracted_answer or not ground_truth:
return False, False, 0.0
norm_extracted = normalize_answer(extracted_answer)
norm_ground_truth = normalize_answer(ground_truth)
if norm_extracted == norm_ground_truth:
return True, True, 1.0
try:
extracted_num = float(norm_extracted)
ground_truth_num = float(norm_ground_truth)
if ground_truth_num != 0:
relative_error = abs(extracted_num - ground_truth_num) / abs(ground_truth_num)
if relative_error < 0.01:
return True, True, 0.9
elif relative_error < 0.05:
return False, True, 0.7
elif relative_error < 0.10:
return False, True, 0.5
else:
if extracted_num == 0:
return True, True, 1.0
elif abs(extracted_num) < 0.01:
return False, True, 0.7
except (ValueError, TypeError):
if norm_extracted.lower() == norm_ground_truth.lower():
return True, True, 1.0
return False, False, 0.0
"""#### Reward Functions for GRPO"""
def match_format_exactly(completions, **kwargs):
"""Reward function for exact format matching"""
reasoning_start = "<reasoning>"
reasoning_end = "</reasoning>"
solution_start = "<answer>"
solution_end = "</answer>"
pattern = (
rf"^[\s]*{re.escape(reasoning_start)}.+?{re.escape(reasoning_end)}.*?"
rf"{re.escape(solution_start)}.+?{re.escape(solution_end)}[\s]*{{contextString}}quot;
)
responses = [completion[0]["content"] for completion in completions]
rewards = [3.0 if re.match(pattern, response, re.DOTALL) else 0.0 for response in responses]
return rewards
def match_format_approximately(completions, **kwargs):
"""Reward function for approximate format matching"""
reasoning_start = "<reasoning>"
reasoning_end = "</reasoning>"
solution_start = "<answerr>"
solution_end = "</answer>"
scores = []
for completion in completions:
score = 0
response = completion[0]["content"]
score += 0.5 if response.count(reasoning_start) == 1 else -1.0
score += 0.5 if response.count(reasoning_end) == 1 else -1.0
score += 0.5 if response.count(solution_start) == 1 else -1.0
score += 0.5 if response.count(solution_end) == 1 else -1.0
scores.append(score)
return scores
def check_answer_correctness(prompts, completions, answer, **kwargs):
"""Reward function for answer correctness"""
def extract_solution_answer(text):
pattern = r"<answer>(.*?)</answer>"
match = re.search(pattern, text, re.DOTALL)
if match:
return re.sub(r"[%$,]", "", match.group(1)).strip()
return ""
responses = [completion[0]["content"] for completion in completions]
extracted_responses = [extract_solution_answer(r) for r in responses]
scores = []
for guess, true_answer in zip(extracted_responses, answer):
score = 0
if not guess:
scores.append(0)
continue
if guess == true_answer:
score += 3.0
elif guess.strip() == true_answer.strip():
score += 1.5
else:
try:
ratio = float(guess) / float(true_answer)
if 0.9 <= ratio <= 1.1:
score += 1.0
elif 0.8 <= ratio <= 1.2:
score += 0.5
else:
score -= 1.5
except:
score -= 1.5
scores.append(score)
return scores
print("✅ Reward functions defined!")
"""#### Main Evaluation Function"""
import gc
"""#### Comparison and Memory Management"""
def compare_model_results(all_results):
"""Generate comprehensive comparison of multiple model results"""
print(f"\n{'='*80}")
print("COMPREHENSIVE MODEL COMPARISON")
print(f"{'='*80}")
# Main table
print(f"{'Model':<15} {'Format %':<10} {'Exact %':<10} {'Plausible %':<12} {'Confidence':<12}")
print("-" * 80)
for result in all_results:
print(f"{result['model_type']:<15} "
f"{result['correct_format_pct']:<10.1f} "
f"{result['exact_match_pct']:<10.1f} "
f"{result['plausible_match_pct']:<12.1f} "
f"{result['avg_confidence']:<12.3f}")
# Improvement analysis
if len(all_results) > 1:
print(f"\n{'='*50}")
print("IMPROVEMENT ANALYSIS")
print(f"{'='*50}")
base_result = all_results[0]
for result in all_results[1:]:
print(f"\n{result['model_type']} vs {base_result['model_type']}:")
format_improvement = result['correct_format_pct'] - base_result['correct_format_pct']
exact_improvement = result['exact_match_pct'] - base_result['exact_match_pct']
plausible_improvement = result['plausible_match_pct'] - base_result['plausible_match_pct']
print(f" Format compliance: {format_improvement:+.1f}%")
print(f" Exact matches: {exact_improvement:+.1f}%")
print(f" Plausible matches: {plausible_improvement:+.1f}%")
# Save comparison
comparison_data = {
"summary": all_results,
"best_model": max(all_results, key=lambda x: x['exact_match_pct']),
}
with open("model_comparison_comprehensive.json", "w") as f:
json.dump(comparison_data, f, indent=4)
print(f"\nBest performing model: {comparison_data['best_model']['model_type']} "
f"({comparison_data['best_model']['exact_match_pct']:.1f}% exact matches)")
def cleanup_memory():
"""Comprehensive memory cleanup"""
print("🧹 Cleaning up GPU memory...")
for _ in range(10):
torch.cuda.empty_cache()
gc.collect()
if torch.cuda.is_available():
allocated = torch.cuda.memory_allocated() / 1024**3
reserved = torch.cuda.memory_reserved() / 1024**3
print(f"GPU memory - Allocated: {allocated:.2f} GB, Reserved: {reserved:.2f} GB")
"""#### Data Loading and Preparation"""
from datasets import load_dataset
# Load GSM8K
gsm8k_dataset = load_dataset("openai/gsm8k", "main", split="train")
# Load LIMO (adjust this based on your access method)
limo_train = load_dataset("GAIR/LIMO", split="train")
# Prepare datasets
gsm8k_train = prepare_gsm8k_dataset(gsm8k_dataset)
limo_train = prepare_limo_dataset(limo_train)
print(f" GSM8K train: {len(gsm8k_train)}")
print(f" LIMO train: {len(limo_train) if limo_train else 0}")
# Store results
all_results = []
# Single temperature evaluation on combined dataset
results = evaluate_model_aime(
model=model,
tokenizer=tokenizer,
model_type="base",
temperature=0.3,
n_sampling=8,
max_tokens=32768,
top_p=0.95,
seed=0
)
from unsloth.chat_templates import get_chat_template
tokenizer = get_chat_template(
tokenizer,
chat_template = "llama-3.1",
)
def formatting_prompts_func(examples):
convos = examples["prompt"]
texts = [tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False) for convo in convos]
return { "text" : texts, }
pass
limo_train = limo_train.map(formatting_prompts_func, batched = True,)
from trl import SFTTrainer
from transformers import DataCollatorForSeq2Seq, TrainingArguments
from unsloth import is_bfloat16_supported
print(f"\n{'*'*60}")
print("🎯 STAGE 1: Qlora Fine-Tuning on LIMO")
print(f"{'*'*60}")
model = FastLanguageModel.get_peft_model(
model,
r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = [
"q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",
], # Remove QKVO if out of memory
lora_alpha = lora_rank,
use_gradient_checkpointing = "unsloth", # Enable long context finetuning
random_state = 3407,
)
if limo_train is not None:
trainer = SFTTrainer(
model = model,
tokenizer = tokenizer,
train_dataset = limo_train,
dataset_text_field = "text",
max_seq_length = max_seq_length,
data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
dataset_num_proc = 2,
packing = False, # Can make training 5x faster for short sequences.
args = TrainingArguments(
per_device_train_batch_size = 2,
gradient_accumulation_steps = 4,
warmup_steps = 5,
num_train_epochs = 1, # Set this for 1 full training run.
#max_steps = 60,
learning_rate = 2e-4,
fp16 = not is_bfloat16_supported(),
bf16 = is_bfloat16_supported(),
logging_steps = 1,
optim = "adamw_8bit",
weight_decay = 0.01,
lr_scheduler_type = "linear",
seed = 3407,
output_dir = "outputs",
report_to = "none", # Use this for WandB etc
),
)
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
trainer,
instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)
# Train
print(f"🚂 Starting SFT training on {len(limo_train)} examples...")
trainer.train()
# Save checkpoint
model.save_pretrained("qlora_checkpoint")
tokenizer.save_pretrained("qlora_checkpoint")
print("💾 Qlora checkpoint saved!")
# Cleanup
del trainer
cleanup_memory()
print("✅ Qlora training completed!")
else:
print("⚠️ Skipping Qlora training - no LIMO dataset available")
# Cleanup
cleanup_memory()
global PRINTED_TIMES
PRINTED_TIMES = 0
global PRINT_EVERY_STEPS
PRINT_EVERY_STEPS = 5
match_numbers = re.compile(
solution_start + r".*?([\d\.\,]{1,})",
flags = re.MULTILINE | re.DOTALL
)
def check_numbers(prompts, completions, answer, **kwargs):
question = prompts[0][-1]["content"]
responses = [completion[0]["content"] for completion in completions]
extracted_responses = [
guess.group(1)
if (guess := match_numbers.search(r)) is not None else None \
for r in responses
]
scores = []
# Print only every few steps
global PRINTED_TIMES
global PRINT_EVERY_STEPS
if PRINTED_TIMES % PRINT_EVERY_STEPS == 0:
print('*'*20, f"Question:\n{question}", f"\nAnswer:\n{answer[0]}", f"\nResponse:\n{responses[0]}", f"\nExtracted:\n{extracted_responses[0]}")
PRINTED_TIMES += 1
for guess, true_answer in zip(extracted_responses, answer):
if guess is None:
scores.append(0)
continue
# Convert to numbers
try:
true_answer = float(true_answer.strip())
# Remove commas like in 123,456
guess = float(guess.strip().replace(",", ""))
scores.append(1.5 if guess == true_answer else -0.5)
except:
scores.append(0)
continue
return scores
print(f"\n{'*'*60}")
print("🎯 STAGE 2: GRPO Fine-Tuning on GSM8K")
print(f"{'*'*60}")
# Get max prompt length
max_prompt_length, _ = get_max_prompt_length(gsm8k_train, tokenizer)
max_prompt_length = min(max_prompt_length + 10, 512) # Add buffer, cap at 512
print(f"Using max_prompt_length: {max_prompt_length}")
from trl import GRPOConfig, GRPOTrainer
training_args = GRPOConfig(
learning_rate = 5e-6,
weight_decay = 0.1,
warmup_ratio = 0.1,
lr_scheduler_type = "cosine",
optim = "adamw_torch_fused",
logging_steps = 1,
per_device_train_batch_size = 1,
gradient_accumulation_steps = 4, # Increase to 4 for smoother training
num_generations = 8, # Decrease if out of memory
max_prompt_length = max_prompt_length,
max_completion_length = max_seq_length - max_prompt_length,
# num_train_epochs = 1, # Set to 1 for a full training run
#max_steps = 250,
max_steps = 1000,
save_steps = 250,
max_grad_norm = 0.1,
report_to = "none", # Can use Weights & Biases
output_dir = "outputs",
)
trainer = GRPOTrainer(
model = model,
processing_class = tokenizer,
reward_funcs = [
match_format_exactly,
match_format_approximately,
check_answer_correctness,
check_numbers,
],
args = training_args,
train_dataset = gsm8k_train,
)
# Train
print(f"🚂 Starting GRPO training on {len(gsm8k_train)} examples...")
trainer.train()
# Save checkpoint
model.save_pretrained("grpo_checkpoint")
tokenizer.save_pretrained("grpo_checkpoint")
print("💾 GRPO checkpoint saved!")
# Cleanup
del trainer
del training_args
cleanup_memory()
print("✅ GRPO training completed!")
print(f"\n{'='*60}")
print("🔍 EVALUATION 3: Final GRPO Model")
print(f"{'='*60}")
grpo_results = evaluate_model_aime(
model=model,
tokenizer=tokenizer,
model_type="grpo",
temperature=0.3,
n_sampling=8,
max_tokens=32768,
top_p=0.95,
seed=0
)
all_results.append(grpo_results)
print("✅ Final model evaluation complete!")
print(f"\n{'='*60}")
print("💾 SAVING FINAL MODEL")
print(f"{'='*60}")
# Save as merged model
try:
model.save_pretrained_merged("final_merged_model", tokenizer, save_method="merged_16bit")
print("✅ Merged model saved to: final_merged_model/")
except Exception as e:
print(f"⚠️ Could not save merged model: {e}")
print("Final model saved as LoRA adapter only")
print("💾 Model saving complete!")
safe_remove_directory("./unsloth_compiled_cache")
result_queue.put(results)
# Clean up
del model
del tokenizer
torch.cuda.empty_cache()
gc.collect()
# # Merged model load 16 bits model AIME eval
# result_queue = mp.Queue()
# p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, False))
# p.start()
# p.join()
#
# merged_16bits = result_queue.get()
# all_results.append(merged_16bits)
#
# # Clean up
# del merged_model
# del merged_tokenizer
# del dataset_ppl
# torch.cuda.empty_cache()
# gc.collect()
#
# safe_remove_directory("./unsloth_compiled_cache")
#
# # Merged model load 8 bits model AIME eval
#
# result_queue = mp.Queue()
# p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, True))
# p.start()
# p.join()
#
# merged_16bits = result_queue.get()
# all_results.append(merged_16bits)
# Merged model load 4 bits AIME eval
# result_queue = mp.Queue()
# p = mp.Process(target=evaluate_merged_model, args=(result_queue, True, False))
# p.start()
# p.join()
#
# merged_16bits = result_queue.get()
# all_results.append(merged_16bits)
if __name__ == "__main__":
mp.set_start_method('spawn', force=True)
result_queue = mp.Queue()
all_results = []
# run main finetuning and grpo loop
p = mp.Process(target=training_run, args=(result_queue,))
p.start()
p.join()
results = result_queue.get()
    all_results.append(results)
# evaluate merged model loaded 16bits
p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, False))
p.start()
p.join()
merged_load_16bits = result_queue.get()
all_results.append(merged_load_16bits)
safe_remove_directory("./unsloth_compiled_cache")
# Merged model load 8 bits model AIME eval
p = mp.Process(target=evaluate_merged_model, args=(result_queue, False, True))
p.start()
p.join()
merged_load_8bits = result_queue.get()
all_results.append(merged_load_8bits)
safe_remove_directory("./unsloth_compiled_cache")
# Merged model load 4 bits model AIME eval
p = mp.Process(target=evaluate_merged_model, args=(result_queue, True, False))
p.start()
p.join()
merged_load_4bits = result_queue.get()
all_results.append(merged_load_4bits)
safe_remove_directory("./unsloth_compiled_cache")
# AIME-specific comparison function
print(f"\n{'='*80}")
print("🏆 FINAL TRAINING PIPELINE RESULTS")
print(f"{'='*80}")
# Use the AIME-specific comparison
compare_aime_results(all_results)
```
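Each training and evaluation stage in this test runs in its own spawned process so CUDA memory is fully released when the stage exits, with results handed back over a `multiprocessing.Queue`. A stripped-down sketch of that isolation pattern (the helper name is hypothetical):

```python
import multiprocessing as mp

def run_isolated(target, *args):
    """Run `target(queue, *args)` in a spawned subprocess and return its queued result."""
    queue = mp.Queue()
    p = mp.Process(target=target, args=(queue, *args))
    p.start()
    p.join()
    return queue.get()

if __name__ == "__main__":
    mp.set_start_method("spawn", force=True)
    # e.g. run_isolated(training_run) or run_isolated(evaluate_merged_model, True, False)
```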
## /tests/saving/non_peft/test_mistral_non_peft.py
```py path="/tests/saving/non_peft/test_mistral_non_peft.py"
from unsloth import FastLanguageModel
from transformers import AutoModelForCausalLM
from peft import PeftModel
from pathlib import Path
import sys
import warnings
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model")
print(f"{'='*80}")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name="unsloth/mistral-7b-v0.3",
max_seq_length=2048,
dtype=None,
load_in_4bit=True,
load_in_8bit=False,
full_finetuning=False,
)
print("✅ Base model loaded successfully!")
### Attempting save merge
print(f"\n{'='*80}")
print("🔍 PHASE 2: Attempting save_pretrained_merged (Should Warn)")
print(f"{'='*80}")
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
model.save_pretrained_merged("test_output", tokenizer)
# Verify warning
assert len(w) >= 1, "Expected warning but none raised"
warning_msg = str(w[0].message)
expected_msg = "Model is not a PeftModel (no Lora adapters detected). Skipping Merge. Please use save_pretrained() or push_to_hub() instead!"
    assert expected_msg in warning_msg, f"Unexpected warning: {warning_msg}"
print("✅ Correct warning detected for non-PeftModel merge attempt!")
print(f"\n{'='*80}")
print("🔍 PHASE 3: Using save_pretrained (Should Succeed)")
print(f"{'='*80}")
try:
with warnings.catch_warnings():
warnings.simplefilter("error") # Treat warnings as errors here
model.save_pretrained("test_output")
print("✅ Standard save_pretrained completed successfully!")
except Exception as e:
assert False, f"Phase 3 failed: {e}"
safe_remove_directory("./test_output")
safe_remove_directory("./unsloth_compiled_cache")
```
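Both non-PEFT tests rely on the same pattern: capture warnings while calling `save_pretrained_merged` on a model without LoRA adapters, then assert the expected message was emitted. A generic sketch of that pattern with a hypothetical helper name:

```python
import warnings

def assert_warns_containing(fn, expected_substring):
    """Call `fn` and assert it emits a warning whose message contains `expected_substring`."""
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        fn()
    assert caught, "Expected a warning but none was raised"
    messages = [str(w.message) for w in caught]
    assert any(expected_substring in m for m in messages), f"Unexpected warnings: {messages}"
```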
## /tests/saving/non_peft/test_whisper_non_peft.py
```py path="/tests/saving/non_peft/test_whisper_non_peft.py"
from unsloth import FastLanguageModel, FastModel
from transformers import AutoModelForCausalLM, WhisperForConditionalGeneration
from peft import PeftModel
from pathlib import Path
import sys
import warnings
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
print(f"\n{'='*80}")
print("🔍 PHASE 1: Loading Base Model")
print(f"{'='*80}")
model, tokenizer = FastModel.from_pretrained(
model_name = "unsloth/whisper-large-v3",
dtype = None, # Leave as None for auto detection
load_in_4bit = False, # Set to True to do 4bit quantization which reduces memory
auto_model = WhisperForConditionalGeneration,
whisper_language = "English",
whisper_task = "transcribe",
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
print("✅ Base model loaded successfully!")
### Attempting save merge
print(f"\n{'='*80}")
print("🔍 PHASE 2: Attempting save_pretrained_merged (Should Warn)")
print(f"{'='*80}")
with warnings.catch_warnings(record=True) as w:
warnings.simplefilter("always")
model.save_pretrained_merged("test_output", tokenizer)
# Verify warning
assert len(w) >= 1, "Expected warning but none raised"
warning_msg = str(w[0].message)
expected_msg = "Model is not a PeftModel (no Lora adapters detected). Skipping Merge. Please use save_pretrained() or push_to_hub() instead!"
    assert expected_msg in warning_msg, f"Unexpected warning: {warning_msg}"
print("✅ Correct warning detected for non-PeftModel merge attempt!")
print(f"\n{'='*80}")
print("🔍 PHASE 3: Using save_pretrained (Should Succeed)")
print(f"{'='*80}")
try:
with warnings.catch_warnings():
warnings.simplefilter("error") # Treat warnings as errors here
model.save_pretrained("test_output")
print("✅ Standard save_pretrained completed successfully!")
except Exception as e:
assert False, f"Phase 3 failed: {e}"
safe_remove_directory("./test_output")
safe_remove_directory("./unsloth_compiled_cache")
```
## /tests/saving/test_unsloth_save.py
```py path="/tests/saving/test_unsloth_save.py"
import json
import os
import shutil
import tempfile
import pytest
import importlib
from unsloth import FastLanguageModel, FastModel
model_to_test = [
# Text Models
"unsloth/tinyllama",
"unsloth/tinyllama-bnb-4bit",
"unsloth/Qwen2.5-0.5B-Instruct",
"unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit",
"unsloth/Phi-4-mini-instruct",
"unsloth/Phi-4-mini-instruct-bnb-4bit",
"unsloth/Qwen2.5-0.5B",
# Vision Models
"unsloth/gemma-3-4b-it",
"unsloth/Llama-3.2-11B-Vision-Instruct-bnb-4bit",
"unsloth/Qwen2.5-VL-3B-Instruct-bnb-4bit"
]
torchao_models = [
"unsloth/tinyllama",
"unsloth/Qwen2.5-0.5B-Instruct",
#"unsloth/Phi-4-mini-instruct",
#"unsloth/Qwen2.5-0.5B",
# Skip the -bnb-4bit variants since they're already quantized
]
# Variables
save_file_sizes = {}
save_file_sizes["merged_16bit"] = {}
save_file_sizes["merged_4bit"] = {}
save_file_sizes["torchao"] = {}
tokenizer_files = [
"tokenizer_config.json",
"special_tokens_map.json",
]
@pytest.fixture(scope="session", params=model_to_test)
def loaded_model_tokenizer(request):
model_name = request.param
print("Loading model and tokenizer...")
model, tokenizer = FastModel.from_pretrained(
model_name, # use small model
max_seq_length=128,
dtype=None,
load_in_4bit=True,
)
# Apply LoRA
model = FastModel.get_peft_model(
model,
r=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_alpha=16,
use_gradient_checkpointing="unsloth",
)
return model, tokenizer
@pytest.fixture(scope="session", params=torchao_models)
def fp16_model_tokenizer(request):
"""Load model in FP16 for TorchAO quantization"""
model_name = request.param
print(f"Loading model in FP16 for TorchAO: {model_name}")
model, tokenizer = FastModel.from_pretrained(
model_name,
max_seq_length=128,
dtype=None,
load_in_4bit=False, # No BnB quantization
)
# Apply LoRA
model = FastModel.get_peft_model(
model,
r=16,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
lora_alpha=16,
use_gradient_checkpointing="unsloth",
)
return model, tokenizer
@pytest.fixture(scope="session")
def model(loaded_model_tokenizer):
return loaded_model_tokenizer[0]
@pytest.fixture(scope="session")
def tokenizer(loaded_model_tokenizer):
return loaded_model_tokenizer[1]
@pytest.fixture
def temp_save_dir():
dir = tempfile.mkdtemp()
print(f"Temporary directory created at: {dir}")
yield dir
print(f"Temporary directory deleted: {dir}")
shutil.rmtree(dir)
def delete_quantization_config(model):
# Since merged, edit quantization_config
old_config = model.config
new_config = model.config.to_dict()
if "quantization_config" in new_config:
del new_config["quantization_config"]
original_model = model
new_config = type(model.config).from_dict(new_config)
while hasattr(original_model, "model"):
original_model = original_model.model
original_model.config = new_config
model.config = new_config
def test_save_merged_16bit(model, tokenizer, temp_save_dir: str):
save_path = os.path.join(temp_save_dir, "unsloth_merged_16bit", model.config._name_or_path.replace("/", "_"))
model.save_pretrained_merged(
save_path,
tokenizer=tokenizer,
save_method="merged_16bit"
)
# Check model files
assert os.path.isdir(save_path), f"Directory {save_path} does not exist."
assert os.path.isfile(os.path.join(save_path, "config.json")), "config.json not found."
weight_files = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
assert len(weight_files) > 0, "No weight files found in the save directory."
# Check tokenizer files
for file in tokenizer_files:
assert os.path.isfile(os.path.join(save_path, file)), f"{file} not found in the save directory."
# Check config to see if it is 16bit by checking for quantization config
config_path = os.path.join(save_path, "config.json")
with open(config_path, "r") as f:
config = json.load(f)
assert "quantization_config" not in config, "Quantization config not found in the model config."
# Store the size of the model files
total_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files)
save_file_sizes["merged_16bit"][model.config._name_or_path] = total_size
print(f"Total size of merged_16bit files: {total_size} bytes")
# Test loading the model from the saved path
loaded_model, loaded_tokenizer = FastLanguageModel.from_pretrained(
save_path,
max_seq_length=128,
dtype=None,
load_in_4bit=True,
)
def test_save_merged_4bit(model, tokenizer, temp_save_dir: str):
save_path = os.path.join(temp_save_dir, "unsloth_merged_4bit", model.config._name_or_path.replace("/", "_"))
model.save_pretrained_merged(
save_path,
tokenizer=tokenizer,
save_method="merged_4bit_forced"
)
# Check model files
assert os.path.isdir(save_path), f"Directory {save_path} does not exist."
assert os.path.isfile(os.path.join(save_path, "config.json")), "config.json not found."
weight_files = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
assert len(weight_files) > 0, "No weight files found in the save directory."
# Check tokenizer files
for file in tokenizer_files:
assert os.path.isfile(os.path.join(save_path, file)), f"{file} not found in the save directory."
# Store the size of the model files
total_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files)
save_file_sizes["merged_4bit"][model.config._name_or_path] = total_size
print(f"Total size of merged_4bit files: {total_size} bytes")
assert total_size < save_file_sizes["merged_16bit"][model.config._name_or_path], "Merged 4bit files are larger than merged 16bit files."
# Check config to see if it is 4bit
config_path = os.path.join(save_path, "config.json")
with open(config_path, "r") as f:
config = json.load(f)
assert "quantization_config" in config, "Quantization config not found in the model config."
# Test loading the model from the saved path
loaded_model, loaded_tokenizer = FastModel.from_pretrained(
save_path,
max_seq_length=128,
dtype=None,
load_in_4bit=True,
)
@pytest.mark.skipif(importlib.util.find_spec("torchao") is None, reason="require torchao to be installed")
def test_save_torchao(fp16_model_tokenizer, temp_save_dir: str):
model, tokenizer = fp16_model_tokenizer
save_path = os.path.join(temp_save_dir, "unsloth_torchao", model.config._name_or_path.replace("/", "_"))
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
torchao_config = Int8DynamicActivationInt8WeightConfig()
model.save_pretrained_torchao(
save_path,
tokenizer=tokenizer,
torchao_config=torchao_config,
push_to_hub=False,
)
weight_files_16bit = [f for f in os.listdir(save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
total_16bit_size = sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files_16bit)
save_file_sizes["merged_16bit"][model.config._name_or_path] = total_16bit_size
torchao_save_path = save_path + "-torchao"
# Check model files
assert os.path.isdir(torchao_save_path), f"Directory {torchao_save_path} does not exist."
assert os.path.isfile(os.path.join(torchao_save_path, "config.json")), "config.json not found."
weight_files = [f for f in os.listdir(torchao_save_path) if f.endswith(".bin") or f.endswith(".safetensors")]
assert len(weight_files) > 0, "No weight files found in the save directory."
# Check tokenizer files
for file in tokenizer_files:
assert os.path.isfile(os.path.join(torchao_save_path, file)), f"{file} not found in the save directory."
# Store the size of the model files
total_size = sum(os.path.getsize(os.path.join(torchao_save_path, f)) for f in weight_files)
save_file_sizes["torchao"][model.config._name_or_path] = total_size
assert total_size < save_file_sizes["merged_16bit"][model.config._name_or_path], "torchao files are larger than merged 16bit files."
# Check config to see if it is quantized with torchao
config_path = os.path.join(torchao_save_path, "config.json")
with open(config_path, "r") as f:
config = json.load(f)
assert "quantization_config" in config, "Quantization config not found in the model config."
# Test loading the model from the saved path
# can't set `load_in_4bit` to True because the model is torchao quantized
# can't quantize again with bitsandbytes
import torch.serialization
with torch.serialization.safe_globals([getattr]):
loaded_model, loaded_tokenizer = FastModel.from_pretrained(
torchao_save_path,
max_seq_length=128,
dtype=None,
load_in_4bit=False,
)
@pytest.mark.skipif(importlib.util.find_spec("torchao") is None, reason="require torchao to be installed")
def test_save_and_inference_torchao(fp16_model_tokenizer, temp_save_dir: str):
model, tokenizer = fp16_model_tokenizer
model_name = model.config._name_or_path
print(f"Testing TorchAO save and inference for: {model_name}")
save_path = os.path.join(temp_save_dir, "torchao_models", model_name.replace("/", "_"))
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
torchao_config = Int8DynamicActivationInt8WeightConfig()
# Save with TorchAO
model.save_pretrained_torchao(
save_path,
tokenizer=tokenizer,
torchao_config=torchao_config,
push_to_hub=False,
)
torchao_save_path = save_path + "-torchao"
# Verify files exist
assert os.path.isdir(torchao_save_path), f"TorchAO directory {torchao_save_path} does not exist."
# Load with safe globals
import torch.serialization
with torch.serialization.safe_globals([getattr]):
loaded_model, loaded_tokenizer = FastModel.from_pretrained(
torchao_save_path,
max_seq_length=128,
dtype=None,
load_in_4bit=False,
)
FastModel.for_inference(loaded_model) # Enable native 2x faster inference
messages = [
{"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]
inputs = loaded_tokenizer.apply_chat_template(
messages,
tokenize = True,
add_generation_prompt = True, # Must add for generation
return_tensors = "pt",
).to("cuda")
outputs = loaded_model.generate( # ← Use loaded_model, not model
input_ids=inputs,
max_new_tokens=64,
use_cache=False, # Avoid cache issues
temperature=1.5,
min_p=0.1,
do_sample=True,
pad_token_id=loaded_tokenizer.pad_token_id or loaded_tokenizer.eos_token_id,
)
#Decode with the LOADED tokenizer
generated_text = loaded_tokenizer.decode(outputs[0], skip_special_tokens=True)
input_text = loaded_tokenizer.decode(inputs[0], skip_special_tokens=True)
response_part = generated_text[len(input_text):].strip()
print(f"Input: {input_text}")
print(f"Full output: {generated_text}")
print(f"Response only: {response_part}")
```
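The assertions in this suite compare on-disk checkpoint sizes: the 4-bit and TorchAO saves must come out smaller than the 16-bit merge, measured by summing the weight files in each save directory. A small sketch of that bookkeeping as a hypothetical helper; the suite itself is parametrized over `model_to_test`, so a plain `pytest` run of the file exercises every listed checkpoint:

```python
import os

def total_weight_bytes(save_path: str) -> int:
    """Sum the sizes of the .bin / .safetensors files in a save directory."""
    weight_files = [
        f for f in os.listdir(save_path)
        if f.endswith(".bin") or f.endswith(".safetensors")
    ]
    return sum(os.path.getsize(os.path.join(save_path, f)) for f in weight_files)
```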
## /tests/saving/text_to_speech_models/test_csm.py
```py path="/tests/saving/text_to_speech_models/test_csm.py"
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package
require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")
import soundfile as sf
print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")
model, tokenizer = FastModel.from_pretrained(
model_name = "unsloth/csm-1b",
max_seq_length= 2048, # Choose any for long context!
dtype = None, # Leave as None for auto-detection
auto_model = CsmForConditionalGeneration,
load_in_4bit = False, # Select True for 4bit - reduces memory usage
)
base_model_class = model.__class__.__name__
model = FastModel.get_peft_model(
model,
r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",],
lora_alpha = 32,
lora_dropout = 0, # Supports any, but = 0 is optimized
bias = "none", # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
random_state = 3407,
use_rslora = False, # We support rank stabilized LoRA
loftq_config = None, # And LoftQ
)
print("✅ Model and LoRA adapters loaded successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")
assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")
print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")
def find_lora_base_model(model_to_inspect):
current = model_to_inspect
if hasattr(current, "base_model"):
current = current.base_model
if hasattr(current, "model"):
current = current.model
return current
pass
config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model
assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))
print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")
with warnings.catch_warnings():
warnings.simplefilter("error") # Treat warnings as errors
try:
model.save_pretrained_merged("csm", tokenizer)
print("✅ Model saved and merged successfully without warnings!")
except Exception as e:
assert False, f"Model saving/merging failed with exception: {e}"
print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")
model, processor = FastModel.from_pretrained(
model_name = "./csm",
max_seq_length= 2048, # Choose any for long context!
dtype = None, # Leave as None for auto-detection
auto_model = CsmForConditionalGeneration,
load_in_4bit = False, # Select True for 4bit - reduces memory usage
)
from transformers import AutoProcessor
processor = AutoProcessor.from_pretrained("unsloth/csm-1b")
print("✅ Model loaded for inference successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")
from transformers import pipeline
import torch
output_audio_path = "csm_audio.wav"
try:
text = "We just finished fine tuning a text to speech model... and it's pretty good!"
speaker_id = 0
inputs = processor(f"[{speaker_id}]{text}", add_special_tokens=True).to("cuda")
audio_values = model.generate(
**inputs,
max_new_tokens=125, # 125 tokens is 10 seconds of audio, for longer speech increase this
# play with these parameters to get the best results
depth_decoder_temperature=0.6,
depth_decoder_top_k=0,
depth_decoder_top_p=0.9,
temperature=0.8,
top_k=50,
top_p=1.0,
#########################################################
output_audio=True
)
audio = audio_values[0].to(torch.float32).cpu().numpy()
sf.write("example_without_context.wav", audio, 24000)
print(f"✅ Audio generated and saved to {output_audio_path}!")
except Exception as e:
assert False, f"Inference failed with exception: {e}"
## assert that transcribed_text contains The birch canoe slid on the smooth planks. Glued the sheet to the dark blue background. It's easy to tell the depth of a well. Four hours of steady work faced us.
print("✅ All sections passed successfully!")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./csm")
```
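A light sanity check one could append after Section 6 is to read the generated file back with `soundfile` and confirm it is non-empty 24 kHz audio; the transcription-based assertion hinted at in the trailing comment is left out here. A hedged sketch, assuming the audio was written to `csm_audio.wav` as above:

```python
import soundfile as sf

# Read the generated waveform back and check basic properties (24 kHz, non-empty).
audio, sample_rate = sf.read("csm_audio.wav")
assert sample_rate == 24000, f"Unexpected sample rate: {sample_rate}"
assert len(audio) > 0, "Generated audio is empty"
print(f"Generated {len(audio) / sample_rate:.2f} seconds of audio")
```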
## /tests/saving/text_to_speech_models/test_lasa.py
```py path="/tests/saving/text_to_speech_models/test_lasa.py"
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package
require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")
require_python_package("xcodec2")
import soundfile as sf
from xcodec2.modeling_xcodec2 import XCodec2Model
XCODEC2_MODEL_NAME = "HKUST-Audio/xcodec2"
SAMPLE_RATE = 16000
DEVICE = "cuda"
try:
codec_model = XCodec2Model.from_pretrained(XCODEC2_MODEL_NAME)
except Exception as e:
raise f"ERROR loading XCodec2 model: {e}."
codec_model.to('cpu')
print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")
max_seq_length = 2048
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/Llasa-1B",
max_seq_length = max_seq_length,
dtype = None, # Select None for auto detection
load_in_4bit = False, # Choose True for 4bit which reduces memory
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
base_model_class = model.__class__.__name__
model = FastLanguageModel.get_peft_model(
model,
r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "v_proj"],
lora_alpha = 128,
lora_dropout = 0, # Supports any, but = 0 is optimized
bias = "none", # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
random_state = 3407,
use_rslora = False, # We support rank stabilized LoRA
loftq_config = None, # And LoftQ
)
print("✅ Model and LoRA adapters loaded successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")
assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")
print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")
def find_lora_base_model(model_to_inspect):
current = model_to_inspect
if hasattr(current, "base_model"):
current = current.base_model
if hasattr(current, "model"):
current = current.model
return current
pass
config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model
assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))
print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")
with warnings.catch_warnings():
warnings.simplefilter("error") # Treat warnings as errors
try:
model.save_pretrained_merged("lasa", tokenizer)
print("✅ Model saved and merged successfully without warnings!")
except Exception as e:
assert False, f"Model saving/merging failed with exception: {e}"
print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "./lasa",
max_seq_length = max_seq_length,
dtype = None, # Select None for auto detection
load_in_4bit = False, # Choose True for 4bit which reduces memory
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
#from transformers import AutoProcessor
#processor = AutoProcessor.from_pretrained("unsloth/csm-1b")
print("✅ Model loaded for inference successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")
from transformers import pipeline
import torch
output_audio_path = "lasa_audio.wav"
input_text = "Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person."
FastLanguageModel.for_inference(model)
def ids_to_speech_tokens(speech_ids):
speech_tokens_str = []
for speech_id in speech_ids:
speech_tokens_str.append(f"<|s_{speech_id}|>")
return speech_tokens_str
def extract_speech_ids(speech_tokens_str):
speech_ids = []
for token_str in speech_tokens_str:
if token_str.startswith('<|s_') and token_str.endswith('|>'):
num_str = token_str[4:-2]
num = int(num_str)
speech_ids.append(num)
else:
print(f"Unexpected token: {token_str}")
return speech_ids
#TTS start!
with torch.inference_mode():
with torch.amp.autocast('cuda',dtype=model.dtype):
formatted_text = f"<|TEXT_UNDERSTANDING_START|>{input_text}<|TEXT_UNDERSTANDING_END|>"
# Tokenize the text
chat = [
{"role": "user", "content": "Convert the text to speech:" + formatted_text},
{"role": "assistant", "content": "<|SPEECH_GENERATION_START|>"}
]
input_ids = tokenizer.apply_chat_template(
chat,
tokenize=True,
return_tensors='pt',
continue_final_message=True
)
input_ids = input_ids.to('cuda')
speech_end_id = tokenizer.convert_tokens_to_ids('<|SPEECH_GENERATION_END|>')
# Generate the speech autoregressively
outputs = model.generate(
input_ids,
max_length=2048, # We trained our model with a max length of 2048
eos_token_id= speech_end_id ,
do_sample=True,
top_p=1.2, # Adjusts the diversity of generated content
temperature=1.2, # Controls randomness in output
)
# Extract the speech tokens
generated_ids = outputs[0][input_ids.shape[1]:-1]
speech_tokens = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
# Convert token <|s_23456|> to int 23456
speech_tokens = extract_speech_ids(speech_tokens)
speech_tokens = torch.tensor(speech_tokens).cpu().unsqueeze(0).unsqueeze(0)
# Decode the speech tokens to speech waveform
gen_wav = codec_model.decode_code(speech_tokens)
try:
sf.write(output_audio_path, gen_wav[0, 0, :].cpu().numpy(), 16000)
except Exception as e:
assert False, f"Inference failed with exception: {e}"
## assert that transcribed_text contains The birch canoe slid on the smooth planks. Glued the sheet to the dark blue background. It's easy to tell the depth of a well. Four hours of steady work faced us.
print("✅ All sections passed successfully!")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./lasa")
```
## /tests/saving/text_to_speech_models/test_orpheus.py
```py path="/tests/saving/text_to_speech_models/test_orpheus.py"
from unsloth import FastLanguageModel, FastModel
from transformers import CsmForConditionalGeneration
import torch
# ruff: noqa
import sys
from pathlib import Path
from peft import PeftModel
import warnings
import requests
REPO_ROOT = Path(__file__).parents[3]
sys.path.insert(0, str(REPO_ROOT))
from tests.utils.cleanup_utils import safe_remove_directory
from tests.utils.os_utils import require_package, require_python_package
require_package("ffmpeg", "ffmpeg")
require_python_package("soundfile")
require_python_package("snac")
import soundfile as sf
from snac import SNAC
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
snac_model = snac_model.to("cuda")
print(f"\n{'='*80}")
print("🔍 SECTION 1: Loading Model and LoRA Adapters")
print(f"{'='*80}")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/orpheus-3b-0.1-ft",
max_seq_length= 2048, # Choose any for long context!
dtype = None, # Select None for auto detection
load_in_4bit = False, # Select True for 4bit which reduces memory usage
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
base_model_class = model.__class__.__name__
model = FastLanguageModel.get_peft_model(
model,
r = 64, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
"gate_proj", "up_proj", "down_proj",],
lora_alpha = 64,
lora_dropout = 0, # Supports any, but = 0 is optimized
bias = "none", # Supports any, but = "none" is optimized
# [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
random_state = 3407,
use_rslora = False, # We support rank stabilized LoRA
loftq_config = None, # And LoftQ
)
print("✅ Model and LoRA adapters loaded successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 2: Checking Model Class Type")
print(f"{'='*80}")
assert isinstance(model, PeftModel), "Model should be an instance of PeftModel"
print("✅ Model is an instance of PeftModel!")
print(f"\n{'='*80}")
print("🔍 SECTION 3: Checking Config Model Class Type")
print(f"{'='*80}")
def find_lora_base_model(model_to_inspect):
current = model_to_inspect
if hasattr(current, "base_model"):
current = current.base_model
if hasattr(current, "model"):
current = current.model
return current
pass
config_model = find_lora_base_model(model) if isinstance(model, PeftModel) else model
assert config_model.__class__.__name__ == base_model_class, f"Expected config_model class to be {base_model_class}"
print("✅ config_model returns correct Base Model class:", str(base_model_class))
print(f"\n{'='*80}")
print("🔍 SECTION 4: Saving and Merging Model")
print(f"{'='*80}")
with warnings.catch_warnings():
warnings.simplefilter("error") # Treat warnings as errors
try:
model.save_pretrained_merged("orpheus", tokenizer)
print("✅ Model saved and merged successfully without warnings!")
except Exception as e:
assert False, f"Model saving/merging failed with exception: {e}"
print(f"\n{'='*80}")
print("🔍 SECTION 5: Loading Model for Inference")
print(f"{'='*80}")
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = "unsloth/orpheus-3b-0.1-ft",
max_seq_length= 2048, # Choose any for long context!
dtype = None, # Select None for auto detection
load_in_4bit = False, # Select True for 4bit which reduces memory usage
# token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)
#from transformers import AutoProcessor
#processor = AutoProcessor.from_pretrained("unsloth/csm-1b")
print("✅ Model loaded for inference successfully!")
print(f"\n{'='*80}")
print("🔍 SECTION 6: Running Inference")
print(f"{'='*80}")
#@title Run Inference
FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Moving snac_model cuda to cpu
snac_model.to("cpu")
prompts = [
"Hey there my name is Elise, <giggles> and I'm a speech generation model that can sound like a person.",
]
chosen_voice = None # None for single-speaker
prompts_ = [(f"{chosen_voice}: " + p) if chosen_voice else p for p in prompts]
all_input_ids = []
for prompt in prompts_:
input_ids = tokenizer(prompt, return_tensors="pt").input_ids
all_input_ids.append(input_ids)
start_token = torch.tensor([[ 128259]], dtype=torch.int64) # Start of human
end_tokens = torch.tensor([[128009, 128260]], dtype=torch.int64) # End of text, End of human
all_modified_input_ids = []
for input_ids in all_input_ids:
modified_input_ids = torch.cat([start_token, input_ids, end_tokens], dim=1) # SOH SOT Text EOT EOH
all_modified_input_ids.append(modified_input_ids)
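# Left-pad each prompt to the batch max length with pad token 128263 and build
# matching attention masks (0 over the padding, 1 over the real tokens).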
all_padded_tensors = []
all_attention_masks = []
max_length = max([modified_input_ids.shape[1] for modified_input_ids in all_modified_input_ids])
for modified_input_ids in all_modified_input_ids:
padding = max_length - modified_input_ids.shape[1]
padded_tensor = torch.cat([torch.full((1, padding), 128263, dtype=torch.int64), modified_input_ids], dim=1)
attention_mask = torch.cat([torch.zeros((1, padding), dtype=torch.int64), torch.ones((1, modified_input_ids.shape[1]), dtype=torch.int64)], dim=1)
all_padded_tensors.append(padded_tensor)
all_attention_masks.append(attention_mask)
all_padded_tensors = torch.cat(all_padded_tensors, dim=0)
all_attention_masks = torch.cat(all_attention_masks, dim=0)
input_ids = all_padded_tensors.to("cuda")
attention_mask = all_attention_masks.to("cuda")
generated_ids = model.generate(
input_ids=input_ids,
attention_mask=attention_mask,
max_new_tokens=1200,
do_sample=True,
temperature=0.6,
top_p=0.95,
repetition_penalty=1.1,
num_return_sequences=1,
eos_token_id=128258,
use_cache = True
)
token_to_find = 128257    # marker token: the audio code stream begins after its last occurrence
token_to_remove = 128258  # EOS token (the same id passed as eos_token_id to generate above)
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
if len(token_indices[1]) > 0:
last_occurrence_idx = token_indices[1][-1].item()
cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
cropped_tensor = generated_ids
processed_rows = []
for row in cropped_tensor:
masked_row = row[row != token_to_remove]
processed_rows.append(masked_row)
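# Keep only whole SNAC frames (7 codes each) and subtract the audio-token
# offset of 128266 to recover raw codebook indices.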
code_lists = []
for row in processed_rows:
row_length = row.size(0)
new_length = (row_length // 7) * 7
trimmed_row = row[:new_length]
trimmed_row = [t - 128266 for t in trimmed_row]
code_lists.append(trimmed_row)
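# Re-interleave each flat 7-code frame into SNAC's three codebook layers
# (1 code for layer 1, 2 for layer 2, 4 for layer 3), removing the per-position
# offsets of k*4096 so every value indexes its own codebook.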
def redistribute_codes(code_list):
layer_1 = []
layer_2 = []
layer_3 = []
    for i in range(len(code_list) // 7):  # one frame = 7 codes; code_list was trimmed to a multiple of 7
layer_1.append(code_list[7*i])
layer_2.append(code_list[7*i+1]-4096)
layer_3.append(code_list[7*i+2]-(2*4096))
layer_3.append(code_list[7*i+3]-(3*4096))
layer_2.append(code_list[7*i+4]-(4*4096))
layer_3.append(code_list[7*i+5]-(5*4096))
layer_3.append(code_list[7*i+6]-(6*4096))
codes = [torch.tensor(layer_1).unsqueeze(0),
torch.tensor(layer_2).unsqueeze(0),
torch.tensor(layer_3).unsqueeze(0)]
# codes = [c.to("cuda") for c in codes]
audio_hat = snac_model.decode(codes)
return audio_hat
my_samples = []
for code_list in code_lists:
samples = redistribute_codes(code_list)
my_samples.append(samples)
output_path = "orpheus_audio.wav"
try:
for i, samples in enumerate(my_samples):
audio_data = samples.detach().squeeze().cpu().numpy()
        # soundfile (sf) is imported at the top of the file; 24000 Hz matches the SNAC 24 kHz decoder
        sf.write(output_path, audio_data, 24000)
print(f"✅ Audio saved to {output_path}!")
except Exception as e:
assert False, f"Inference failed with exception: {e}"
# Verify the file exists
import os
assert os.path.exists(output_path), f"Audio file not found at {output_path}"
print("✅ Audio file exists on disk!")
del my_samples, samples
## TODO: optionally transcribe the generated audio (e.g. with Whisper) and assert the transcription matches the prompt text.
print("✅ All sections passed successfully!")
safe_remove_directory("./unsloth_compiled_cache")
safe_remove_directory("./orpheus")
```
## /tests/utils/__init__.py
```py path="/tests/utils/__init__.py"
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
from contextlib import contextmanager
@contextmanager
def timer(name):
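    """Print how long the wrapped block took, labelled with `name`."""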
start = time.time()
yield
end = time.time()
print(f"{name} took {end - start:.2f} seconds")
@contextmanager
def header_footer_context(title: str, char="-"):
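    """Print a `char` banner containing `title` before the block and a closing rule after it."""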
print()
print(f"{char}" * 50 + f" {title} " + f"{char}" * 50)
yield
print(f"{char}" * (100 + len(title) + 2))
print()
```
## /tests/utils/perplexity_eval.md
# Language Model Perplexity Evaluator
A Python module for evaluating language models with perplexity, using a sliding-window approach so that long sequences can be scored within the model's context limit. The evaluator computes perplexity across a dataset and can track results from multiple models for side-by-side comparison; a sketch of the sliding-window computation follows the usage example below.
## Basic Usage
```python
from perplexity_eval import ppl_model, add_to_comparison, print_model_comparison
# Simple perplexity evaluation
dataset = {"text": ["Your text samples here...", "Another text sample..."]}
perplexity = ppl_model(model, tokenizer, dataset)
print(f"Model Perplexity: {perplexity:.4f}")
# Add to comparison tracker
add_to_comparison("My Model", perplexity)
print_model_comparison()
```
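## Sliding-Window Sketch
The snippet below is a minimal, self-contained sketch of the sliding-window idea described above, written against the standard Hugging Face `transformers` causal-LM API. The helper name `sliding_window_perplexity` and the `max_length`/`stride` defaults are illustrative assumptions, not claims about this module's internals.
```python
import math
import torch

def sliding_window_perplexity(model, tokenizer, text, max_length=1024, stride=512):
    """Perplexity of `text` under a causal LM, scored in overlapping windows."""
    device = next(model.parameters()).device
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
    seq_len = input_ids.size(1)

    nll_sum, n_scored, prev_end = 0.0, 0, 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        target_len = end - prev_end              # only score tokens not covered by the previous window
        window = input_ids[:, begin:end]
        labels = window.clone()
        labels[:, :-target_len] = -100           # mask the overlapping context so it is not double-counted
        with torch.no_grad():
            loss = model(window, labels=labels).loss   # mean NLL over the unmasked, shifted labels
        num_scored = int((labels[:, 1:] != -100).sum())  # tokens the mean was taken over
        nll_sum += loss.item() * num_scored
        n_scored += num_scored
        prev_end = end
        if end == seq_len:
            break
    return math.exp(nll_sum / n_scored)
```
Because perplexity is `exp` of the average per-token negative log-likelihood, accumulating the token-weighted loss across windows and exponentiating once at the end matches single-pass scoring whenever the text fits in one window.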
## /unsloth/kernels/moe/__init__.py
```py path="/unsloth/kernels/moe/__init__.py"
```
## /unsloth/kernels/moe/grouped_gemm/__init__.py
```py path="/unsloth/kernels/moe/grouped_gemm/__init__.py"
```