```
├── .clang-format
├── .github/
   ├── workflows/
      ├── main.yml (200 tokens)
├── .gitignore (700 tokens)
├── .vscode/
   ├── settings.json
├── LICENSE (omitted)
├── README.md (700 tokens)
├── batch-main.py (700 tokens)
├── benches/
   ├── test_quantized_matmul.py (200 tokens)
   ├── utils.py
├── book/
   ├── .gitignore
   ├── book.toml (100 tokens)
   ├── sitemap.sh
   ├── src/
      ├── SUMMARY.md (200 tokens)
      ├── copyright.md (100 tokens)
      ├── discord-badge.svg (500 tokens)
      ├── glossary.md (100 tokens)
      ├── preface.md (800 tokens)
      ├── setup.md (400 tokens)
      ├── sitemap.txt
      ├── sitemap.xml (200 tokens)
      ├── week1-01-attention.md (1300 tokens)
      ├── week1-02-positional-encodings.md (800 tokens)
      ├── week1-03-gqa.md (1100 tokens)
      ├── week1-04-rmsnorm-and-mlp.md (1000 tokens)
      ├── week1-05-qwen2-model.md (1000 tokens)
      ├── week1-06-generate-response.md (700 tokens)
      ├── week1-07-sampling-prepare.md (900 tokens)
      ├── week1-overview.md (700 tokens)
      ├── week2-overview.md (300 tokens)
      ├── week3-overview.md
   ├── theme/
      ├── head.hbs._
├── main.py (600 tokens)
├── pdm.lock (omitted)
├── pyproject.toml (300 tokens)
├── scripts/
   ├── check-installation.py (100 tokens)
   ├── dev-tools.py (500 tokens)
   ├── diff_api.py (500 tokens)
├── src/
   ├── extensions/
      ├── .clangd
      ├── CMakeLists.txt (400 tokens)
      ├── bindings.cpp (200 tokens)
      ├── build.py (100 tokens)
      ├── src/
         ├── axpby.cpp (2k tokens)
         ├── axpby.h (600 tokens)
         ├── axpby.metal (300 tokens)
         ├── tiny_llm_ext.h
         ├── utils.cpp (100 tokens)
      ├── test.py (100 tokens)
      ├── tiny_llm_ext/
         ├── __init__.py (100 tokens)
   ├── extensions_ref/
      ├── .clangd
      ├── CMakeLists.txt (500 tokens)
      ├── bindings.cpp (300 tokens)
      ├── build.py (100 tokens)
      ├── src/
         ├── flash_attention.cpp (2.4k tokens)
         ├── flash_attention.metal (900 tokens)
         ├── quantized_matmul.cpp (1900 tokens)
         ├── quantized_matmul.metal (500 tokens)
         ├── tiny_llm_ext.h (600 tokens)
         ├── utils.cpp (100 tokens)
      ├── test.py (100 tokens)
      ├── tiny_llm_ext_ref/
         ├── __init__.py (100 tokens)
   ├── tiny_llm/
      ├── __init__.py (100 tokens)
      ├── attention.py (200 tokens)
      ├── basics.py (100 tokens)
      ├── embedding.py (100 tokens)
      ├── generate.py (100 tokens)
      ├── kv_cache.py (200 tokens)
      ├── layer_norm.py
      ├── positional_encoding.py (100 tokens)
      ├── quantize.py (200 tokens)
      ├── qwen2_week1.py (400 tokens)
      ├── qwen2_week2.py (500 tokens)
      ├── sampler.py
   ├── tiny_llm_ref/
      ├── __init__.py (100 tokens)
      ├── attention.py (900 tokens)
      ├── basics.py (100 tokens)
      ├── embedding.py (100 tokens)
      ├── generate.py (1700 tokens)
      ├── kv_cache.py (1600 tokens)
      ├── layer_norm.py (100 tokens)
      ├── positional_encoding.py (500 tokens)
      ├── quantize.py (400 tokens)
      ├── qwen2_week1.py (1600 tokens)
      ├── qwen2_week2.py (1800 tokens)
      ├── sampler.py (200 tokens)
├── tests/
   ├── .gitignore
   ├── tiny_llm_base.py
   ├── utils.py (400 tokens)
├── tests_refsol/
   ├── test_rope.py (200 tokens)
   ├── test_week_1_day_1.py (1400 tokens)
   ├── test_week_1_day_2.py (500 tokens)
   ├── test_week_1_day_3.py (900 tokens)
   ├── test_week_1_day_4.py (700 tokens)
   ├── test_week_1_day_5.py (600 tokens)
   ├── test_week_1_day_6.py
   ├── test_week_1_day_7.py
   ├── test_week_2_day_2.py (300 tokens)
   ├── test_week_2_day_3.py (300 tokens)
   ├── tiny_llm_base.py
   ├── utils.py
```


## /.clang-format

```clang-format path="/.clang-format" 
BasedOnStyle: Google
DerivePointerAlignment: false
PointerAlignment: Right
ColumnLimit: 120
IndentWidth: 4
AccessModifierOffset: -4

```

## /.github/workflows/main.yml

```yml path="/.github/workflows/main.yml" 
name: CI (main)

on:
  push:
    branches:
      - main

env:
  CARGO_TERM_COLOR: always

jobs:
  build:
    runs-on: ubuntu-22.04
    steps:
      - uses: actions/checkout@v2
      - name: setup rust toolchain
        run: rustup update && rustup toolchain install
      - uses: dtolnay/rust-toolchain@stable
      - run: cargo install mdbook-katex
      - uses: taiki-e/install-action@mdbook
      - name: patch for gh-pages build
        run: mv book/theme/head.hbs._ book/theme/head.hbs
      - name: build book
        run: cd book && mdbook build
      - uses: actions/upload-pages-artifact@v3
        with:
          path: book/book

  deploy:
    needs: build
    permissions:
      pages: write
      id-token: write
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-22.04
    if: github.repository == 'skyzh/tiny-llm'
    steps:
      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4

```

## /.gitignore

```gitignore path="/.gitignore" 
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# UV
#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#uv.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

*.dylib
*.metallib

```

## /.vscode/settings.json

```json path="/.vscode/settings.json" 
{
    "cmake.ignoreCMakeListsMissing": true
}

```

## /README.md

# tiny-llm - LLM Serving in a Week

[![CI (main)](https://github.com/skyzh/tiny-llm/actions/workflows/main.yml/badge.svg)](https://github.com/skyzh/tiny-llm/actions/workflows/main.yml)

A course on LLM serving using MLX for system engineers. The codebase
is solely (almost!) based on MLX array/matrix APIs without any high-level neural network APIs, so that we
can build the model serving infrastructure from scratch and dig into the optimizations.

The goal is to learn the techniques behind efficiently serving a large language model (e.g., Qwen2 models).

Why MLX: nowadays it's easier to get a macOS-based local development environment than setting up an NVIDIA GPU.

Why Qwen2: this was the first LLM I've interacted with -- it's the go-to example in the vllm documentation. I spent some time looking at the vllm source code and built some knowledge around it.

## Book

The tiny-llm book is available at [https://skyzh.github.io/tiny-llm/](https://skyzh.github.io/tiny-llm/). You can follow the guide and start building.

## Community

You may join skyzh's Discord server and study with the tiny-llm community.

[![Join skyzh's Discord Server](book/src/discord-badge.svg)](https://skyzh.dev/join/discord)

## Roadmap

Week 1 is complete. Week 2 is in progress.

| Week + Chapter | Topic                                                       | Code | Test | Doc |
| -------------- | ----------------------------------------------------------- | ---- | ---- | --- |
| 1.1            | Attention                                                   | ✅    | ✅   | ✅  |
| 1.2            | RoPE                                                        | ✅    | ✅   | ✅  |
| 1.3            | Grouped Query Attention                                     | ✅    | ✅   | ✅  |
| 1.4            | RMSNorm and MLP                                             | ✅    | ✅   | ✅  |
| 1.5            | Load the Model                                              | ✅    | ✅   | ✅  |
| 1.6            | Generate Responses (aka Decoding)                           | ✅    | ✅   | ✅  |
| 1.7            | Sampling                                                    | ✅    | ✅   | ✅  |
| 2.1            | Key-Value Cache                                             | ✅    | 🚧   | 🚧  |
| 2.2            | Quantized Matmul and Linear - CPU                           | ✅    | 🚧   | 🚧  |
| 2.3            | Quantized Matmul and Linear - GPU                           | ✅    | 🚧   | 🚧  |
| 2.4            | Flash Attention 2 - CPU                                     | ✅    | 🚧   | 🚧  |
| 2.5            | Flash Attention 2 - GPU                                     | ✅    | 🚧   | 🚧  |
| 2.6            | Continuous Batching                                         | ✅    | 🚧   | 🚧  |
| 2.7            | Chunked Prefill                                             | ✅    | 🚧   | 🚧  |
| 3.1            | Paged Attention - Part 1                                    | 🚧    | 🚧   | 🚧  |
| 3.2            | Paged Attention - Part 2                                    | 🚧    | 🚧   | 🚧  |
| 3.3            | MoE (Mixture of Experts)                                    | 🚧    | 🚧   | 🚧  |
| 3.4            | Speculative Decoding                                        | 🚧    | 🚧   | 🚧  |
| 3.5            | Prefill-Decode Separation (requires two Macintosh devices)  | 🚧    | 🚧   | 🚧  |
| 3.6            | Parallelism                                                 | 🚧    | 🚧   | 🚧  |
| 3.7            | AI Agent     / Tool Calling                                 | 🚧    | 🚧   | 🚧  |

Other topics not covered: quantized/compressed kv cache, prefix/prompt cache; sampling, fine tuning; smaller kernels (softmax, silu, etc)


## /batch-main.py

```py path="/batch-main.py" 
from mlx_lm import load
import mlx.core as mx
import argparse
import random

# Batch-generation demo: builds a shuffled list of prompts, wraps each one in
# the tokenizer's chat template, and feeds them to `batch_generate` from either
# the student's `tiny_llm` package or the reference `tiny_llm_ref` package.

# Command-line options, declared in one place.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen2-7B-Instruct-MLX")
parser.add_argument("--solution", type=str, default="tiny_llm")
parser.add_argument("--device", type=str, default="gpu")
parser.add_argument("--batch-size", type=int, default=5)
parser.add_argument("--prefill-step", type=int, default=128)

# Long shared context used as a common prefix for half of the prompts.
shanghai_wikipedia = """
Shanghai[a] is a direct-administered municipality and the most populous urban area in China. The city is located on the Chinese shoreline on the southern estuary of the Yangtze River, with the Huangpu River flowing through it. The population of the city proper is the second largest in the world after Chongqing, with around 24.87 million inhabitants in 2023, while the urban area is the most populous in China, with 29.87 million residents. As of 2022, the Greater Shanghai metropolitan area was estimated to produce a gross metropolitan product (nominal) of nearly 13 trillion RMB ($1.9 trillion).[13] Shanghai is one of the world's major centers for finance, business and economics, research, science and technology, manufacturing, transportation, tourism, and culture. The Port of Shanghai is the world's busiest container port.
""".strip()

# The passage was stripped above, so an explicit leading space is required;
# without it the prompt reads "...container port.Based on the previous ...".
shanghai_wikipedia += " Based on the previous information, "

prompts = [
    shanghai_wikipedia + "Where is Shanghai?",
    shanghai_wikipedia + "How much is the population of Shanghai?",
    shanghai_wikipedia + "What is the GDP of Shanghai?",
    shanghai_wikipedia + "What is the population of Shanghai?",
    shanghai_wikipedia + "What is the second largest city proper in China?",
    shanghai_wikipedia + "What is Shanghai known for?",
    shanghai_wikipedia + "What are the rivers in Shanghai?",
    shanghai_wikipedia + "Shanghai is the major center for what?",
    "What is the capital of France?",
    "Where is New York City?",
    "Where is Tokyo?",
    "What is the capital of China?",
    "Where is Pittsburgh?",
    "Where is Vancouver?",
    "Where is Toronto?",
    "Give me a short introduction to large language model.",
]

# Shuffle so that long and short prompts are interleaved in the request order.
random.shuffle(prompts)

args = parser.parse_args()

# Pick the implementation under test. The imports are deferred so that only
# the selected package has to be importable.
if args.solution == "tiny_llm":
    print("Using your tiny_llm solution")
    from tiny_llm import Qwen2ModelWeek2, batch_generate

elif args.solution == "tiny_llm_ref" or args.solution == "ref":
    print("Using tiny_llm_ref solution")
    from tiny_llm_ref import Qwen2ModelWeek2, batch_generate

else:
    raise ValueError(f"Solution {args.solution} not supported")

mlx_model, tokenizer = load(args.model)

# Any --device value other than "gpu" selects the CPU stream.
with mx.stream(mx.gpu if args.device == "gpu" else mx.cpu):
    tiny_llm_model = Qwen2ModelWeek2(mlx_model)

    # Chat-templated prompt strings (tokenize=False, so these are still text).
    encoded_prompts = []
    for idx, prompt in enumerate(prompts):
        print(f"Prompt {idx}: {prompt}")
        messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt},
        ]
        # Use a separate name so the raw `prompt` is not shadowed by the
        # templated text.
        templated_prompt = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        encoded_prompts.append(templated_prompt)

    result = batch_generate(
        tiny_llm_model,
        tokenizer,
        encoded_prompts,
        batch_size=args.batch_size,
        prefill_step=args.prefill_step,
    )
    # Each result item is unpacked as (index into `prompts`, generated text).
    for prompt_idx, text in result:
        print(f"Q: {prompts[prompt_idx]}")
        print(f"A: {text}")

```

## /benches/test_quantized_matmul.py

```py path="/benches/test_quantized_matmul.py" 
import mlx.core as mx
import mlx.nn as nn
import tiny_llm_ref
from .utils import assert_allclose
import numpy as np


def get_test_matmul_data(group_size=64, bits=4):
    """Build the shared inputs and the expected output for the matmul benchmarks.

    A (512, 3584) float16 weight and a (300, 3584) float16 activation are
    initialised with He-uniform values; the weight is quantized with
    ``mx.quantize`` and the expected result is computed with
    ``mx.quantized_matmul``.

    Args:
        group_size: Quantization group size. Defaults to 64, the value the
            reference-solution benchmark passes to its kernel.
        bits: Bits per quantized weight. Defaults to 4, the value the
            reference-solution benchmark passes to its kernel.

    Returns:
        A tuple ``(w_q, scales, biases, x, res)`` where ``res`` is the
        expected product computed by MLX.
    """
    # Qwen2 7B matrix size
    init = nn.init.he_uniform(mx.float16)
    w = init(mx.zeros((512, 3584)))
    x = init(mx.zeros((300, 3584)))
    w_q, scales, biases = mx.quantize(w, group_size=group_size, bits=bits)
    res = mx.quantized_matmul(
        x, w_q, scales=scales, biases=biases, group_size=group_size, bits=bits
    )
    # MLX is lazy: force materialisation here so that the first timed
    # benchmark round does not also pay for init, quantize and this matmul.
    mx.eval(w_q, scales, biases, x, res)
    return w_q, scales, biases, x, res


def test_mlx_quantized_matmul(benchmark):
    """Benchmark MLX's built-in ``mx.quantized_matmul`` on the GPU stream.

    The timed callable forces evaluation of its result. MLX builds a lazy
    compute graph, so without ``mx.eval`` the benchmark would only time
    graph construction, not the kernel itself.
    """
    with mx.stream(mx.gpu):
        w_q, scales, biases, x, res = get_test_matmul_data()

        def run():
            out = mx.quantized_matmul(x, w_q, scales=scales, biases=biases)
            mx.eval(out)  # make the timed region include the actual compute
            return out

        result = benchmark(run)
        assert_allclose(result, res, precision=np.float16, rtol=1e-2)


def test_refsol_quantized_matmul(benchmark):
    """Benchmark the reference-solution ``quantized_matmul`` on the GPU stream.

    The result is compared against MLX's own ``mx.quantized_matmul`` output.
    The timed callable forces evaluation of its result; this assumes the
    reference kernel returns a lazy MLX array like the built-in op does, in
    which case timing without ``mx.eval`` would only measure graph
    construction.
    """
    with mx.stream(mx.gpu):
        w_q, scales, biases, x, res = get_test_matmul_data()

        def run():
            # 64 and 4 are presumably group_size and bits, matching the
            # mx.quantize defaults used to build the test data.
            out = tiny_llm_ref.quantized_matmul(
                scales, biases, 64, 4, x, w_q, transpose_b=True
            )
            mx.eval(out)  # make the timed region include the actual compute
            return out

        result = benchmark(run)
        assert_allclose(result, res, precision=np.float16, rtol=1e-2)

```

## /benches/utils.py

```py path="/benches/utils.py" 
../tests/utils.py
```

## /book/.gitignore

```gitignore path="/book/.gitignore" 
book

```

## /book/book.toml

```toml path="/book/book.toml" 
[book]
authors = ["Alex Chi", "Connor Zhang"]
language = "en"
multilingual = false
src = "src"
title = "Tiny LLM - LLM Serving in a Week"

[preprocessor.toc]
command = "mdbook-toc"
renderer = ["html"]

[output.html]
git-repository-url = "https://github.com/skyzh/tiny-llm"

[preprocessor.katex]
after = ["links"]

```

## /book/sitemap.sh

```sh path="/book/sitemap.sh" 
#!/bin/bash

mdbook build
static-sitemap-cli -b https://skyzh.github.io/tiny-llm -r book -f xml -o > src/sitemap.xml
static-sitemap-cli -b https://skyzh.github.io/tiny-llm -r book -f txt -o > src/sitemap.txt

```

## /book/src/SUMMARY.md

# LLM Serving in a Week

[Preface](./preface.md)
[Setting Up the Environment](./setup.md)

---

- [Week 1: From Matmul to Text](./week1-overview.md)
    - [Attention and Multi-Head Attention](./week1-01-attention.md)
    - [Positional Encodings and RoPE](./week1-02-positional-encodings.md)
    - [Grouped/Multi Query Attention](./week1-03-gqa.md)
    - [RMSNorm and MLP](./week1-04-rmsnorm-and-mlp.md)
    - [The Qwen2 Model](./week1-05-qwen2-model.md)
    - [Generating the Response](./week1-06-generate-response.md)
    - [Sampling and Preparing for Week 2](./week1-07-sampling-prepare.md)
    <!--
    - [Attention and Multi-Head Attention](./week1-01-attention.md)
    - [Positional Embeddings and RoPE](./week1-02-positional-embeddings.md)
    - [Grouped/Multi Query Attention](./week1-03-gqa.md)
    - [Multilayer Perceptron Layer and Transformer](./week1-04-mlp-transformer.md)
    - [Wiring the Qwen2 Model](./week1-05-model-1.md)
    - [Loading the Model](./week1-06-model-2.md)
    - [Generating the Response](./week1-07-generate.md)
    -->

- [Week 2: Optimizing]()

- [Week 3: Serving]()

---

[Glossary Index](./glossary.md)


## /book/src/copyright.md

<p style="text-align: center; margin-top: 3em"><small>Your feedback is greatly appreciated. Welcome to join our <a href="https://skyzh.dev/join/discord">Discord Community</a>.<br>Found an issue? Create an issue / pull request on <a href="https://github.com/skyzh/tiny-llm">github.com/skyzh/tiny-llm</a>.<br>tiny-llm-book © 2025 by Alex Chi Z is licensed under CC BY-NC-SA 4.0.</small></p>


## /book/src/discord-badge.svg

```svg path="/book/src/discord-badge.svg" 
<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="145.5" height="28" role="img" aria-label="MDbot#9808"><title>MDbot#9808</title><g shape-rendering="crispEdges"><rect width="32" height="28" fill="#5865f2"/><rect x="32" width="113.5" height="28" fill="#555"/></g><g fill="#fff" text-anchor="middle" font-family="Verdana,Geneva,DejaVu Sans,sans-serif" text-rendering="geometricPrecision" font-size="100"><image x="9" y="7" width="14" height="14" xlink:href="data:image/svg+xml;base64,PHN2ZyBmaWxsPSJ3aGl0ZSIgcm9sZT0iaW1nIiB2aWV3Qm94PSIwIDAgMjQgMjQiIHhtbG5zPSJodHRwOi8vd3d3LnczLm9yZy8yMDAwL3N2ZyI+PHRpdGxlPkRpc2NvcmQ8L3RpdGxlPjxwYXRoIGQ9Ik0yMC4zMTcgNC4zNjk4YTE5Ljc5MTMgMTkuNzkxMyAwIDAwLTQuODg1MS0xLjUxNTIuMDc0MS4wNzQxIDAgMDAtLjA3ODUuMDM3MWMtLjIxMS4zNzUzLS40NDQ3Ljg2NDgtLjYwODMgMS4yNDk1LTEuODQ0Ny0uMjc2Mi0zLjY4LS4yNzYyLTUuNDg2OCAwLS4xNjM2LS4zOTMzLS40MDU4LS44NzQyLS42MTc3LTEuMjQ5NWEuMDc3LjA3NyAwIDAwLS4wNzg1LS4wMzcgMTkuNzM2MyAxOS43MzYzIDAgMDAtNC44ODUyIDEuNTE1LjA2OTkuMDY5OSAwIDAwLS4wMzIxLjAyNzdDLjUzMzQgOS4wNDU4LS4zMTkgMTMuNTc5OS4wOTkyIDE4LjA1NzhhLjA4MjQuMDgyNCAwIDAwLjAzMTIuMDU2MWMyLjA1MjggMS41MDc2IDQuMDQxMyAyLjQyMjggNS45OTI5IDMuMDI5NGEuMDc3Ny4wNzc3IDAgMDAuMDg0Mi0uMDI3NmMuNDYxNi0uNjMwNC44NzMxLTEuMjk1MiAxLjIyNi0xLjk5NDJhLjA3Ni4wNzYgMCAwMC0uMDQxNi0uMTA1N2MtLjY1MjgtLjI0NzYtMS4yNzQzLS41NDk1LTEuODcyMi0uODkyM2EuMDc3LjA3NyAwIDAxLS4wMDc2LS4xMjc3Yy4xMjU4LS4wOTQzLjI1MTctLjE5MjMuMzcxOC0uMjkxNGEuMDc0My4wNzQzIDAgMDEuMDc3Ni0uMDEwNWMzLjkyNzggMS43OTMzIDguMTggMS43OTMzIDEyLjA2MTQgMGEuMDczOS4wNzM5IDAgMDEuMDc4NS4wMDk1Yy4xMjAyLjA5OS4yNDYuMTk4MS4zNzI4LjI5MjRhLjA3Ny4wNzcgMCAwMS0uMDA2Ni4xMjc2IDEyLjI5ODYgMTIuMjk4NiAwIDAxLTEuODczLjg5MTQuMDc2Ni4wNzY2IDAgMDAtLjA0MDcuMTA2N2MuMzYwNC42OTguNzcxOSAxLjM2MjggMS4yMjUgMS45OTMyYS4wNzYuMDc2IDAgMDAuMDg0Mi4wMjg2YzEuOTYxLS42MDY3IDMuOTQ5NS0xLjUyMTkgNi4wMDIzLTMuMDI5NGEuMDc3LjA3NyAwIDAwLjAzMTMtLjA1NTJjLjUwMDQtNS4xNzctLjgzODItOS42NzM5LTMuNTQ4NS0xMy42NjA0YS4wNjEuMDYxIDAgMDAtLjAzMTItLjAyODZ6TTguMDIgMTUuMzMxMmMtMS4xODI1IDAtMi4xNTY5LTEuMDg1Ny0yLj
E1NjktMi40MTkgMC0xLjMzMzIuOTU1NS0yLjQxODkgMi4xNTctMi40MTg5IDEuMjEwOCAwIDIuMTc1NyAxLjA5NTIgMi4xNTY4IDIuNDE5IDAgMS4zMzMyLS45NTU1IDIuNDE4OS0yLjE1NjkgMi40MTg5em03Ljk3NDggMGMtMS4xODI1IDAtMi4xNTY5LTEuMDg1Ny0yLjE1NjktMi40MTkgMC0xLjMzMzIuOTU1NC0yLjQxODkgMi4xNTY5LTIuNDE4OSAxLjIxMDggMCAyLjE3NTcgMS4wOTUyIDIuMTU2OCAyLjQxOSAwIDEuMzMzMi0uOTQ2IDIuNDE4OS0yLjE1NjggMi40MTg5WiIvPjwvc3ZnPg=="/><text font-weight='bold' transform="scale(.1)" x="887.5" y="175" textLength="895" fill="#fff" >skyzh's server</text></g></svg>
```

## /book/src/glossary.md

# Glossary Index

- [Scaled Dot Product Attention](./week1-01-attention.md)
- [Multi Head Attention](./week1-01-attention.md)
- [Linear](./week1-01-attention.md)
- [Rotary Positional Encoding](./week1-02-positional-encodings.md)
- [Grouped Query Attention](./week1-03-gqa.md)
- [Qwen2 Attention Module](./week1-03-gqa.md)
- [RMSNorm](./week1-04-rmsnorm-and-mlp.md)
- [SiLU](./week1-04-rmsnorm-and-mlp.md)
- [SwiGLU](./week1-04-rmsnorm-and-mlp.md)
- [MLP](./week1-04-rmsnorm-and-mlp.md)
- [Embedding](./week1-05-qwen2-model.md)
- [Qwen2 Transformer Block](./week1-05-qwen2-model.md)
- [Week 1 Qwen2 Model](./week1-05-qwen2-model.md)
- [dequantize_linear](./week1-05-qwen2-model.md)

{{#include copyright.md}}


## /book/src/preface.md

# Preface

This course is designed for systems engineers who want to understand how LLMs work.

As a system engineer, I always wonder how things work internally and how to optimize them. I had a hard time figuring out
the LLM stuff. Most of the open source projects that serve LLMs are highly optimized with CUDA kernels and other low-level
optimizations. It is not easy to understand the whole picture by looking at a codebase of 100k lines of code. Therefore, I
decided to implement an LLM serving project from scratch -- with only matrix manipulations APIs, so that I can understand
what it takes to load those LLM model parameters and do the math magic to generate text.

You can think of this course as an LLM version of CMU Deep Learning Systems course's [needle](https://github.com/dlsyscourse/hw1/tree/main/python/needle) project.

## Prerequisites

You should have some experience with the basics of deep learning and have some idea of how PyTorch works. Some recommended
resources are:

- CMU [Intro to Machine Learning](https://www.cs.cmu.edu/~mgormley/courses/10601/) -- this course teaches you the basics of machine learning
- CMU [Deep Learning Systems](https://dlsyscourse.org) -- this course teaches you how to build PyTorch from scratch

## Environment Setup

This course uses [MLX](https://github.com/ml-explore/mlx), an array/machine learning library for Apple Silicon. Nowadays
it's much easier to get an Apple Silicon device than NVIDIA GPUs. In theory you can also do this course with PyTorch or
numpy, but we just don't have the test infra to support them. We test your implementation against PyTorch's CPU implementation
and MLX's implementation to ensure correctness.

## Course Structure

This course is divided into 3 weeks. We will serve the Qwen2-7B-Instruct model and optimize it throughout the course.

- Week 1: serve Qwen2 with purely matrix manipulation APIs. Just Python.
- Week 2: optimizations, implement C++/Metal custom kernels to make the model run faster.
- Week 3: more optimizations, batch the requests to serve the model with high throughput.

## How to Use This Book

The thing you are reading right now is the tiny-llm book. It is designed more like a guidebook instead of a textbook
that explains everything from scratch. In this course, we provide the materials that we find useful on the Internet
when the author(s) implemented the tiny-llm project. The Internet does a better job of explaining the concepts and I
do not think it is necessary to repeat everything here. Think of this as a guide (of a list of tasks) and some hints!
We will also unify the language of the Internet materials so that it is easier to correspond them to the codebase.
For example, we will use unified dimension symbols for the tensors. You do not need to figure out what `H`, `L`, `E`
stand for and what dimensions of the matrices are passed into the function.

## About the Authors

This course is created by [Chi](https://github.com/skyzh) and [Connor](https://github.com/Connor1996).

Chi is a systems software engineer at [Neon](https://neon.tech) (now acquired by Databricks), focusing on storage systems.
Fascinated by the vibe of large language models (LLMs), he created this course to explore how LLM inference works.

Connor is a software engineer at [PingCAP](https://pingcap.com), developing the TiKV distributed key-value database.
Curious about the internals of LLMs, he joined this course to practice how to build a high-performance LLM serving system
from scratch, and contributed to building the course for the community.

## Community

You may join skyzh's Discord server and study with the tiny-llm community.

[![Join skyzh's Discord Server](discord-badge.svg)](https://skyzh.dev/join/discord)

## Get Started

Now, you can start to set up the environment following the instructions in [Setting Up the Environment](./setup.md) and
begin your journey to build tiny-llm!

{{#include copyright.md}}


## /book/src/setup.md

# Setting Up the Environment

To follow along with this course, you will need a Macintosh device with Apple Silicon. We manage the codebase with pdm.

## Install pdm

Please follow the [official guide](https://pdm-project.org/en/latest/) to install pdm.

## Clone the Repository

```bash
git clone https://github.com/skyzh/tiny-llm
```

The repository is organized as follows:

```
src/tiny_llm -- your implementation
src/tiny_llm_ref -- reference implementation
tests/ -- unit tests for your implementation
tests_refsol/ -- unit tests for the reference implementation
book/ -- the book
```

We provide all reference implementations and you can refer to them if you get stuck in the course.

## Install Dependencies

```bash
cd tiny-llm
pdm install -v # this will automatically create a virtual environment and install all dependencies
```

## Check the Installation

```bash
pdm run check-installation
# The reference solution should pass all the *week 1* tests
pdm run test-refsol -- -- -k week_1
```

## Run Unit Tests

Your code is in `src/tiny_llm`. You can run the unit tests with:

```bash
pdm run test
```

## Download the Model Parameters

We will use the Qwen2-7B-Instruct model for this course. It takes ~20GB of memory in week 1 to load the model parameters.
If you do not have enough memory, you can consider using the smaller 0.5B model.

Follow the guide of [this page](https://huggingface.co/docs/huggingface_hub/main/en/guides/cli) to install the huggingface
cli.

The model parameters are hosted on Hugging Face. Once you have authenticated the CLI with your credentials, you can download
them with:

```bash
huggingface-cli login
huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
```

Then, you can run:

```bash
pdm run main --solution ref --loader week1
```

It should load the model and print some text.

In week 2, we will write some kernels in C++/Metal, and we will need to set up additional tools for that. We will cover it later.

{{#include copyright.md}}


## /book/src/sitemap.txt

https://skyzh.github.io/tiny-llm
https://skyzh.github.io/tiny-llm/glossary
https://skyzh.github.io/tiny-llm/preface
https://skyzh.github.io/tiny-llm/setup
https://skyzh.github.io/tiny-llm/week1-overview


## /book/src/sitemap.xml

```xml path="/book/src/sitemap.xml" 
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
    <url>
        <loc>https://skyzh.github.io/tiny-llm</loc>
        <lastmod>2025-04-26T03:38:02.503Z</lastmod>
    </url>
    <url>
        <loc>https://skyzh.github.io/tiny-llm/glossary</loc>
        <lastmod>2025-04-26T03:38:02.504Z</lastmod>
    </url>
    <url>
        <loc>https://skyzh.github.io/tiny-llm/preface</loc>
        <lastmod>2025-04-26T03:38:02.503Z</lastmod>
    </url>
    <url>
        <loc>https://skyzh.github.io/tiny-llm/setup</loc>
        <lastmod>2025-04-26T03:38:02.504Z</lastmod>
    </url>
    <url>
        <loc>https://skyzh.github.io/tiny-llm/week1-overview</loc>
        <lastmod>2025-04-26T03:38:02.504Z</lastmod>
    </url>
</urlset>

```

## /book/src/week1-01-attention.md

# Week 1 Day 1: Attention and Multi-Head Attention

In day 1, we will implement the basic attention layer and the multi-head attention layer. Attention layers take an input
sequence and focus on different parts of the sequence when generating the output. Attention layers are the key building
blocks of the Transformer models.

[📚 Reading: Transformer Architecture](https://huggingface.co/learn/llm-course/chapter1/6)

We use the Qwen2 model for text generation. The model is a decoder-only model. The input of the model is a sequence of
token embeddings. The output of the model is the most likely next token ID.

[📚 Reading: LLM Inference, the Decode Phase](https://huggingface.co/learn/llm-course/chapter1/8)

Back to the attention layer. The attention layer takes a query, a key, and a value. In a classic implementation, all
of them are of the same shape: `N.. x L x D`.

`N..` is zero or more batch dimensions. Within each batch, `L` is the sequence length and `D` is
the dimension of the embedding for a given head in the sequence.

So, for example, if we have a sequence of 1024 tokens, where each token has a 512-dimensional embedding (head_dim),
we will pass a tensor of the shape `N.. x 1024 x 512` to the attention layer.

## Task 1: Implement `scaled_dot_product_attention_simple`

In this task, we will implement the scaled dot product attention function. We assume the input tensors (Q, K, V) have
the same dimensions. In the next few chapters, we will support more variants of attentions that might not have the same
dimensions for all tensors.

```
src/tiny_llm/attention.py
```

**📚 Readings**

* [Annotated Transformer](https://nlp.seas.harvard.edu/annotated-transformer/)
* [PyTorch Scaled Dot Product Attention API](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) (assume `enable_gqa=False`, assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
* [MLX Scaled Dot Product Attention API](https://ml-explore.github.io/mlx/build/html/python/_autosummary/mlx.core.fast.scaled_dot_product_attention.html) (assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
* [Attention is All You Need](https://arxiv.org/abs/1706.03762)

Implement `scaled_dot_product_attention_simple` following the attention function below. The function takes key, value, and query of the same dimensions, and an optional mask matrix `M`.

$$
  \text{Attention} = \text{softmax}(\frac{QK^T}{\sqrt{d_k}} + M)V
$$

Note that $\frac{1}{\sqrt{d_k}}$ is the scale factor. The user might specify their own scale factor or use the default one.

```
L is seq_len, in PyTorch API it's S (source len)
D is head_dim

key: N.. x L x D
value: N.. x L x D
query: N.. x L x D
output: N.. x L x D
scale = 1/sqrt(D) if not specified
```

You may use `softmax` provided by mlx and implement it later in week 2.

Because we are always using the attention layer within the multi-head attention layer, the actual tensor shape when serving
the model will be:

```
key: 1 x H x L x D
value: 1 x H x L x D
query: 1 x H x L x D
output: 1 x H x L x D
mask: 1 x H x L x L
```

.. though the attention layer only cares about the last two dimensions. The test case will test any shape of the batching dimension.

At the end of this task, you should be able to pass the following tests:

```
pdm run test --week 1 --day 1 -- -k task_1
```

## Task 2: Implement `SimpleMultiHeadAttention`

In this task, we will implement the multi-head attention layer.

```
src/tiny_llm/attention.py
```

**📚 Readings**

* [Annotated Transformer](https://nlp.seas.harvard.edu/annotated-transformer/)
* [PyTorch MultiheadAttention API](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html) (assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
* [MLX MultiHeadAttention API](https://ml-explore.github.io/mlx/build/html/python/nn/_autosummary/mlx.nn.MultiHeadAttention.html) (assume dim_k=dim_v=dim_q and H_k=H_v=H_q)
* [The Illustrated GPT-2 (Visualizing Transformer Language Models)](https://jalammar.github.io/illustrated-gpt2) helps you better understand what key, value, and query are.

Implement `SimpleMultiHeadAttention`. The layer takes a batch of vectors, maps it through the K, V, Q weight matrices, and uses the attention function we implemented in task 1 to compute the result. The output needs to be mapped using the O
weight matrix.

You will also need to implement the `linear` function in `basics.py` first. For `linear`, it takes a tensor of the shape `N.. x I`, a weight matrix of the shape `O x I`, and a bias vector of the shape `O`. The output is of the shape `N.. x O`. `I` is the input dimension and `O` is the output dimension.

For the `SimpleMultiHeadAttention` layer, the input tensors `query`, `key`, `value` have the shape `N x L x E`, where `E` is the dimension of the
embedding for a given token in the sequence. The `K/Q/V` weight matrixes will map the tensor into key, value, and query
separately, where the dimension `E` will be mapped into a dimension of size `H x D`, which means that the token embedding
gets mapped into `H` heads, each with a dimension of `D`. You can directly reshape the tensor to split the `H x D` dimension
into two dimensions of `H` and `D` to get `H` heads for the token.

Now, you have a tensor of the shape `N.. x L x H x D` for each of the key, value, and query. To apply the attention function, you first need to transpose them into shape `N.. x H x L x D`.

* This makes each attention head an independent batch, so that attention can be calculated separately for each head across the sequence `L`.
* If you kept `H` behind `L`, attention calculation would mix head and sequence dimensions, which is not what we want — each head should focus only on the relationships between tokens in its own subspace.

The attention function produces output for each head of each token. Then, you can transpose it back into `N.. x L x H x D` and reshape it
so that all heads get merged back together with a shape of `N.. x L x (H x D)`. Map it through the output weight matrix to get
the final output.

```
E is hidden_size or embed_dim or dims or model_dim
H is num_heads
D is head_dim
L is seq_len, in PyTorch API it's S (source len)

w_q/w_k/w_v: E x (H x D)
output/input: N x L x E
w_o: (H x D) x E
```

At the end of the task, you should be able to pass the following tests:

```
pdm run test --week 1 --day 1 -- -k task_2
```

You can run all tests for the day with:

```
pdm run test --week 1 --day 1
```

{{#include copyright.md}}


## /book/src/week1-02-positional-encodings.md

# Week 1 Day 2: Positional Encodings and RoPE

In day 2, we will implement the positional embedding used in the Qwen2 model: Rotary Positional Encoding. In a transformer
model, we need a way to embed the information of the position of a token into the input of the attention layers. In Qwen2,
positional embedding is applied within the multi head attention layer on the query and key vectors.

**📚 Readings**

- [You could have designed state of the art positional encoding](https://huggingface.co/blog/designing-positional-encoding)
- [Roformer: Enhanced Transformer with Rotary Positional Encoding](https://arxiv.org/pdf/2104.09864)

## Task 1: Implement Rotary Positional Encoding "RoPE"

You will need to modify the following file:

```
src/tiny_llm/positional_encoding.py
```

In traditional RoPE (as described in the readings), the positional encoding is applied to each head of the query and key vectors.
You can pre-compute the frequencies when initializing the `RoPE` class.

If `offset` is not provided, the positional encoding will be applied to the entire sequence: 0th frequency applied to the
0th token, up to the (L-1)-th token. Otherwise, the positional encoding will be applied to the sequence according to the
offset slice. If the offset slice is 5..10, then the sequence length provided to the layer would be 5, and the 0th token
will be applied with the 5th frequency.

You *only* need to consider `offset` being `None` or a single slice. The `list[slice]` case will be implemented when we
start implementing the continuous batching feature. Assume all batches provided use the same offset.

```
x: (N, L, H, D)
cos/sin_freqs: (MAX_SEQ_LEN, D // 2)
```

In the traditional form of RoPE, each head on the dimension of `D` is viewed as consecutive complex pairs. That is to
say, if D = 8, then, x[0] and x[1] are a pair, x[2] and x[3] are another pair, and so on. A pair gets the same frequency
from `cos/sin_freqs`.

Note that, practically, D can be even or odd. In the case of D being odd, the last dimension of `x` doesn’t have a matching pair,
and is typically left untouched in most implementations. For simplicity, we just assume that D is always even.

```
output[0] = x[0] * cos_freqs[0] + x[1] * -sin_freqs[0]
output[1] = x[0] * sin_freqs[0] + x[1] * cos_freqs[0]
output[2] = x[2] * cos_freqs[1] + x[3] * -sin_freqs[1]
output[3] = x[2] * sin_freqs[1] + x[3] * cos_freqs[1]
...and so on
```

You can do this by reshaping `x` to (N, L, H, D // 2, 2) and then applying the above formula to each pair.

**📚 Readings**

- [PyTorch RotaryPositionalEmbeddings API](https://pytorch.org/torchtune/stable/generated/torchtune.modules.RotaryPositionalEmbeddings.html)
- [MLX Implementation of RoPE before the custom metal kernel implementation](https://github.com/ml-explore/mlx/pull/676/files)

You can test your implementation by running the following command:

```
pdm run test --week 1 --day 2 -- -k task_1
```

## Task 2: Implement `RoPE` in the non-traditional form

The Qwen2 model uses a non-traditional form of RoPE. In this form, the head embedding dimension is split into two halves,
and the two halves are applied with different frequencies. Let's say `x1 = x[.., :HALF_DIM]` and `x2 = x[.., HALF_DIM:]`.

```
output[0] = x1[0] * cos_freqs[0] + x2[0] * -sin_freqs[0]
output[HALF_DIM] = x1[0] * sin_freqs[0] + x2[0] * cos_freqs[0]
output[1] = x1[1] * cos_freqs[1] + x2[1] * -sin_freqs[1]
output[HALF_DIM + 1] = x1[1] * sin_freqs[1] + x2[1] * cos_freqs[1]
...and so on
```

You can do this by directly getting the first half / second half of the embedding dimension of `x` and applying the
frequencies to each half separately.

**📚 Readings**

- [vLLM implementation of RoPE](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/rotary_embedding.py)


You can test your implementation by running the following command:

```
pdm run test --week 1 --day 2 -- -k task_2
```

At the end of the day, you should be able to pass all tests of this day:

```
pdm run test --week 1 --day 2
```

{{#include copyright.md}}


## /book/src/week1-03-gqa.md

# Week 1 Day 3: Grouped Query Attention (GQA)

In day 3, we will implement Grouped Query Attention (GQA). The Qwen2 models use GQA which is an optimization technique for multi-head attention that reduces the computational and memory costs associated with the Key (K) and Value (V) projections. Instead of each Query (Q) head having its own K and V heads (like in Multi-Head Attention, MHA), multiple Q heads share the same K and V heads. Multi-Query Attention (MQA) is a special case of GQA where all Q heads share a single K/V head pair.


**Readings**

*   [GQA Paper (Training Generalized Multi-Query Transformer Models from Pre-Trained Checkpoints)](https://arxiv.org/abs/2305.13245)
*   [Qwen layers implementation in mlx-lm](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/models/qwen2.py)
*   [PyTorch API (the case where enable_gqa=True)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
*   [torchtune.modules.MultiHeadAttention](https://pytorch.org/torchtune/0.3/generated/torchtune.modules.MultiHeadAttention.html)

## Task 1: Implement `scaled_dot_product_attention_grouped`

You will need to modify the following file:

```
src/tiny_llm/attention.py
```

In this task, we will implement the grouped scaled dot product attention function, which forms the core of GQA.

Implement `scaled_dot_product_attention_grouped` in `src/tiny_llm/attention.py`. This function is similar to the standard scaled dot product attention, but handles the case where the number of query heads is a multiple of the number of key/value heads.

The main process is the same as the standard scaled dot product attention. The difference is that the K and V heads are shared across multiple Q heads. This means that instead of having `H_q` separate K and V heads, we have `H` K and V heads, and each K and V head is shared by `n_repeats = H_q // H` Q heads.

The core idea is to reshape `query`, `key`, and `value` so that the K and V tensors can be effectively broadcasted to match the query heads within their groups during the `matmul` operations.

*   Think about how to isolate the `H` and `n_repeats` dimensions in the `query` tensor.
*   Consider adding a dimension of size 1 for `n_repeats` in the `key` and `value` tensors to enable broadcasting.

Then perform the scaled dot product attention calculation (`matmul`, scale, optional mask, `softmax`, `matmul`). Broadcasting should handle the head repetition implicitly.

Note that leveraging broadcasting instead of repeating the K and V tensors is more efficient. This is because broadcasting allows the same data to be used in multiple places without creating multiple copies of the data, which can save memory and improve performance.

At last, don't forget to reshape the final result back to the expected output shape.

```
N.. is zero or more dimensions for batches
H_q is the number of query heads
H is the number of key/value heads (H_q must be divisible by H)
L is the query sequence length
S is the key/value sequence length
D is the head dimension

query: N.. x H_q x L x D
key: N.. x H x S x D
value: N.. x H x S x D
mask: N.. x H_q x L x S
output: N.. x H_q x L x D
```

Please note that besides the grouped heads, we also extend the implementation so that the query sequence length `L` and
the key/value sequence length `S` may differ.

You can test your implementation by running the following command:

```bash
pdm run test --week 1 --day 3 -- -k task_1
```

## Task 2: Causal Masking

**Readings**

- [Writing an LLM from scratch, part 9 -- causal attention](https://www.gilesthomas.com/2025/03/llm-from-scratch-9-causal-attention)

In this task, we will implement the causal masking for the grouped attention.

The causal masking is a technique that prevents the attention mechanism from attending to future tokens in the sequence.
When `mask` is set to `causal`, we will apply the causal mask.

The causal mask is a matrix of shape `(L, S)`, where `L` is the query sequence length and `S` is the key/value sequence length.
Its diagonal is aligned to the bottom-right corner: the last query row can attend to all `S` keys, and each earlier row sees one key fewer. Allowed positions are 0 and masked positions are -inf. For example, if `L = 3` and `S = 5`, the mask will be:

```
0   0   0   -inf -inf
0   0   0   0    -inf
0   0   0   0    0
```

Please implement the `causal_mask` function in `src/tiny_llm/attention.py` and then use it in the `scaled_dot_product_attention_grouped` function. Also note that our causal mask diagonal position is different from the PyTorch API.

You can test your implementation by running the following command:

```bash
pdm run test --week 1 --day 3 -- -k task_2
```

## Task 3: Qwen2 Grouped Query Attention

In this task, we will implement the Qwen2 Grouped Query Attention. You will need to modify the following file:

```
src/tiny_llm/qwen2_week1.py
```

`Qwen2MultiHeadAttention` implements the multi-head attention for Qwen2. You will need to implement the following pseudo code:

```
x: B, L, E
q = linear(x, wq, bq) -> B, L, H_q, D
k = linear(x, wk, bk) -> B, L, H, D
v = linear(x, wv, bv) -> B, L, H, D
q = rope(q, offset=slice(offset, offset + L))
k = rope(k, offset=slice(offset, offset + L))
(transpose as needed)
x = scaled_dot_product_attention_grouped(q, k, v, scale, mask) -> B, L, H_q, D ; Do this at float32 precision
(transpose as needed)
x = linear(x, wo) -> B, L, E
```

You can test your implementation by running the following command:

```bash
pdm run test --week 1 --day 3 -- -k task_3
```

At the end of the day, you should be able to pass all tests of this day:

```bash
pdm run test --week 1 --day 3
```

{{#include copyright.md}}


## /book/src/week1-04-rmsnorm-and-mlp.md

# Week 1 Day 4: RMSNorm and Multi-Layer Perceptron

In day 4, we will implement two crucial components of the Qwen2 Transformer architecture: RMSNorm and the MLP (Multi-Layer Perceptron) block, also known as the FeedForward Network. RMSNorm is a layer normalization technique that helps stabilize training with less computational overhead compared to traditional layer normalization. The MLP block is a feedforward network that processes the output of the attention layers, applying non-linear transformations to enhance the model's expressiveness.


## Task 1: Implement `RMSNorm`

In this task, we will implement the `RMSNorm` layer.

```
src/tiny_llm/layer_norm.py
```

**📚 Readings**

* [Root Mean Square Layer Normalization](https://arxiv.org/abs/1910.07467)
* [Qwen2 layers implementation in mlx-lm (includes RMSNorm)](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/models/qwen2.py) - See `Qwen2RMSNorm`.


RMSNorm is defined as:

$$
y = \frac{x}{\sqrt{\text{mean}(x^2) + \epsilon}} \cdot \text{weight}
$$

Where:
-   `x` is the input tensor.
-   `weight` is a learnable scaling parameter.
-   `epsilon` (eps) is a small constant added for numerical stability (e.g., 1e-5 or 1e-6).
-   `mean(x^2)` is the sum of squares and then division by the number of elements.

The normalization is applied independently to each sample’s feature vector, typically over the last dimension of input.
Note that, mean calculation should be performed with `float32` accumulation to maintain precision before taking the square root, even if the input and weights are in a lower precision format (e.g., `float16` or `bfloat16`).

```
D is the embedding dimension.

x: N.. x D
weight: D
output: N.. x D
```

You can test your implementation by running:

```bash
pdm run test --week 1 --day 4 -- -k task_1
```

## Task 2: Implement the MLP Block

In this task, we will implement the MLP block named `Qwen2MLP`.

```
src/tiny_llm/qwen2_week1.py
```

The original Transformer model utilized a simple Feed-Forward Network (FFN) within each block. This FFN typically consisted of two linear transformations with a ReLU activation in between, applied position-wise.

Modern Transformer architectures, including Qwen2, often employ more advanced FFN variants for improved performance. Qwen2 uses a specific type of Gated Linear Unit (GLU) called SwiGLU.

**📚 Readings**
* [Attention is All You Need (Transformer Paper, Section 3.3 "Position-wise Feed-Forward Networks")](https://arxiv.org/abs/1706.03762)
* [GLU Paper(Language Modeling with Gated Convolutional Networks)](https://arxiv.org/pdf/1612.08083)
* [SiLU (Swish) activation function](https://arxiv.org/pdf/1710.05941)
* [SwiGLU Paper(GLU Variants Improve Transformer)](https://arxiv.org/abs/2002.05202v1)
* [PyTorch SiLU documentation](https://pytorch.org/docs/stable/generated/torch.nn.SiLU.html)
* [Qwen2 layers implementation in mlx-lm (includes MLP)](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/models/qwen2.py)

Essentially, SwiGLU is a combination of GLU and the SiLU (Sigmoid Linear Unit) activation function:
-  GLU is a gating mechanism that allows the model to learn which parts of the input to focus on. It typically involves an element-wise product of two linear projections of the input, one of which might be passed through an activation function. Compared to ReLU used in the original FFN, GLU can help the model learn more complex relationships in the data, deciding which features to keep and which to discard.
-  SiLU (Sigmoid Linear Unit) is a smooth, non-monotonic activation function that has been shown to perform well in various deep learning tasks. Compared to ReLU and sigmoid used in GLU, it is fully differentiable without the zero-gradient “dead zones”, and it retains non-zero output even for negative inputs.

You need to implement the `silu` function in `basics.py` first. For `silu`, it takes a tensor of the shape `N.. x I` and returns a tensor of the same shape.
The `silu` function is defined as:
$$
\text{SiLU}(x) = x * \text{sigmoid}(x) = \frac{x}{1 + e^{-x}}
$$


Then implement `Qwen2MLP`. The structure for Qwen2's MLP block is:
*  A gate linear projection ($W_{gate}$).
*  An up linear projection ($W_{up}$).
*  A SiLU activation function applied to the output of $W_{gate}$.
*  An element-wise multiplication of the SiLU-activated $W_{gate}$ output and the $W_{up}$ output. This forms the "gated" part.
*  A final down linear projection ($W_{down}$).

This can be expressed as:
$$
\text{MLP}(x) = W_{down}(\text{SiLU}(W_{gate}(x)) \odot W_{up}(x))
$$
Where $\odot$ denotes element-wise multiplication. All linear projections in Qwen2's MLP are typically implemented without bias.

```
N.. is zero or more dimensions for batches
E is hidden_size (embedding dimension of the model)
I is intermediate_size (dimension of the hidden layer in MLP)
L is the sequence length

input: N.. x L x E
w_gate: I x E
w_up: I x E
w_down: E x I
output: N.. x L x E
```

You can test your implementation by running:

```bash
pdm run test --week 1 --day 4 -- -k task_2
```

At the end of the day, you should be able to pass all tests of this day:

```bash
pdm run test --week 1 --day 4
```


{{#include copyright.md}}


## /book/src/week1-05-qwen2-model.md

# Week 1 Day 5: The Qwen2 Model

In day 5, we will implement the Qwen2 model.

Before we start, please make sure you have downloaded the models:

```bash
huggingface-cli download Qwen/Qwen2-0.5B-Instruct-MLX
huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
```

Otherwise, some of the tests will be skipped.

## Task 1: Implement `Qwen2TransformerBlock`

```
src/tiny_llm/qwen2_week1.py
```

**📚 Readings**

- [A Simplified Explanation of the Transformer Block](https://medium.com/@akhileshkapse/a-simplified-explanation-of-the-transformer-block-must-read-blog-for-nlp-enthusiasts-12ef240a62ac)
- [Attention is All You Need](https://arxiv.org/pdf/1706.03762)

Qwen2 uses the following transformer block structure:

```
  input
/ |
| input_layernorm (RMSNorm)
| |
| Qwen2MultiHeadAttention
\ |
  Add (residual)
/ |
| post_attention_layernorm (RMSNorm)
| |
| MLP
\ |
  Add (residual)
  |
output
```

You should pass all tests for this task by running:

```bash
# Download the models if you haven't done so
huggingface-cli download Qwen/Qwen2-0.5B-Instruct-MLX
huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
# Run the tests
pdm run test --week 1 --day 5 -- -k task_1
```

## Task 2: Implement `Embedding`

```
src/tiny_llm/embedding.py
```

**📚 Readings**

- [LLM Embeddings Explained: A Visual and Intuitive Guide](https://huggingface.co/spaces/hesamation/primer-llm-embedding)

The embedding layer maps one or more tokens (each represented as an integer) to one or more vectors of dimension `embedding_dim`.
In this task, you will implement the embedding layer.

```
Embedding::__call__
weight: vocab_size x embedding_dim
Input: N.. (tokens)
Output: N.. x embedding_dim (vectors)
```

This can be done with a simple array index lookup operation.

In the Qwen2 model, the embedding layer can also be used as a linear layer to map the embeddings back to the token space.

```
Embedding::as_linear
weight: vocab_size x embedding_dim
Input: N.. x embedding_dim
Output: N.. x vocab_size
```

You should pass all tests for this task by running:

```bash
# Download the models if you haven't done so; we need the tokenizers
huggingface-cli download Qwen/Qwen2-0.5B-Instruct-MLX
huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
# Run the tests
pdm run test --week 1 --day 5 -- -k task_2
```

## Task 3: Implement `Qwen2ModelWeek1`

Now that we have built all the components of the Qwen2 model, we can implement the Qwen2ModelWeek1 class.

```
src/tiny_llm/qwen2_week1.py
```

**📚 Readings**

- [Qwen2.5-7B-Instruct model parameters](https://huggingface.co/Qwen/Qwen2.5-7B-Instruct?show_file_info=model.safetensors.index.json)

In this course, you will not implement the process of loading the model parameters from the tensor files. Instead, we
will load the model using the `mlx-lm` library, and then we will place the loaded parameters into our model. Therefore,
the `Qwen2ModelWeek1` class will take a MLX model as the constructor argument.

The Qwen2 model has the following layers:

```
input
| (tokens: N..)
Embedding
| (N.. x hidden_size); note that hidden_size==embedding_dim
Qwen2TransformerBlock
| (N.. x hidden_size)
Qwen2TransformerBlock
| (N.. x hidden_size)
...
|
RMSNorm 
| (N.. x hidden_size)
Embedding::as_linear  OR  Linear (lm_head)
| (N.. x vocab_size)
output
```

You can access the number of layers, hidden size, and other model parameters from `mlx_model.args`. Note that different
sizes of the Qwen2 models use different strategies to map the embeddings back to the token space. For the 0.5b model, it
directly uses the `Embedding::as_linear` layer. For the 7b model, it has a separate `lm_head` linear layer. You can
decide which strategy to use based on the `mlx_model.args.tie_word_embeddings` argument. If it is true, then you should
use `Embedding::as_linear`. Otherwise, the `lm_head` linear layer will be available and you should load its parameters.

The input to the model is a sequence of tokens. The output is the logits (unnormalized scores that a softmax turns into a probability distribution) of the next token.
In the next day, we will implement the process of generating the response from the model, and decide the next token
based on the probability distribution output.

Also note that the MLX model we are using (Qwen2-7B/0.5B-Instruct) is a quantized model. Therefore, you also need to
dequantize the weights before loading them into our tiny-llm model. You can use the provided `quantize::dequantize_linear`
function to dequantize the weights.

You also need to make sure that you set `mask=causal` when the input sequence is longer than 1. We will explain why
in the next day.

You should pass all tests for this task by running:


```bash
# Download the models if you haven't done so
huggingface-cli download Qwen/Qwen2-0.5B-Instruct-MLX
huggingface-cli download Qwen/Qwen2-7B-Instruct-MLX
# Run the tests
pdm run test --week 1 --day 5 -- -k task_3
```

At the end of the day, you should be able to pass all tests of this day:

```bash
pdm run test --week 1 --day 5
```


{{#include copyright.md}}


## /book/src/week1-06-generate-response.md

# Week 1 Day 6: Generating the Response: Prefill and Decode

In day 6, we will implement the process of generating the response when using the LLM as a chatbot. The implementation
is not a lot of code, but given that it uses a large portion of the code we implemented in the previous days, we want
to allocate this day to debug the implementation and make sure everything is working as expected.

## Task 1: Implement `simple_generate`

```
src/tiny_llm/generate.py
```

The `simple_generate` function takes a model, a tokenizer, and a prompt, and generates the response. The generation
process is done in two parts: first prefill, and then decode.

First thing is to implement the `_step` sub-function. It takes a list of tokens `y`, and the offset of the first token
provided to the model. The model will return the logits: the probability distribution of the next token for each position.

```
y: N.. x S, where in week 1 we don't implement batch, so N.. = 1
offset: int
output_logits: N.. x S x vocab_size
```

You only need the last token's logits to decide the next token. Therefore, you need to select the last token's logits
from the output logits.

```
logits = output_logits[:, -1, :]
```

Then, you can optionally apply the log-sum-exp trick to normalize the logits to avoid numerical instability. As we only
do argmax sampling, the log-sum-exp trick is not necessary. Then, you need to sample the next token from the logits.
You can use the `mx.argmax` function to sample the token with the highest probability over the last dimension
(the vocab_size axis). The function returns the next token number. This decoding strategy is called greedy decoding as we always
pick the token with the highest probability.

- 📚 [The Log-Sum-Exp Trick](https://gregorygundersen.com/blog/2020/02/09/log-sum-exp/)
- 📚 [Decoding Strategies in Large Language Models](https://mlabonne.github.io/blog/posts/2023-06-07-Decoding_strategies.html)

With the `_step` function implemented, you can now implement the full `simple_generate` function. The function will
first prefill the model with the prompt. As the prompt is a string, you need to first convert it to a list of tokens
by using the tokenizer `tokenizer.encode`.

* The prefill step is done by calling the `_step` function with all the tokens in the prompt with `offset=0`. It gives back
the first token in the response.
* The decode step is done by calling the `_step` function with all the previous tokens and the offset of the last token.

You will need to implement a while loop to keep generating the response until the model outputs the EOS `tokenizer.eos_token_id` token.
In the loop, you will need to store all previous tokens in a list, and use the detokenizer `tokenizer.detokenizer` to print the response.

An example of the sequences provided to the `_step` function is as below:

```
tokenized_prompt: [1, 2, 3, 4, 5, 6]
prefill: _step(model, [1, 2, 3, 4, 5, 6], 0) # returns 7
decode: _step(model, [1, 2, 3, 4, 5, 6, 7], 7) # returns 8
decode: _step(model, [1, 2, 3, 4, 5, 6, 7, 8], 8) # returns 9
...
```

We will optimize the `decode` process to use key-value cache to speed up the generation next week.

You can test your implementation by running the following command:

```bash
pdm run main --solution tiny_llm --loader week1 --model Qwen/Qwen2-0.5B-Instruct-MLX \
  --prompt "Give me a short introduction to large language model"
pdm run main --solution tiny_llm --loader week1 --model Qwen/Qwen2-7B-Instruct-MLX \
  --prompt "Give me a short introduction to large language model"
```

It should give you a reasonable response of "what is a large language model". Replace `--solution tiny_llm` with
`--solution ref` to use the reference solution.

{{#include copyright.md}}


## /book/src/week1-07-sampling-prepare.md

# Week 1 Day 7: Sampling and Preparing for Week 2

In day 7, we will implement various sampling strategies. And we will get you prepared for week 2.

## Task 1: Sampling

We implemented the default greedy sampling strategy in the previous day. In this task, we will implement the temperature,
top-k, and top-p (nucleus) sampling strategies.

```
src/tiny_llm/sampler.py
```

- 📚 [mlx-lm sampler implementation](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/sample_utils.py)

**Temperature Sampling**

The first sampling strategy is the temperature sampling. When `temp=0`, we use the default greedy strategy. When it is
larger than 0, we will randomly select the next token based on the logprobs. The temperature parameter scales the distribution.
When the value is larger, the distribution will be more uniform, making the lower probability token more likely to be
selected, and therefore making the model more creative.

To implement temperature sampling, simply divide the logprobs by the temperature and use `mx.random.categorical` to
randomly select the next token.

```bash
pdm run main --solution tiny_llm --loader week1 --model Qwen/Qwen2-0.5B-Instruct-MLX --sampler-temp 0.5
```

**Top-k Sampling**

In top-k sampling, we will only keep the top-k tokens with the highest probabilities before sampling the probabilities.
This is done before the final temperature scaling.

You can use `mx.argpartition` to partition the output so that you can know the indices of the top-k elements, and then,
mask those logprobs outside the top-k with `-mx.inf`. After that, do temperature sampling.

```bash
pdm run main --solution tiny_llm --loader week1 --model Qwen/Qwen2-0.5B-Instruct-MLX --sampler-temp 0.5 --sampler-top-k 10
```

**Top-p (Nucleus) Sampling**

In top-p (nucleus) sampling, we will only keep the top-p tokens with the highest cumulative probabilities before sampling
the probabilities. This is done before the final temperature scaling.

There are multiple ways of implementing it. One way is to first use `mx.argsort` to sort the logprobs (from highest
probability to lowest), and then, do a `cumsum` over the sorted probabilities (the exponential of the sorted logprobs) to
get the cumulative probabilities. Then, mask those logprobs outside the top-p with `-mx.inf`. After that, do temperature sampling.

```bash
pdm run main --solution tiny_llm --loader week1 --model Qwen/Qwen2-0.5B-Instruct-MLX --sampler-temp 0.5 --sampler-top-p 0.9
```

## Task 2: Prepare for Week 2

In week 2, we will optimize the serving infrastructure of the Qwen2 model. We will write some C++ code and Metal kernels
to make some operations run faster. You will need Xcode and its command-line tools, which include the Metal compiler,
to compile the C++ code and Metal kernels.

1.  **Install Xcode:**
    Install Xcode from the Mac App Store or from the [Apple Developer website](https://developer.apple.com/xcode/) (this may require an Apple Developer account).
2.  **Launch Xcode and Install Components:**
    After installation, launch Xcode at least once. It may prompt you to install additional macOS components; please do so (this is usually the default option).
3.  **Install Xcode Command Line Tools:**
    Open your Terminal and run:
    ```bash
    xcode-select --install
    ```
4.  **Set Default Xcode Path (if needed):**
    Ensure that your command-line tools are pointing to your newly installed Xcode. You can do this by running:
    ```bash
    sudo xcode-select --switch /Applications/Xcode.app/Contents/Developer
    ```
    *(Adjust the path if your Xcode is installed in a different location).*
5.  **Accept Xcode License:**
    You may also need to accept the Xcode license:
    ```bash
    sudo xcodebuild -license accept
    ```
6.  **Install CMake:**
    ```bash
    brew install cmake
    ```

(This instruction is graciously provided by [Liu Jinyi](https://github.com/KKKZOZ).)

You can test your installation by compiling the code in `src/extensions`, which contains the `axpby` function from the
official MLX extension tutorial:

```bash
pdm run build-ext
pdm run build-ext-test
```

It should print `correct: True`.

If you are not familiar with C++ or Metal programming, we also suggest doing some small exercises to get familiar with
them. You can implement some element-wise operations like `exp`, `sin`, `cos` and replace the MLX ones in your model
implementation.

That's all for week 1! We have implemented all the components to serve the Qwen2 model. Now we are ready to start week 2,
where we will optimize the serving infrastructure and make it run blazing fast on your Apple Silicon device.

{{#include copyright.md}}


## /book/src/week1-overview.md

# Week 1: From Matmul to Text

In this week, we will start from the basic matrix operations and see how these matrix manipulations can turn the
Qwen2 model parameters into a model that generates text. We will implement the neural network layers used in the Qwen2
model using mlx's matrix APIs.

We will use the Qwen2-7B-Instruct model for this week. As we need to dequantize the model parameters, the model of 4GB
download size needs 20GB of memory in week 1. If you do not have enough memory, you can consider using the smaller 0.5B model.

The MLX version of the Qwen2-7B-Instruct model we downloaded in the setup is an int4 quantized version of the original bfloat16 model.

## What We will Cover

* Attention, Multi-Head Attention, and Grouped/Multi Query Attention
* Positional Embeddings and RoPE
* Put the attention layers together and implement the whole Transformer block
* Implement the MLP layer and the whole Transformer model
* Load the Qwen2 model parameters and generate text

## What We will Not Cover

To make the journey as interesting as possible, we will skip a few things for now:

* How to quantize/dequantize a model -- that will be part of week 2. The Qwen2 model is quantized so we will need to
  dequantize its parameters before we can use them in our layer implementations.
* Actually we still used some APIs other than matrix manipulations -- like softmax, exp, log, etc. But they are simple
  and not implementing them would not affect the learning experience.
* Tokenizer -- we will not implement the tokenizer from scratch. We will use the `mlx_lm` tokenizer to tokenize the input.
* Loading the model weights -- I don't think it's an interesting thing to learn how to decode those tensor dump files, so
  we will use the `mlx_lm` to load the model and steal the weights from the loaded model into our layer implementations.

## Basic Matrix APIs

Although MLX does not offer an introductory guide for beginners, its Python API is designed to be highly compatible with NumPy. To get started, you can refer to [NumPy: the absolute basics for beginners](https://numpy.org/doc/stable/user/absolute_beginners.html) to learn essential matrix operations.

You can also refer to the [MLX Operations API](https://ml-explore.github.io/mlx/build/html/python/ops.html#operations)
for more details.

## Qwen2 Models

You can try the Qwen2 model with MLX/vLLM. You can read the blog post below to have some idea of what we will build
within this course. At the end of this week, we will be able to chat with the model -- that is to say, use Qwen2 to
generate text, as a causal language model.

The reference implementation of the Qwen2 model can be found in huggingface transformers, vLLM, and mlx-lm. You may
utilize these resources to better understand the internals of the model and what we will implement in this week.

**📚 Readings**

- [Qwen2.5: A Party of Foundation Models!](https://qwenlm.github.io/blog/qwen2.5/)
- [Key Concepts of the Qwen2 Model](https://qwen.readthedocs.io/en/latest/getting_started/concepts.html)
- [Huggingface Transformers - Qwen2](https://github.com/huggingface/transformers/tree/main/src/transformers/models/qwen2)
- [vLLM Qwen2](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/qwen2.py)
- [mlx-lm Qwen2](https://github.com/ml-explore/mlx-lm/blob/main/mlx_lm/models/qwen2.py)
- [Qwen2 Technical Report](https://arxiv.org/pdf/2407.10671)
- [Qwen2.5 Technical Report](https://arxiv.org/pdf/2412.15115)

{{#include copyright.md}}


## /book/src/week2-overview.md

https://github.com/ml-explore/mlx/blob/main/mlx/backend/cpu/quantized.cpp
https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/layers/linear.py
MLX uses INT4 W4A16
https://ml-explore.github.io/mlx/build/html/dev/extensions.html
https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-metal/ggml-metal.metal
https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/quantized.h#L962

pdm run ./build_ext.sh

speculative decoding
prefill and decode separation
quantized kv cache
Assert return data type

https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/sdpa_vector.h
https://github.com/philipturner/metal-flash-attention
https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h
https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html
https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/sdpa_vector.h

attention mask why
https://www.shashankshekhar.com/blog/apple-metal-vs-nvidia-cuda
https://arxiv.org/pdf/2308.16369

padding
https://huggingface.co/docs/transformers/pad_truncation

https://siboehm.com/articles/22/CUDA-MMM
https://github.com/ggml-org/llama.cpp/blob/master/ggml/src/ggml-metal/ggml-metal.metal


## /book/src/week3-overview.md


## /book/theme/head.hbs._

```_ path="/book/theme/head.hbs._" 
<script defer src="https://eu.umami.is/script.js" data-website-id="e48af962-317d-483f-bc5d-663973169528"></script>

```

## /main.py

```py path="/main.py" 
from mlx_lm import load
import mlx_lm
import mlx.core as mx
import argparse

import mlx_lm.sample_utils

# Command-line options: which model/prompt to run, which implementation
# ("solution") and weekly loader to use, the device, and the sampler settings.
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, default="Qwen/Qwen2-7B-Instruct-MLX")
parser.add_argument(
    "--prompt",
    type=str,
    default="Give me a short introduction to large language model.",
)
parser.add_argument("--solution", type=str, default="tiny_llm")
parser.add_argument("--loader", type=str, default="week1")
parser.add_argument("--device", type=str, default="gpu")
parser.add_argument("--sampler-temp", type=float, default=0)
parser.add_argument("--sampler-top-p", type=float, default=None)
parser.add_argument("--sampler-top-k", type=int, default=None)
args = parser.parse_args()

# Import only the implementation that was asked for, so that a missing or
# broken alternative does not prevent the selected one from running.
use_mlx = args.solution == "mlx"
if use_mlx:
    from mlx_lm.generate import stream_generate

    print("Using the original mlx model")
elif args.solution == "tiny_llm":
    print("Using your tiny_llm solution")
    from tiny_llm import (
        Qwen2ModelWeek1,
        Qwen2ModelWeek2,
        simple_generate,
        simple_generate_with_kv_cache,
        sampler,
    )

elif args.solution in ("tiny_llm_ref", "ref"):
    print("Using tiny_llm_ref solution")
    from tiny_llm_ref import (
        Qwen2ModelWeek1,
        Qwen2ModelWeek2,
        simple_generate,
        simple_generate_with_kv_cache,
        sampler,
    )

else:
    raise ValueError(f"Solution {args.solution} not supported")

mlx_model, tokenizer = load(args.model)

# Any --device value other than "gpu" selects the CPU.
device = mx.gpu if args.device == "gpu" else mx.cpu
with mx.stream(device):
    # Wrap the loaded mlx model with the selected weekly implementation
    # (the mlx solution uses the loaded model as-is).
    if use_mlx:
        model = mlx_model
    elif args.loader == "week1":
        print("Using Qwen2ModelWeek1 loader")
        model = Qwen2ModelWeek1(mlx_model)
    elif args.loader == "week2":
        print("Using Qwen2ModelWeek2 loader")
        model = Qwen2ModelWeek2(mlx_model)
    else:
        raise ValueError(f"Loader {args.loader} not supported")

    # Render the chat (system + user message) into a single prompt string.
    chat = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": args.prompt},
    ]
    prompt = tokenizer.apply_chat_template(
        chat, tokenize=False, add_generation_prompt=True
    )

    if use_mlx:
        mlx_sampler = mlx_lm.sample_utils.make_sampler(
            args.sampler_temp, top_p=args.sampler_top_p, top_k=args.sampler_top_k
        )
        for resp in stream_generate(model, tokenizer, prompt, sampler=mlx_sampler):
            print(resp.text, end="", flush=True)
    else:
        # Use a distinct name so the imported `sampler` module is not rebound.
        token_sampler = sampler.make_sampler(
            args.sampler_temp, top_p=args.sampler_top_p, top_k=args.sampler_top_k
        )
        if args.loader == "week1":
            simple_generate(model, tokenizer, prompt, sampler=token_sampler)
        elif args.loader == "week2":
            # NOTE(review): the week-2 generator is called without the sampler,
            # exactly as before -- confirm whether it should receive it.
            simple_generate_with_kv_cache(model, tokenizer, prompt)

```

## /pyproject.toml

```toml path="/pyproject.toml" 
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

[project]
name = "tiny-llm"
version = "0.1.0"
requires-python = ">=3.10, <3.13"
readme = "README.md"
dependencies = [
    "mlx>=0.25.0",
    "torch>=2.6.0",
    "torchtune>=0.6.1",
    "torchao>=0.10.0",
    "mlx-lm>=0.23.0",
    "numpy>=2.2.4",
    "pytest>=8.3.5",
    "ruff>=0.11.6",
    # this should not usually appear in a project dependency list but we add it to simplify the setup process
    "setuptools>=62",
    "nanobind==2.4.0",
    "pytest-benchmark>=5.1.0"
]

[tool.pdm.scripts]
# Build / test the student extension and build the reference extension.
build-ext.cmd = "python build.py"
build-ext.working_dir = "src/extensions"
build-ext-test.cmd = "python test.py"
build-ext-test.working_dir = "src/extensions"
build-ext-ref.cmd = "python build.py"
build-ext-ref.working_dir = "src/extensions_ref"
# Entry points for running the model.
main.cmd = "python main.py"
main-week1.cmd = "python main.py --loader week1"
main-week2.cmd = "python main.py --loader week2"
batch-main.cmd = "python batch-main.py"
# Test, benchmark and installation helpers.
test.cmd = "python scripts/dev-tools.py test"
check-installation.cmd = "python scripts/check-installation.py"
test-refsol.cmd = "python scripts/dev-tools.py test-refsol"
bench.cmd = "pytest benches"
format = "ruff format"
# `-type f` is the portable spelling for "regular file"; `-type file` is rejected by GNU find.
format-cpp-ref.shell = "find src/extensions_ref -type f \\( -name '*.h' -or -name '*.cpp' \\) | xargs -n1 clang-format -i"
format-cpp.shell = "find src/extensions -type f \\( -name '*.h' -or -name '*.cpp' \\) | xargs -n1 clang-format -i"
copy-test.cmd = "python scripts/dev-tools.py copy-test"
book.cmd = "mdbook serve book/"

[tool.pytest.ini_options]
addopts = [
    "--import-mode=importlib",
]
pythonpath = "src"

```

## /scripts/check-installation.py

```py path="/scripts/check-installation.py" 
import mlx.core as mx
import torch

# Run the same small integer addition on the MLX CPU stream and then on the
# GPU stream, printing each result.
for device in (mx.cpu, mx.gpu):
    with mx.stream(device):
        a = mx.array([1, 2, 3])
        b = mx.array([4, 5, 6])
        c = mx.add(a, b)
        print(c)

# Same addition with torch tensors placed on the CPU.
lhs = torch.tensor([1, 2, 3], device="cpu")
rhs = torch.tensor([4, 5, 6], device="cpu")
print(torch.add(lhs, rhs))

```

## /scripts/dev-tools.py

```py path="/scripts/dev-tools.py" 
import argparse
import shutil
import os
import pytest


def copy_test(args, skip_if_exists=False):
    """Copy the reference test for ``args.week`` / ``args.day`` into ``tests/``.

    The file ``tests_refsol/test_week_<week>_day_<day>.py`` is copied to the
    same file name under ``tests``. Paths are relative to the current working
    directory.

    Args:
        args: Object providing ``week`` and ``day`` attributes.
        skip_if_exists: When true and the target file already exists, nothing
            is printed or copied, so an existing copy is left untouched.
    """
    source_file = f"tests_refsol/test_week_{args.week}_day_{args.day}.py"
    target_file = f"tests/test_week_{args.week}_day_{args.day}.py"
    already_present = skip_if_exists and os.path.exists(target_file)
    if not already_present:
        print(f"copying {source_file} to {target_file}")
        shutil.copyfile(source_file, target_file)


def test(args):
    """Run the student tests and exit with pytest's status code.

    With both ``args.week`` and ``args.day`` set, the matching reference test
    is first copied into ``tests/`` (unless a copy already exists) and only
    that file is run. With neither set, the whole ``tests`` directory is run.
    Supplying only one of the two is an error.

    Args:
        args: Parsed arguments providing ``week``, ``day`` and ``remainders``
            (extra arguments forwarded to pytest).

    Raises:
        SystemExit: Always. The code is 1 when only one of week/day was
            given, otherwise pytest's exit code, so that failing tests make
            the command (and CI) fail instead of silently exiting with 0.
    """
    if args.week and args.day:
        copy_test(args, skip_if_exists=True)
        targets = [f"tests/test_week_{args.week}_day_{args.day}.py"]
    elif args.week or args.day:
        print("Please provide both week and day")
        raise SystemExit(1)
    else:
        targets = ["tests"]
    # pytest.main returns an exit code; propagate it to the shell.
    raise SystemExit(int(pytest.main(["-v", *targets, *args.remainders])))


def test_refsol(args):
    """Run the reference-solution tests and exit with pytest's status code.

    With both ``args.week`` and ``args.day`` set, only
    ``tests_refsol/test_week_<week>_day_<day>.py`` is run. With neither set,
    the whole ``tests_refsol`` directory is run. Supplying only one of the
    two is an error.

    Args:
        args: Parsed arguments providing ``week``, ``day`` and ``remainders``
            (extra arguments forwarded to pytest).

    Raises:
        SystemExit: Always. The code is 1 when only one of week/day was
            given, otherwise pytest's exit code, so that failing tests make
            the command (and CI) fail instead of silently exiting with 0.
    """
    if args.week and args.day:
        targets = [f"tests_refsol/test_week_{args.week}_day_{args.day}.py"]
    elif args.week or args.day:
        print("Please provide both week and day")
        raise SystemExit(1)
    else:
        targets = ["tests_refsol"]
    # pytest.main returns an exit code; propagate it to the shell.
    raise SystemExit(int(pytest.main(["-v", *targets, *args.remainders])))


def main():
    """Parse the command line and dispatch to the selected sub-command.

    Sub-commands:
        copy-test    copy one reference test into ``tests/`` (``--week`` and
                     ``--day`` are required)
        test         run the student tests (optionally a single week/day)
        test-refsol  run the reference-solution tests (optionally a single
                     week/day)

    A sub-command is required: running the script without one prints a usage
    error instead of silently doing nothing.
    """
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest="command", required=True)

    copy_test_parser = subparsers.add_parser("copy-test")
    copy_test_parser.add_argument("--week", type=int, required=True)
    copy_test_parser.add_argument("--day", type=int, required=True)
    copy_test_parser.set_defaults(handler=copy_test)

    # Both test commands share the same options; extra positional arguments
    # are collected in `remainders` for the handler to forward.
    for command, handler in (("test", test), ("test-refsol", test_refsol)):
        test_parser = subparsers.add_parser(command)
        test_parser.add_argument("--week", type=int, required=False)
        test_parser.add_argument("--day", type=int, required=False)
        test_parser.add_argument("remainders", nargs="*")
        test_parser.set_defaults(handler=handler)

    args = parser.parse_args()
    args.handler(args)


if __name__ == "__main__":
    main()

```

## /scripts/diff_api.py

```py path="/scripts/diff_api.py" 
import inspect
import sys
import difflib

import tiny_llm
import tiny_llm_ref


def export_public_members(module, _visited=None):
    """Collect ``(dotted_path, annotations)`` for the public API of *module*.

    Only modules whose name starts with ``tiny_llm`` are inspected. For each
    public (non-underscore) member:

    * functions and classes defined in *module* itself are recorded with
      their ``__annotations__``;
    * for such classes, every public attribute plus ``__init__`` and
      ``__call__`` found in the class ``__dict__`` is recorded as well;
    * sub-modules are processed recursively.

    Attributes that carry no ``__annotations__`` (plain constants,
    properties, ...) are recorded with an empty dict instead of raising
    ``AttributeError``.

    Args:
        module: The module object to inspect.
        _visited: Internal set of module names already processed; it stops
            modules that reference each other from recursing forever and
            keeps a module reachable through several paths from being listed
            more than once. Callers should not pass it.

    Returns:
        A list of ``(path, annotations)`` tuples sorted by path.
    """
    if not module.__name__.startswith("tiny_llm"):
        return []
    if _visited is None:
        _visited = set()
    if module.__name__ in _visited:
        return []
    _visited.add(module.__name__)
    print(f"Processing {module.__name__}")

    public_members_info = []
    for name, member in inspect.getmembers(module):
        if name.startswith("_"):
            continue
        if inspect.ismodule(member):
            public_members_info.extend(export_public_members(member, _visited))
            continue
        if not (inspect.isfunction(member) or inspect.isclass(member)):
            continue
        # Skip names that were merely imported into this module.
        if member.__module__ != module.__name__:
            continue
        path = f"{module.__name__}.{name}"
        public_members_info.append((path, getattr(member, "__annotations__", {})))
        if inspect.isclass(member):
            for attr_name, attr_value in member.__dict__.items():
                if not attr_name.startswith("_") or attr_name in (
                    "__init__",
                    "__call__",
                ):
                    public_members_info.append(
                        (
                            f"{path}.{attr_name}",
                            getattr(attr_value, "__annotations__", {}),
                        )
                    )

    return sorted(public_members_info, key=lambda x: x[0])


def stringify_member(members):
    """Render ``(path, annotations)`` pairs as newline-terminated text lines.

    Each line has the form ``"<path>: <annotations>"``. Every occurrence of
    ``tiny_llm_ref.`` in a line is rewritten to ``tiny_llm.`` so that listings
    produced from the two packages can be compared directly.
    """
    lines = []
    for member in members:
        line = f"{member[0]}: {str(member[1])}\n"
        lines.append(line.replace("tiny_llm_ref.", "tiny_llm."))
    return lines


# Build the API listing of the starter code and of the reference solution.
starter_api = stringify_member(export_public_members(tiny_llm))
reference_api = stringify_member(export_public_members(tiny_llm_ref))

# Print both listings, each under its own header.
for header, listing in (
    ("--- tiny_llm/apis.txt ---", starter_api),
    ("--- tiny_llm_ref/apis.txt ---", reference_api),
):
    print(header, flush=True)
    sys.stdout.writelines(listing)

# Unified diff without context lines: an empty diff means the APIs match.
api_diff = list(
    difflib.unified_diff(
        starter_api,
        reference_api,
        fromfile="tiny_llm/apis.txt",
        tofile="tiny_llm_ref/apis.txt",
        n=0,
    )
)

sys.stdout.writelines(api_diff)

# Signal any API difference through a non-zero exit status.
if api_diff:
    sys.exit(1)

```

## /src/extensions/.clangd

```clangd path="/src/extensions/.clangd" 
CompileFlags:
    CompilationDatabase: build/tiny_llm_ext._ext

```

## /src/extensions/CMakeLists.txt

cmake_minimum_required(VERSION 3.27)

project(_ext LANGUAGES CXX)

# ----------------------------- Setup -----------------------------
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

# So that clangd can pick up compile_commands.json. This must be set before any
# target is created: the variable only initializes the EXPORT_COMPILE_COMMANDS
# property of targets defined after it.
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)

# ----------------------------- Dependencies -----------------------------
find_package(
  Python 3.8
  COMPONENTS Interpreter Development.Module
  REQUIRED)

# Ask the installed nanobind Python package where its CMake config lives
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE nanobind_ROOT)
find_package(nanobind CONFIG REQUIRED)

# Ask the installed mlx Python package where its CMake config lives
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)

# ----------------------------- Extensions -----------------------------

# Add library
add_library(tiny_llm_ext)

# Add sources
target_sources(
  tiny_llm_ext
  PUBLIC
  ${CMAKE_CURRENT_LIST_DIR}/src/axpby.cpp
  ${CMAKE_CURRENT_LIST_DIR}/src/utils.cpp
)

# Add include headers
target_include_directories(tiny_llm_ext PUBLIC ${CMAKE_CURRENT_LIST_DIR} ${CMAKE_CURRENT_LIST_DIR}/src)

# Link to mlx
target_link_libraries(tiny_llm_ext PUBLIC mlx)

# ----------------------------- Metal -----------------------------

# Build metallib
if(MLX_BUILD_METAL)
  mlx_build_metallib(
    TARGET
    tiny_llm_ext_metallib
    TITLE
    tiny_llm_ext
    SOURCES
    ${CMAKE_CURRENT_LIST_DIR}/src/axpby.metal
    INCLUDE_DIRS
    ${PROJECT_SOURCE_DIR}
    ${MLX_INCLUDE_DIRS}
    OUTPUT_DIRECTORY
    ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

  add_dependencies(tiny_llm_ext tiny_llm_ext_metallib)
endif()

# ----------------------------- Python Bindings -----------------------------
nanobind_add_module(
  _ext
  NB_STATIC
  STABLE_ABI
  LTO
  NOMINSIZE
  NB_DOMAIN
  mlx
  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
target_link_libraries(_ext PRIVATE tiny_llm_ext)

# Let the Python module find the shared library sitting next to it
if(BUILD_SHARED_LIBS)
  target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
endif()


## /src/extensions/bindings.cpp

```cpp path="/src/extensions/bindings.cpp" 
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/variant.h>

#include "tiny_llm_ext.h"
#include "axpby.h"

namespace nb = nanobind;
using namespace nb::literals;

// Python module definition: exposes the extension's C++ entry points as the
// `_ext` module.
NB_MODULE(_ext, m) {
    m.doc() = "tiny-llm extensions for MLX";

    // load_library(device, path): bound directly to tiny_llm_ext::load_library
    m.def("load_library", &tiny_llm_ext::load_library, "device"_a, "path"_a);

    // axpby(x, y, alpha, beta, *, stream=None): `stream` is keyword-only and
    // defaults to None
    m.def("axpby", &tiny_llm_ext::axpby, "x"_a, "y"_a, "alpha"_a, "beta"_a, nb::kw_only(), "stream"_a = nb::none(),
          R"(
        Scale and sum two vectors element-wise
        ``z = alpha * x + beta * y``

        Follows numpy style broadcasting between ``x`` and ``y``
        Inputs are upcasted to floats if needed

        Args:
            x (array): Input array.
            y (array): Input array.
            alpha (float): Scaling factor for ``x``.
            beta (float): Scaling factor for ``y``.

        Returns:
            array: ``alpha * x + beta * y``
      )");
}

```

## /src/extensions/build.py

```py path="/src/extensions/build.py" 
from pathlib import Path
import shutil
from mlx import extension
from setuptools import Distribution
import inspect
import mlx
import os

if __name__ == "__main__":
    # Every path below ("build", the extension source dir, the in-place output)
    # is relative, so run from the directory containing this script; the build
    # then works no matter where the script is invoked from.
    src_dir = Path(__file__).resolve().parent
    os.chdir(src_dir)

    # Describe the package: one CMake-built extension module plus the binary
    # artifacts that belong to the package.
    distribution = Distribution(
        {
            "name": "tiny_llm_ext",
            "ext_modules": [extension.CMakeExtension("tiny_llm_ext._ext")],
            "package_data": {"tiny_llm_ext": ["*.so", "*.dylib", "*.metallib"]},
        }
    )

    # Drive the CMake build command directly instead of going through setup.py.
    cmd = extension.CMakeBuild(distribution)
    cmd.initialize_options()
    cmd.build_temp = Path("build")
    cmd.build_lib = Path("build") / "lib"
    cmd.inplace = True
    cmd.ensure_finalized()
    cmd.run()

```

## /src/extensions/src/axpby.cpp

```cpp path="/src/extensions/src/axpby.cpp" 
// Copyright © 2023-2025 Apple Inc.

#include "axpby.h"

#include <iostream>
#include <sstream>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/utils.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#endif

namespace tiny_llm_ext {

///////////////////////////////////////////////////////////////////////////////
// Operation Implementation
///////////////////////////////////////////////////////////////////////////////

/**
 *  Element-wise scaled sum of two arrays: z = alpha * x + beta * y
 *
 *  x and y are broadcast against each other (numpy style) and
 *  upcasted to floats if needed.
 **/
mx::array axpby(const mx::array &x,              // Input mx::array x
                const mx::array &y,              // Input mx::array y
                const float alpha,               // Scaling factor for x
                const float beta,                // Scaling factor for y
                mx::StreamOrDevice s /* = {} */  // Stream on which to schedule the operation
) {
    // Common dtype of the two inputs
    const auto common_dtype = promote_types(x.dtype(), y.dtype());

    // Keep the common dtype when it passes the float32 check, otherwise
    // promote it together with float32
    auto out_dtype = common_dtype;
    if (!mx::issubdtype(common_dtype, mx::float32)) {
        out_dtype = promote_types(common_dtype, mx::float32);
    }

    // Cast both inputs to the output dtype, then broadcast them against each
    // other; everything is scheduled on the same stream s
    auto x_as_out = mx::astype(x, out_dtype, s);
    auto y_as_out = mx::astype(y, out_dtype, s);
    auto operands = broadcast_arrays({x_as_out, y_as_out}, s);

    // The output takes the broadcasted shape
    auto result_shape = operands[0].shape();

    // The result is a lazy array produced by the Axpby primitive, with the
    // casted and broadcasted arrays as its inputs
    auto primitive = std::make_shared<Axpby>(to_stream(s), alpha, beta);
    return mx::array(result_shape, out_dtype, primitive, operands);
}

///////////////////////////////////////////////////////////////////////////////
// Primitive Common Backend Implementation
///////////////////////////////////////////////////////////////////////////////

/** CPU kernel for Axpby, instantiated once per supported element type T. */
template <typename T>
void axpby_impl(const mx::array &x, const mx::array &y, mx::array &out, float alpha_, float beta_, mx::Stream stream) {
    // The output buffer is allocated here, sized for the whole output array
    out.set_data(mx::allocator::malloc(out.nbytes()));

    // Register the arrays touched by this task with the stream's CPU encoder
    auto &cpu_encoder = mx::cpu::get_command_encoder(stream);
    cpu_encoder.set_input_array(x);
    cpu_encoder.set_input_array(y);
    cpu_encoder.set_output_array(out);

    // Everything the task needs is captured by value
    cpu_encoder.dispatch([lhs = x.data<T>(), rhs = y.data<T>(), dst = out.data<T>(), count = out.size(),
                          dims = out.shape(), lhs_strides = x.strides(), rhs_strides = y.strides(), alpha_, beta_]() {
        // Scaling factors converted to the element type
        T a = static_cast<T>(alpha_);
        T b = static_cast<T>(beta_);

        for (size_t i = 0; i < count; ++i) {
            // Translate the linear output index into element offsets in x and y
            auto lhs_loc = mx::elem_to_loc(i, dims, lhs_strides);
            auto rhs_loc = mx::elem_to_loc(i, dims, rhs_strides);

            // The output is written with the plain linear index
            dst[i] = a * lhs[lhs_loc] + b * rhs[rhs_loc];
        }
    });
}

/** Evaluate primitive on CPU: pick the kernel instantiation matching the output dtype. */
void Axpby::eval_cpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    const auto &lhs = inputs[0];
    const auto &rhs = inputs[1];
    auto &result = outputs[0];
    const auto dtype = result.dtype();

    if (dtype == mx::float32) {
        axpby_impl<float>(lhs, rhs, result, alpha_, beta_, stream());
        return;
    }
    if (dtype == mx::float16) {
        axpby_impl<mx::float16_t>(lhs, rhs, result, alpha_, beta_, stream());
        return;
    }
    if (dtype == mx::bfloat16) {
        axpby_impl<mx::bfloat16_t>(lhs, rhs, result, alpha_, beta_, stream());
        return;
    }
    if (dtype == mx::complex64) {
        axpby_impl<mx::complex64_t>(lhs, rhs, result, alpha_, beta_, stream());
        return;
    }

    // Any other output dtype is rejected
    throw std::runtime_error("Axpby is only supported for floating point types.");
}

///////////////////////////////////////////////////////////////////////////////
// Primitive Metal Backend Implementation
///////////////////////////////////////////////////////////////////////////////

#ifdef _METAL_

/**
 * Evaluate primitive on GPU
 *
 * Allocates the output, selects the `axpby_{contiguous|general}_<type>` kernel
 * from the `tiny_llm_ext` Metal library, binds the arguments at buffer
 * indices 0-8 and dispatches one thread per output element.
 */
void Axpby::eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    // Prepare inputs
    auto &x = inputs[0];
    auto &y = inputs[1];
    auto &out = outputs[0];

    // Each primitive carries the stream it should execute on
    // and each stream carries its device identifiers
    auto &s = stream();
    // We get the needed metal device using the stream
    auto &d = mx::metal::device(s.device);

    // Prepare to specialize based on contiguity: the contiguous kernel is used
    // only when both inputs are row contiguous or both are column contiguous
    bool contiguous_kernel = (x.flags().row_contiguous && y.flags().row_contiguous) ||
                             (x.flags().col_contiguous && y.flags().col_contiguous);

    // Allocate output memory with strides based on specialization
    if (contiguous_kernel) {
        // Reuse x's data size, strides and flags for the output
        out.set_data(mx::allocator::malloc(x.data_size() * out.itemsize()), x.data_size(), x.strides(), x.flags());
    } else {
        out.set_data(mx::allocator::malloc(out.nbytes()));
    }

    // Resolve name of kernel (corresponds to axpby.metal)
    std::ostringstream kname;
    kname << "axpby_";
    kname << (contiguous_kernel ? "contiguous_" : "general_");
    kname << type_to_name(out);

    // Make a kernel from this metal library
    auto kernel = d.get_kernel(kname.str(), "tiny_llm_ext");

    // Prepare to encode kernel
    auto &compute_encoder = d.get_command_encoder(s.index);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Kernel parameters are registered with buffer indices corresponding to
    // those in the kernel declaration at axpby.metal
    int ndim = out.ndim();
    size_t nelem = out.size();

    // Encode input arrays to kernel
    compute_encoder.set_input_array(x, 0);
    compute_encoder.set_input_array(y, 1);

    // Encode output arrays to kernel
    compute_encoder.set_output_array(out, 2);

    // Encode alpha and beta
    compute_encoder.set_bytes(alpha_, 3);
    compute_encoder.set_bytes(beta_, 4);

    // Encode shape, strides and ndim if needed (general kernel only)
    if (!contiguous_kernel) {
        compute_encoder.set_vector_bytes(x.shape(), 5);
        compute_encoder.set_vector_bytes(x.strides(), 6);
        compute_encoder.set_vector_bytes(y.strides(), 7);
        compute_encoder.set_bytes(ndim, 8);
    }

    // We launch 1 thread for each input and make sure that the number of
    // threads in any given threadgroup is not higher than the max allowed
    size_t tgp_size = std::min(nelem, kernel->maxTotalThreadsPerThreadgroup());

    // Fix the 3D size of each threadgroup (in terms of threads)
    MTL::Size group_dims = MTL::Size(tgp_size, 1, 1);

    // Fix the 3D size of the launch grid (in terms of threads)
    MTL::Size grid_dims = MTL::Size(nelem, 1, 1);

    // Launch the grid with the given number of threads divided among
    // the given threadgroups
    compute_encoder.dispatch_threads(grid_dims, group_dims);
}

#else  // Metal is not available

/** Fail evaluation on GPU */
void Axpby::eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &out) {
    throw std::runtime_error("Axpby has no GPU implementation.");
}

#endif

///////////////////////////////////////////////////////////////////////////////
// Primitive Transforms
///////////////////////////////////////////////////////////////////////////////

/**
 * The Jacobian-vector product.
 *
 * `tangents` holds one tangent per entry of `argnums` (0 refers to x,
 * 1 refers to y). Returns a single array: the tangent of the output.
 */
std::vector<mx::array> Axpby::jvp(const std::vector<mx::array> &primals, const std::vector<mx::array> &tangents,
                                  const std::vector<int> &argnums) {
    // Forward mode diff that pushes along the tangents
    // The jvp transform on the primitive can be built with ops
    // that are scheduled on the same stream as the primitive

    // If argnums = {0}, we only push along x in which case the
    // jvp is just the tangent scaled by alpha
    // Similarly, if argnums = {1}, the jvp is just the tangent
    // scaled by beta
    //
    // This branch must be taken for exactly one argnum: only tangents[0]
    // exists in that case, so falling through to the two-tangent branch
    // would read tangents[1] out of bounds.
    if (argnums.size() == 1) {
        auto scale = argnums[0] == 0 ? alpha_ : beta_;
        auto scale_arr = mx::array(scale, tangents[0].dtype());
        return {mx::multiply(scale_arr, tangents[0], stream())};
    }
    // If, argnums = {0, 1}, we take contributions from both
    // which gives us jvp = tangent_x * alpha + tangent_y * beta
    else {
        return {axpby(tangents[0], tangents[1], alpha_, beta_, stream())};
    }
}

/**
 * The vector-Jacobian product.
 *
 * Returns one array per entry of `argnums`: the output cotangent scaled by
 * alpha for argument 0 and by beta for any other argument.
 */
std::vector<mx::array> Axpby::vjp(const std::vector<mx::array> &primals, const std::vector<mx::array> &cotangents,
                                  const std::vector<int> &argnums, const std::vector<mx::array> &) {
    // Reverse mode diff: every requested input gets the same cotangent,
    // scaled by that input's own factor
    const auto &cotangent = cotangents[0];
    std::vector<mx::array> grads;
    for (size_t i = 0; i < argnums.size(); ++i) {
        float factor = (argnums[i] == 0) ? alpha_ : beta_;
        auto factor_arr = mx::array(factor, cotangent.dtype());
        grads.push_back(mx::multiply(factor_arr, cotangent, stream()));
    }
    return grads;
}

/**
 * Vectorize primitive along given axis
 *
 * Not implemented for Axpby: this always throws std::runtime_error and
 * neither `inputs` nor `axes` is read.
 */
std::pair<std::vector<mx::array>, std::vector<int>> Axpby::vmap(const std::vector<mx::array> &inputs,
                                                                const std::vector<int> &axes) {
    throw std::runtime_error("Axpby has no vmap implementation.");
}

/** Equivalence check **/
bool Axpby::is_equivalent(const Primitive &other) const {
    const Axpby &r_other = static_cast<const Axpby &>(other);
    return alpha_ == r_other.alpha_ && beta_ == r_other.beta_;
}

}  // namespace tiny_llm_ext

```

## /src/extensions/src/axpby.h

```h path="/src/extensions/src/axpby.h" 
// Copyright © 2023-2025 Apple Inc.

#pragma once

#include "mlx/ops.h"
#include "mlx/primitives.h"

namespace mx = mlx::core;

namespace tiny_llm_ext {

///////////////////////////////////////////////////////////////////////////////
// Operation
///////////////////////////////////////////////////////////////////////////////

/**
 *  Scale and sum two arrays element-wise
 *  z = alpha * x + beta * y
 *
 *  Follow numpy style broadcasting between x and y
 *  Inputs are upcasted to floats if needed
 *
 *  Returns the array z; `s` selects the stream or device the operation is
 *  scheduled on and defaults to `{}`.
 **/
mx::array axpby(const mx::array &x,        // Input array x
                const mx::array &y,        // Input array y
                const float alpha,         // Scaling factor for x
                const float beta,          // Scaling factor for y
                mx::StreamOrDevice s = {}  // Stream on which to schedule the operation
);

///////////////////////////////////////////////////////////////////////////////
// Primitive
///////////////////////////////////////////////////////////////////////////////

/**
 * Primitive behind the axpby operation. It stores the two scaling factors and
 * declares the hooks the framework calls for evaluation and transformations.
 */
class Axpby : public mx::Primitive {
public:
    /** Bind the primitive to `stream` and remember both scaling factors. */
    explicit Axpby(mx::Stream stream, float alpha, float beta)
        : mx::Primitive(stream), alpha_(alpha), beta_(beta) {}

    /**
     * Backend evaluation entry points: given the inputs, compute the result
     * on the CPU or the GPU and populate the output array.
     *
     * The evaluation function itself allocates the output storage, which
     * avoids unnecessary allocations elsewhere.
     */
    void eval_cpu(const std::vector<mx::array> &inputs,
                  std::vector<mx::array> &outputs) override;
    void eval_gpu(const std::vector<mx::array> &inputs,
                  std::vector<mx::array> &outputs) override;

    /** Forward-mode differentiation: the Jacobian-vector product. */
    std::vector<mx::array> jvp(const std::vector<mx::array> &primals,
                               const std::vector<mx::array> &tangents,
                               const std::vector<int> &argnums) override;

    /** Reverse-mode differentiation: the vector-Jacobian product. */
    std::vector<mx::array> vjp(const std::vector<mx::array> &primals,
                               const std::vector<mx::array> &cotangents,
                               const std::vector<int> &argnums,
                               const std::vector<mx::array> &outputs) override;

    /**
     * Vectorize the primitive across the given axes. The returned pair holds
     * the arrays of the vectorized computation and, for each of them, the
     * axis that corresponds to the vectorized dimension.
     */
    std::pair<std::vector<mx::array>, std::vector<int>> vmap(const std::vector<mx::array> &inputs,
                                                             const std::vector<int> &axes) override;

    /** Write the primitive's name to `os`. */
    void print(std::ostream &os) override { os << "Axpby"; }

    /** Equivalence check against another primitive. */
    bool is_equivalent(const mx::Primitive &other) const override;

private:
    float alpha_;  // scaling factor applied to x
    float beta_;   // scaling factor applied to y
};

}  // namespace tiny_llm_ext

```

## /src/extensions/src/axpby.metal

```metal path="/src/extensions/src/axpby.metal" 
// Copyright © 2023-2025 Apple Inc.

#include <metal_stdlib>

#include "mlx/backend/metal/kernels/utils.h"

// General (strided) variant: one thread per output element. The flat output
// index is translated into an offset for each input through elem_to_loc using
// that input's strides.
template <typename T>
[[kernel]] void axpby_general(
    device const T* x [[buffer(0)]],
    device const T* y [[buffer(1)]],
    device T* out [[buffer(2)]],
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    constant const int* shape [[buffer(5)]],
    constant const int64_t* x_strides [[buffer(6)]],
    constant const int64_t* y_strides [[buffer(7)]],
    constant const int& ndim [[buffer(8)]],
    uint index [[thread_position_in_grid]]) {
  const auto x_loc = elem_to_loc(index, shape, x_strides, ndim);
  const auto y_loc = elem_to_loc(index, shape, y_strides, ndim);
  // Both scaling factors are converted to the element type before multiplying.
  const T scaled_x = static_cast<T>(alpha) * x[x_loc];
  const T scaled_y = static_cast<T>(beta) * y[y_loc];
  out[index] = scaled_x + scaled_y;
}

// Contiguous variant: inputs and output are addressed with the same flat
// index, so no stride arithmetic is needed.
template <typename T>
[[kernel]] void axpby_contiguous(
    device const T* x [[buffer(0)]],
    device const T* y [[buffer(1)]],
    device T* out [[buffer(2)]],
    constant const float& alpha [[buffer(3)]],
    constant const float& beta [[buffer(4)]],
    uint index [[thread_position_in_grid]]) {
  // Both scaling factors are converted to the element type before multiplying.
  const T scaled_x = static_cast<T>(alpha) * x[index];
  const T scaled_y = static_cast<T>(beta) * y[index];
  out[index] = scaled_x + scaled_y;
}

// clang-format off
// Instantiate both kernel templates for one element type. The kernel names are
// the string prefixes below with `type_name` appended, e.g.
// "axpby_general_float32" and "axpby_contiguous_float32".
#define instantiate_axpby(type_name, type)                             \
  instantiate_kernel("axpby_general_" #type_name, axpby_general, type) \
  instantiate_kernel(                                                  \
          "axpby_contiguous_" #type_name, axpby_contiguous, type)

// Element types for which the kernels are instantiated.
instantiate_axpby(float32, float);
instantiate_axpby(float16, half);
instantiate_axpby(bfloat16, bfloat16_t);
instantiate_axpby(complex64, complex64_t);
// clang-format on

```

## /src/extensions/src/tiny_llm_ext.h

```h path="/src/extensions/src/tiny_llm_ext.h" 
#pragma once

#include "mlx/ops.h"
#include "mlx/primitives.h"

namespace mx = mlx::core;

namespace tiny_llm_ext {

// Register this extension's Metal library for device `d` under the name
// "tiny_llm_ext"; `path` is forwarded to the Metal device's register_library.
// The implementation does nothing when built without _METAL_.
void load_library(mx::Device d, const char *path);

}  // namespace tiny_llm_ext

```

## /src/extensions/src/utils.cpp

```cpp path="/src/extensions/src/utils.cpp" 
#include "tiny_llm_ext.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#endif

namespace tiny_llm_ext {

// Make the extension's Metal library known to the Metal backend of device `d`.
// Without _METAL_ the body is empty and the call is a no-op.
void load_library(mx::Device d, const char *path) {
#ifdef _METAL_
    // Registered under the name "tiny_llm_ext".
    mx::metal::device(d).register_library("tiny_llm_ext", path);
#endif
}

}  // namespace tiny_llm_ext

```

## /src/extensions/test.py

```py path="/src/extensions/test.py" 
# Copyright © 2023-2024 Apple Inc.

from tiny_llm_ext import axpby
import mlx.core as mx
import numpy as np

# Smoke test for the axpby extension, evaluated on the CPU stream:
# with x = y = 1, alpha = 4 and beta = 2 every output element should be 6.
a = mx.ones((3, 4))
b = mx.ones((3, 4))
c = axpby(a, b, 4.0, 2.0, stream=mx.cpu)

# Results are printed rather than asserted, so a wrong value does not make
# the script fail; check the "c correct" line by eye.
print(f"c shape: {c.shape}")
print(f"c dtype: {c.dtype}")
print(f"c correct: {mx.all(c == 6.0).item()}")

```

## /src/extensions/tiny_llm_ext/__init__.py

```py path="/src/extensions/tiny_llm_ext/__init__.py" 
# Copyright © 2023 Apple Inc.

from pathlib import Path

import mlx.core as mx

# Re-export everything from the compiled `_ext` module, then call its
# load_library for the GPU device, passing this package's directory as the
# library path. Only ImportError is handled: any other exception (including
# one raised by load_library) propagates to whoever imports this package.
try:
    from ._ext import *

    current_path = Path(__file__).parent
    load_library(mx.gpu, str(current_path))
except ImportError:
    # Best effort: the package stays importable, but the extension functions
    # are not available in that case.
    print("Failed to load C++/Metal extension")

```

## /src/extensions_ref/.clangd

```clangd path="/src/extensions_ref/.clangd" 
CompileFlags:
    CompilationDatabase: build/tiny_llm_ext_ref._ext

```

## /src/extensions_ref/CMakeLists.txt

# Build script for the reference extension: a C++ library (tiny_llm_ext_ref),
# an optional Metal library with its kernels, and the nanobind Python module
# (_ext) that links against the C++ library.
cmake_minimum_required(VERSION 3.27)

project(_ext LANGUAGES CXX)

# ----------------------------- Setup -----------------------------
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_POSITION_INDEPENDENT_CODE ON)

option(BUILD_SHARED_LIBS "Build extensions as a shared library" ON)

# ----------------------------- Dependencies -----------------------------
find_package(
  Python 3.8
  COMPONENTS Interpreter Development.Module
  REQUIRED)
# Ask the selected Python interpreter where nanobind's CMake package lives so
# that the find_package call below can locate it.
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE nanobind_ROOT)
find_package(nanobind CONFIG REQUIRED)

# Same lookup for the MLX CMake package shipped with the Python `mlx` module.
execute_process(
  COMMAND "${Python_EXECUTABLE}" -m mlx --cmake-dir
  OUTPUT_STRIP_TRAILING_WHITESPACE
  OUTPUT_VARIABLE MLX_ROOT)
find_package(MLX CONFIG REQUIRED)

# ----------------------------- Extensions -----------------------------

# Add library
add_library(tiny_llm_ext_ref)

# Add sources
target_sources(
  tiny_llm_ext_ref
  PUBLIC
  ${CMAKE_CURRENT_LIST_DIR}/src/quantized_matmul.cpp
  ${CMAKE_CURRENT_LIST_DIR}/src/flash_attention.cpp
  ${CMAKE_CURRENT_LIST_DIR}/src/utils.cpp
)

# Add include headers
target_include_directories(tiny_llm_ext_ref PUBLIC ${CMAKE_CURRENT_LIST_DIR} ${CMAKE_CURRENT_LIST_DIR}/src)

# Link to mlx
target_link_libraries(tiny_llm_ext_ref PUBLIC mlx)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON) # so that clangd can pick this up

# ----------------------------- Metal -----------------------------

# Build metallib
# Only done when MLX_BUILD_METAL is set (presumably by the MLX package found
# above -- confirm). The metallib target is made a dependency of the C++
# library so it is built along with it.
if(MLX_BUILD_METAL)
  mlx_build_metallib(
    TARGET
    tiny_llm_ext_ref_metallib
    TITLE
    tiny_llm_ext_ref
    SOURCES
    ${CMAKE_CURRENT_LIST_DIR}/src/quantized_matmul.metal
    ${CMAKE_CURRENT_LIST_DIR}/src/flash_attention.metal
    INCLUDE_DIRS
    ${PROJECT_SOURCE_DIR}
    ${MLX_INCLUDE_DIRS}
    OUTPUT_DIRECTORY
    ${CMAKE_LIBRARY_OUTPUT_DIRECTORY})

  add_dependencies(tiny_llm_ext_ref tiny_llm_ext_ref_metallib)
endif()

# ----------------------------- Python Bindings -----------------------------
# The Python module is built from bindings.cpp only; the operations themselves
# come from the tiny_llm_ext_ref library it links against.
nanobind_add_module(
  _ext
  NB_STATIC
  STABLE_ABI
  LTO
  NOMINSIZE
  NB_DOMAIN
  mlx
  ${CMAKE_CURRENT_LIST_DIR}/bindings.cpp)
target_link_libraries(_ext PRIVATE tiny_llm_ext_ref)

# When the extension is a shared library, add the module's own directory
# (@loader_path) to its runtime search path so the library is found next to it.
if(BUILD_SHARED_LIBS)
  target_link_options(_ext PRIVATE -Wl,-rpath,@loader_path)
endif()


## /src/extensions_ref/bindings.cpp

```cpp path="/src/extensions_ref/bindings.cpp" 
// Copyright © 2023-2024 Apple Inc.

#include <nanobind/nanobind.h>
#include <nanobind/stl/variant.h>

#include "tiny_llm_ext.h"

namespace nb = nanobind;
using namespace nb::literals;

// Python module definition: exposes load_library, quantized_matmul and
// flash_attention from tiny_llm_ext_ref. Argument names, order and defaults
// are part of the Python-facing interface and are kept as they were; only the
// docstrings were corrected to describe what the operations actually do.
NB_MODULE(_ext, m) {
    m.doc() = "tiny-llm extensions for MLX";

    m.def("load_library", &tiny_llm_ext_ref::load_library, "device"_a, "path"_a);

    // Note: transpose_b defaults to false, but the operation rejects anything
    // other than true, so callers have to pass it explicitly.
    m.def("quantized_matmul", &tiny_llm_ext_ref::quantized_matmul, "scales"_a, "biases"_a, "group_size"_a, "bits"_a,
          "a"_a, "b"_a, "transpose_b"_a = false, "stream"_a = nb::none(),
          R"(
        Quantized matmul layer

        Args:
            scales (array): Per-group scaling factors for the quantized ``b``.
            biases (array): Per-group biases for the quantized ``b``.
            group_size (int): Number of elements of ``b`` sharing one scale and bias.
            bits (int): Number of bits per quantized element of ``b``.
            a (array): Input array (not quantized).
            b (array): Quantized input array, packed into ``uint32``.
            transpose_b (bool): Whether to transpose ``b`` before multiplication.
                Only ``True`` is currently supported.
            stream (Stream or Device, optional): Where to schedule the operation.

        Returns:
            array: ``a @ b.T`` with ``b`` dequantized
      )");

    m.def("flash_attention", &tiny_llm_ext_ref::flash_attention, "query"_a, "key"_a, "value"_a, "scale"_a = 1.0,
          "num_kv_heads"_a, "num_heads"_a, "stream"_a = nb::none(), R"(
        Flash attention layer

        Args:
            query (array): Query array, ``float32`` of shape ``[N, S, E]``.
            key (array): Key array, ``float32`` of shape ``[N_KV, L, E]``.
            value (array): Value array, ``float32`` of shape ``[N_KV, L, E]``.
            scale (float): Scaling factor.
            num_kv_heads (int): Number of key/value heads.
            num_heads (int): Number of query heads; must be divisible by
                ``num_kv_heads``.
            stream (Stream or Device, optional): Where to schedule the operation.

        Returns:
            array: ``softmax(query @ key.T * scale) @ value``
      )");
}

```

## /src/extensions_ref/build.py

```py path="/src/extensions_ref/build.py" 
from pathlib import Path
import shutil
from mlx import extension
from setuptools import Distribution
import inspect
import mlx
import os

if __name__ == "__main__":
    # Drive MLX's CMake build command directly instead of going through a
    # setup.py invocation. Build paths are relative to the current working
    # directory, so run this script from the extension's directory.
    distribution = Distribution(
        {
            "name": "tiny_llm_ext_ref",
            "ext_modules": [extension.CMakeExtension("tiny_llm_ext_ref._ext")],
            "package_data": {"tiny_llm_ext_ref": ["*.so", "*.dylib", "*.metallib"]},
        }
    )
    cmd = extension.CMakeBuild(distribution)
    cmd.initialize_options()
    cmd.build_temp = Path("build")
    cmd.build_lib = Path("build") / "lib"
    # In-place build: same switch as `build_ext --inplace`.
    cmd.inplace = True
    cmd.ensure_finalized()
    cmd.run()

```

## /src/extensions_ref/src/flash_attention.cpp

```cpp path="/src/extensions_ref/src/flash_attention.cpp" 
#include <cstdint>
#include <iostream>
#include <sstream>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/utils.h"
#include "tiny_llm_ext.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#endif

namespace tiny_llm_ext_ref {
/**
 * Build the lazy flash-attention operation.
 *
 * Expected layouts (all float32, 3D):
 *   q: [N, S, E]      k: [N_KV, L, E]      v: [N_KV, L, E]      output: [N, S, E]
 * where N is a multiple of num_heads, N_KV a multiple of num_kv_heads, and
 * N / num_heads == N_KV / num_kv_heads.
 *
 * @param q, k, v       Query, key and value arrays.
 * @param scale         Scaling factor forwarded to the primitive.
 * @param num_kv_heads  Number of key/value heads (must be positive).
 * @param num_heads     Number of query heads (positive multiple of num_kv_heads).
 * @param s             Stream or device on which to schedule the operation.
 * @return An array with q's shape whose data is produced by FlashAttention.
 * @throws std::runtime_error if any of the constraints above is violated.
 */
mx::array flash_attention(const mx::array &q, const mx::array &k, const mx::array &v, const float scale,
                          const int num_kv_heads, const int num_heads, mx::StreamOrDevice s) {
    if (q.dtype() != mx::float32 || k.dtype() != mx::float32 || v.dtype() != mx::float32) {
        throw std::runtime_error("flash_attention: all input arrays must be float32");
    }
    if (q.shape().size() != 3 || k.shape().size() != 3 || v.shape().size() != 3) {
        throw std::runtime_error("flash_attention: all input arrays must be 3D");
    }
    // Must come before any `%` or `/` on the head counts: a zero divisor is
    // undefined behaviour, and negative counts make no sense.
    if (num_kv_heads <= 0 || num_heads <= 0) {
        throw std::runtime_error("flash_attention: num_heads and num_kv_heads must be positive");
    }
    if (num_heads % num_kv_heads != 0) {
        throw std::runtime_error("flash_attention: num_heads must be divisible by num_kv_heads");
    }
    // Q: [N, S, E]
    // K: [N_KV, L, E]
    // V: [N_KV, L, E]
    // O: [N, S, E]

    if (q.shape()[0] % num_heads != 0) {
        throw std::runtime_error("flash_attention: q.shape[0] must be divisible by num_heads");
    }
    if (k.shape()[0] % num_kv_heads != 0 || v.shape()[0] % num_kv_heads != 0) {
        throw std::runtime_error("flash_attention: k.shape[0] and v.shape[0] must be divisible by num_kv_heads");
    }
    // The kernels address v with the same head offsets as k, so a v with fewer
    // heads than k would be read out of bounds.
    if (k.shape()[0] != v.shape()[0]) {
        throw std::runtime_error("flash_attention: k.shape[0] must be equal to v.shape[0]");
    }
    if (q.shape()[2] != k.shape()[2] || q.shape()[2] != v.shape()[2]) {
        throw std::runtime_error("flash_attention: q.shape[2] must be equal to k.shape[2] and v.shape[2]");
    }
    if (q.shape()[0] / num_heads != k.shape()[0] / num_kv_heads) {
        throw std::runtime_error("flash_attention: number of heads mismatch");
    }
    if (k.shape()[1] != v.shape()[1]) {
        throw std::runtime_error("flash_attention: k.shape[1] must be equal to v.shape[1]");
    }

    return mx::array(q.shape(), mx::float32,
                     std::make_shared<FlashAttention>(to_stream(s), scale, num_kv_heads, num_heads), {q, k, v});
}

/**
 * CPU evaluation of flash attention.
 *
 * inputs = {q, k, v} with q: [N, S, E] and k, v: [N_KV, L, E]; the single
 * output has q's layout. All inputs must be row-contiguous because the kernel
 * indexes the raw buffers directly.
 *
 * The computation is tiled (Br query rows x Bc key rows) and keeps a running
 * row maximum `m_i` and running row sum `l_i`, so the full S x L score matrix
 * is never materialised. Scores are `q . k * scale`.
 *
 * @throws std::runtime_error if q, k or v is not row-contiguous.
 */
void FlashAttention::eval_cpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    auto &q = inputs[0];
    auto &k = inputs[1];
    auto &v = inputs[2];
    auto &out = outputs[0];

    // Validate first, so that nothing is allocated or handed to the encoder
    // for inputs we are going to reject.
    if (!q.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: q must be contiguous");
    }
    if (!k.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: k must be contiguous");
    }
    if (!v.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: v must be contiguous");
    }

    out.set_data(mx::allocator::malloc(out.nbytes()));

    auto &encoder = mx::cpu::get_command_encoder(stream());
    encoder.set_input_array(q);
    encoder.set_input_array(k);
    encoder.set_input_array(v);
    encoder.set_output_array(out);

    // Launch the CPU kernel
    encoder.dispatch([out_ptr = out.data<float>(), q = mx::array::unsafe_weak_copy(q),
                      k = mx::array::unsafe_weak_copy(k), v = mx::array::unsafe_weak_copy(v), num_heads = num_heads_,
                      num_kv_heads = num_kv_heads_, scale = scale_]() {
        const int64_t N = q.shape()[0];
        const int64_t S = q.shape()[1];
        const int64_t L = k.shape()[1];
        const int64_t E = q.shape()[2];
        const int64_t N_Q_HEAD = S * E;  // elements per query head
        const int64_t N_K_HEAD = L * E;  // elements per key/value head
        const int64_t Br = 32;           // query rows per tile
        const int64_t Bc = 32;           // key/value rows per tile
        const int64_t Tr = (S + Br - 1) / Br;
        const int64_t Tc = (L + Bc - 1) / Bc;

        // Consecutive groups of `q_kv_heads_ratio` query heads share one key/value head.
        const int64_t q_kv_heads_ratio = num_heads / num_kv_heads;
        const float *q_ptr = q.data<float>();
        const float *k_ptr = k.data<float>();
        const float *v_ptr = v.data<float>();

        for (int64_t n = 0; n < N; n++) {
            const float *q_batch = q_ptr + n * N_Q_HEAD;
            const float *k_batch = k_ptr + (n / q_kv_heads_ratio) * N_K_HEAD;
            const float *v_batch = v_ptr + (n / q_kv_heads_ratio) * N_K_HEAD;
            for (int64_t i = 0; i < Tr; i++) {
                // Number of valid rows in this (possibly partial) query tile.
                const int64_t br_upper_bound = std::min(S - i * Br, Br);
                // Load Qi
                std::vector<float> q_i(Br * E, 0.0f);
                for (int64_t a = 0; a < br_upper_bound; a++) {
                    for (int64_t b = 0; b < E; b++) {
                        const int64_t q_idx = (i * Br + a) * E + b;
                        q_i[a * E + b] = q_batch[q_idx];
                    }
                }
                std::vector<float> o_i(Br * E, 0.0f);
                std::vector<float> l_i(Br, 0.0f);
                std::vector<float> m_i(Br, -std::numeric_limits<float>::infinity());
                for (int64_t j = 0; j < Tc; j++) {
                    // Number of valid rows in this (possibly partial) key/value tile.
                    const int64_t bc_upper_bound = std::min(L - j * Bc, Bc);
                    // Each iteration processes a block of Br x Bc
                    // Load Kj and Vj; rows below bc_upper_bound are always inside [0, L).
                    std::vector<float> k_j(Bc * E, 0.0f);
                    std::vector<float> v_j(Bc * E, 0.0f);
                    for (int64_t a = 0; a < bc_upper_bound; a++) {
                        const int64_t kv_row = j * Bc + a;
                        for (int64_t b = 0; b < E; b++) {
                            const int64_t kv_idx = kv_row * E + b;
                            k_j[a * E + b] = k_batch[kv_idx];
                            v_j[a * E + b] = v_batch[kv_idx];
                        }
                    }

                    // Compute s_i = (q_i * k_j^T) * scale
                    std::vector<float> s_i(Br * Bc, 0.0f);
                    for (int64_t a = 0; a < br_upper_bound; a++) {
                        for (int64_t b = 0; b < bc_upper_bound; b++) {
                            float dot = 0.0f;
                            for (int64_t c = 0; c < E; c++) {
                                dot += q_i[a * E + c] * k_j[b * E + c];
                            }
                            s_i[a * Bc + b] = dot * scale;
                        }
                    }

                    // m_i from iteration j = max(m_i from iteration j-1, rowmax(s_i))
                    std::vector<float> m_i_diff(Br, 0.0f);
                    for (int64_t a = 0; a < br_upper_bound; a++) {
                        float rowmax = -std::numeric_limits<float>::infinity();
                        for (int64_t b = 0; b < bc_upper_bound; b++) {
                            rowmax = std::max(rowmax, s_i[a * Bc + b]);
                        }
                        const float new_max = std::max(m_i[a], rowmax);
                        m_i_diff[a] = m_i[a] - new_max;
                        m_i[a] = new_max;
                    }

                    // compute p_j = exp(s_i - m_i)
                    std::vector<float> p(Br * Bc, 0.0f);
                    for (int64_t a = 0; a < br_upper_bound; a++) {
                        for (int64_t b = 0; b < bc_upper_bound; b++) {
                            p[a * Bc + b] = std::exp(s_i[a * Bc + b] - m_i[a]);
                        }
                    }

                    // compute l = exp(m_i_diff) * l from prev iteration + rowsum(p)
                    for (int64_t a = 0; a < br_upper_bound; a++) {
                        float rowsum = 0.0f;
                        for (int64_t b = 0; b < bc_upper_bound; b++) {
                            rowsum += p[a * Bc + b];
                        }
                        l_i[a] = std::exp(m_i_diff[a]) * l_i[a] + rowsum;
                    }

                    // compute o_i = diag(std::exp(m_i_diff)) * o_i from prev iteration + p * v_j
                    for (int64_t a = 0; a < br_upper_bound; a++) {
                        for (int64_t c = 0; c < E; c++) {
                            // compute p @ v_j
                            float res = 0.0f;
                            for (int64_t b = 0; b < bc_upper_bound; b++) {
                                res += p[a * Bc + b] * v_j[b * E + c];
                            }
                            o_i[a * E + c] = std::exp(m_i_diff[a]) * o_i[a * E + c] + res;
                        }
                    }
                }
                // o_i = diag(l_i)^-1 * o_i
                for (int64_t a = 0; a < br_upper_bound; a++) {
                    for (int64_t b = 0; b < E; b++) {
                        o_i[a * E + b] /= l_i[a];
                    }
                }
                // l_i = m_i + log(l_i)
                for (int64_t a = 0; a < br_upper_bound; a++) {
                    l_i[a] = m_i[a] + std::log(l_i[a]);
                }
                // store o_i; rows below br_upper_bound are always inside [0, S).
                for (int64_t a = 0; a < br_upper_bound; a++) {
                    const int64_t out_row = i * Br + a;
                    for (int64_t b = 0; b < E; b++) {
                        out_ptr[n * N_Q_HEAD + out_row * E + b] = o_i[a * E + b];
                    }
                }
                // ignore l_i -- we might use it in the future
            }
        }
    });
}

/**
 * GPU evaluation of flash attention through the Metal kernel
 * "flash_attention_f32_e128" from the "tiny_llm_ext_ref" library.
 *
 * inputs = {q, k, v} with q: [N, S, E] and k, v: [N_KV, L, E]; the single
 * output has q's layout. One threadgroup is dispatched per (head, query tile),
 * i.e. an N x Tr grid.
 *
 * Buffer indices used below must match the kernel's parameter list:
 *   0-2 q/k/v, 3 out, 4-7 N/S/L/E, 8 num_kv_heads, 9 num_heads, 10 scale,
 *   11-14 Br/Bc/Tr/Tc.
 *
 * @throws std::runtime_error if an input is not row-contiguous, if E exceeds
 *         128, or if the pipeline's thread limits cannot hold a Br x Bc tile.
 */
void FlashAttention::eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    const auto &q = inputs[0];
    const auto &k = inputs[1];
    const auto &v = inputs[2];
    auto &out = outputs[0];

    // Validate before allocating or encoding anything.
    if (!q.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: q must be contiguous");
    }
    if (!k.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: k must be contiguous");
    }
    if (!v.flags().row_contiguous) {
        throw std::runtime_error("flash_attention: v must be contiguous");
    }

    const int64_t N = q.shape()[0];
    const int64_t S = q.shape()[1];
    const int64_t L = k.shape()[1];
    const int64_t E = q.shape()[2];

    if (E > 128) {
        throw std::runtime_error("flash_attention: E must be at most 128");
    }

    auto &s = stream();
    auto &d = mx::metal::device(s.device);

    // Make a kernel from this metal library
    auto kernel = d.get_kernel("flash_attention_f32_e128", "tiny_llm_ext_ref");

    size_t tgp_size = kernel->maxTotalThreadsPerThreadgroup();
    size_t simd_width = kernel->threadExecutionWidth();

    const int64_t Br = 32;
    const int64_t Bc = 32;
    if (simd_width * Br > tgp_size) {
        throw std::runtime_error("flash_attention: simd_width * Br must not exceed tgp_size");
    }
    if (static_cast<size_t>(Bc) > simd_width) {
        throw std::runtime_error("flash_attention: Bc must not exceed simd_width");
    }
    if (Br > 32) {
        throw std::runtime_error("flash_attention: Br must be at most 32");
    }

    const int64_t Tr = (S + Br - 1) / Br;
    const int64_t Tc = (L + Bc - 1) / Bc;

    // The kernel declares the head counts as int64_t and the scale as float;
    // pass locals of exactly those types so the byte sizes always match.
    const int64_t num_kv_heads = num_kv_heads_;
    const int64_t num_heads = num_heads_;
    const float scale = scale_;

    out.set_data(mx::allocator::malloc(out.nbytes()));

    // Prepare to encode kernel
    auto &compute_encoder = d.get_command_encoder(s.index);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Encode input arrays to kernel
    compute_encoder.set_input_array(q, 0);
    compute_encoder.set_input_array(k, 1);
    compute_encoder.set_input_array(v, 2);

    // Encode output arrays to kernel
    compute_encoder.set_output_array(out, 3);

    compute_encoder.set_bytes(N, 4);
    compute_encoder.set_bytes(S, 5);
    compute_encoder.set_bytes(L, 6);
    compute_encoder.set_bytes(E, 7);

    compute_encoder.set_bytes(num_kv_heads, 8);
    compute_encoder.set_bytes(num_heads, 9);
    compute_encoder.set_bytes(scale, 10);

    compute_encoder.set_bytes(Br, 11);
    compute_encoder.set_bytes(Bc, 12);
    compute_encoder.set_bytes(Tr, 13);
    compute_encoder.set_bytes(Tc, 14);

    MTL::Size num_threadgroups = MTL::Size(N, Tr, 1);
    MTL::Size num_threads_per_group = MTL::Size(Br, simd_width, 1);

    compute_encoder.dispatch_threadgroups(num_threadgroups, num_threads_per_group);
}
}  // namespace tiny_llm_ext_ref

```

## /src/extensions_ref/src/flash_attention.metal

```metal path="/src/extensions_ref/src/flash_attention.metal" 
#include <metal_stdlib>
using namespace metal;

// Flash attention for float32 inputs with E <= 128.
//
// Layout: q is [N, S, E], k and v are [N_KV, L, E], out is [N, S, E].
// One threadgroup handles one (head n, query tile i) pair. Inside it, the
// simdgroup index selects the query row `a` of the tile and the lane index
// selects the key row `b` of the current key/value tile, so each thread owns
// one cell of the Br x Bc score tile and row reductions are simd reductions.
// Scores are `q . k * scale`.
[[kernel]] void flash_attention_f32_e128(
    device const float* q [[buffer(0)]],
    device const float* k [[buffer(1)]],
    device const float* v [[buffer(2)]],
    device float* out [[buffer(3)]],
    [[maybe_unused]] device const int64_t &N [[buffer(4)]],
    device const int64_t &S [[buffer(5)]],
    device const int64_t &L [[buffer(6)]],
    device const int64_t &E [[buffer(7)]],
    device const int64_t &num_kv_heads [[buffer(8)]],
    device const int64_t &num_heads [[buffer(9)]],
    device const float &scale [[buffer(10)]],
    device const int64_t &Br [[buffer(11)]],
    device const int64_t &Bc [[buffer(12)]],
    [[maybe_unused]] device const int64_t &Tr [[buffer(13)]],
    device const int64_t &Tc [[buffer(14)]],
    uint2 group_id [[threadgroup_position_in_grid]],
    uint simd_gid [[simdgroup_index_in_threadgroup]],
    uint simd_lid [[thread_index_in_simdgroup]]) {

    int n = group_id.x;
    int i = group_id.y; // loop over Tr
    int a = simd_gid; // max=Br
    int b = simd_lid; // max=Bc

    // We do not use the shared memory for the threadgroup in this course --
    // this is left as an exercise for the students. For example, you can allocate
    // 128*32*sizeof(float) bytes * number of arrays and use them as the threadgroup
    // shared memory.

    bool is_i_in_range = i * Br + a < S && a < Br;

    // Consecutive groups of q_kv_ratio query heads share one key/value head.
    const int q_kv_ratio = num_heads / num_kv_heads;
    device const float *q_ptr = q + n * S * E + i * Br * E;
    device const float *k_ptr_base = k + (n / q_kv_ratio) * L * E;
    device const float *v_ptr_base = v + (n / q_kv_ratio) * L * E;
    threadgroup float o_i[32][128]; // Br x E, each simd group shares an o_i, only lane 0 writes to it

    if (simd_lid == 0) {
        for (int c = 0; c < E; c++) {
            o_i[a][c] = 0.0;
        }
    }

    // q_ptr: S * E
    // k_ptr: L * E
    // v_ptr: L * E
    // To access q[a, c]: use a * E + c
    // To access k/v[b, c]: use b * E + c

    // Large negative stand-in for "no score yet"; also used for padded lanes.
    const float neg_large = -1e9;
    float m_i = neg_large; // per thread; sync to threadgroup memory later
    float l_i = 0.0; // per thread; sync to threadgroup memory later

    for (int j = 0; j < Tc; j++) {
        bool is_j_in_range = j * Bc + b < L && b < Bc;
        const bool in_range = is_i_in_range && is_j_in_range;

        device const float *k_ptr = k_ptr_base + j * Bc * E;
        device const float *v_ptr = v_ptr_base + j * Bc * E;

        // compute s_i = (q_i @ k_j^T) * scale; store the result of each cell in thread local memory
        float s_a_b = 0.0;
        if (in_range) {
            for (int c = 0; c < E; c++) {
                s_a_b += q_ptr[a * E + c] * k_ptr[b * E + c];
            }
            s_a_b *= scale;
        }
        // for each cell, get the rowmax of the corresponding row, and compute m_i in each
        // of the cells. Padded lanes contribute the sentinel instead of 0, otherwise
        // a row whose real scores are all very negative would use 0 as its maximum
        // and every exp() below would underflow.
        float rowmax = simd_max(in_range ? s_a_b : neg_large);
        float new_max = max(m_i, rowmax);
        float m_i_diff = m_i - new_max;
        float m_i_diff_exp = exp(m_i_diff);
        m_i = new_max;

        // compute matrix p_j for each of the cell
        float p_a_b;
        if (in_range) {
            p_a_b = exp(s_a_b - m_i);
        } else {
            p_a_b = 0.0;
        }

        // compute l
        // get the rowsum of each row of p_j in all of the cells
        float rowsum = simd_sum(p_a_b);
        l_i = m_i_diff_exp * l_i + rowsum;

        // compute o_i, where O is Br x E; every lane contributes its p * v term
        // and the simd reduction yields the row's value for column c
        for (int c = 0; c < E; c++) {
            float pv;
            if (in_range) {
                pv = p_a_b * v_ptr[b * E + c];
            } else {
                pv = 0.0;
            }
            float res = simd_sum(pv); // res = sum(p_a_b * v_j) on each cell
            // only lane 0 will write to threadgroup memory
            if (simd_lid == 0) {
                o_i[a][c] = m_i_diff_exp * o_i[a][c] + res;
            }
        }
    }

    // write to output
    if (simd_lid == 0) {
        for (int c = 0; c < E; c++) {
            o_i[a][c] /= l_i;
        }
        for (int c = 0; c < E; c++) {
            if (is_i_in_range) {
                out[n * S * E + (i * Br + a) * E + c] = o_i[a][c];
            }
        }
    }
}

```

## /src/extensions_ref/src/quantized_matmul.cpp

```cpp path="/src/extensions_ref/src/quantized_matmul.cpp" 
#include <cstdint>
#include <iostream>
#include <sstream>

#include "mlx/backend/common/utils.h"
#include "mlx/backend/cpu/encoder.h"
#include "mlx/utils.h"
#include "tiny_llm_ext.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#include "mlx/backend/metal/utils.h"
#endif

namespace tiny_llm_ext_ref {

/**
 * Build the lazy quantized matmul operation `a @ dequantize(b)^T`.
 *
 * Only one configuration is supported: float16 `a`, `scales` and `biases`,
 * uint32-packed `b`, 4 bits, group size 64, and `transpose_b == true`.
 *
 * Shapes: a is [M, N]; b is [K, N / 8] (eight 4-bit values per uint32);
 * scales and biases are [K, N / group_size]; the result is [M, K].
 *
 * @throws std::runtime_error if any dtype, rank, shape or parameter
 *         constraint is violated.
 */
mx::array quantized_matmul(const mx::array &scales,         // Input array scales
                           const mx::array &biases,         // Input array biases
                           const int group_size,            // Group size
                           const int bits,                  // Number of bits
                           const mx::array &a,              // Input array a (not quantized)
                           const mx::array &b,              // Input array b (quantized)
                           const bool transpose_b,          // Whether to transpose b
                           mx::StreamOrDevice s /* = {} */  // Stream on which to schedule the operation
) {
    if (scales.dtype() != mx::float16 || biases.dtype() != mx::float16) {
        throw std::runtime_error("quantized_matmul: scales and biases must be float16");
    }
    if (b.dtype() != mx::uint32) {
        throw std::runtime_error("quantized_matmul: b must be uint32");
    }
    if (a.dtype() != mx::float16) {
        throw std::runtime_error("quantized_matmul: a must be float16");
    }
    if (a.shape().size() != 2) {
        throw std::runtime_error("quantized_matmul: a must be a 2D array");
    }
    if (b.shape().size() != 2) {
        throw std::runtime_error("quantized_matmul: b must be a 2D array");
    }
    if (bits != 4) {
        throw std::runtime_error("quantized_matmul: bits must be 4");
    }
    if (group_size != 64) {
        throw std::runtime_error("quantized_matmul: group_size must be 64");
    }
    if (!transpose_b) {
        throw std::runtime_error("quantized_matmul: b must be transposed");
    }

    if (scales.shape() != biases.shape()) {
        throw std::runtime_error("quantized_matmul: scales and biases must have the same shape");
    }
    // Both dimensions of scales are indexed below, so its rank has to be
    // checked first (biases has the same shape at this point).
    if (scales.shape().size() != 2) {
        throw std::runtime_error("quantized_matmul: scales and biases must be 2D arrays");
    }
    if (b.shape()[0] != scales.shape()[0]) {
        throw std::runtime_error("quantized_matmul: b must have the same number of rows as scales");
    }
    // Each uint32 of b packs 8 values, each scale covers group_size values.
    if (b.shape()[1] != scales.shape()[1] * group_size / 8) {
        throw std::runtime_error("quantized_matmul: b must have scales.shape[1] * group_size / 8 columns");
    }
    if (a.shape()[1] != b.shape()[1] * 8) {
        throw std::runtime_error("quantized_matmul: a must have 8 times as many columns as b");
    }

    // Result is [M, K]: rows of a by rows of (the transposed) b.
    auto out_shape = a.shape();
    out_shape[1] = b.shape()[0];

    return mx::array(
        /* const mx::Shape& shape = */ out_shape,
        /* mx::Dtype dtype = */ mx::float16,
        /* std::shared_ptr<mx::Primitive> primitive = */
        std::make_shared<QuantizedMatmul>(to_stream(s), group_size, bits),
        /* const std::vector<mx::array>& inputs = */ {scales, biases, a, b});
}

/**
 * CPU kernel for the quantized matmul: out[i, k] = sum_n a[i, n] * deq(b)[k, n],
 * where deq(b) = quantized_value * scale + bias with one (scale, bias) pair per
 * group of `group_size` consecutive values in a row of b.
 *
 * a is [M, N] float16, b is [K, N * bits / 32] uint32 with the lowest `bits`
 * bits of each word holding the first value, scales and biases hold
 * N / group_size entries per row of b, out is [M, K] float16.
 *
 * @param group_size Values per quantization group.
 * @param bits       Bits per quantized value; must divide 32.
 * @param stream     Stream whose CPU command encoder runs the kernel.
 * @throws std::runtime_error if a or b is not row-contiguous.
 */
void quantized_matmul_impl(const mx::array &scales, const mx::array &biases, const mx::array &a, const mx::array &b,
                           mx::array &out, int group_size, int bits, mx::Stream stream) {
    // Validate first, so that nothing is allocated or handed to the encoder
    // for inputs we are going to reject.
    if (!a.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: a must be contiguous");
    }
    if (!b.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: b must be contiguous");
    }

    out.set_data(mx::allocator::malloc(out.nbytes()));

    auto &encoder = mx::cpu::get_command_encoder(stream);
    encoder.set_input_array(scales);
    encoder.set_input_array(biases);
    encoder.set_input_array(a);
    encoder.set_input_array(b);
    encoder.set_output_array(out);

    // Launch the CPU kernel
    encoder.dispatch([out_ptr = out.data<float16_t>(), out_shape = out.shape(), out_strides = out.strides(),
                      a = mx::array::unsafe_weak_copy(a), b = mx::array::unsafe_weak_copy(b),
                      scales = mx::array::unsafe_weak_copy(scales), biases = mx::array::unsafe_weak_copy(biases),
                      group_size, bits]() {
        int M = a.shape()[0];
        int N = a.shape()[1];
        int K = b.shape()[0];
        const int group_per_row = N / group_size;
        const int items_per_word = 32 / bits;  // quantized values packed in one uint32
        const float16_t *a_ptr = a.data<float16_t>();
        const uint32_t *b_ptr = b.data<uint32_t>();
        const float16_t *scales_ptr = scales.data<float16_t>();
        const float16_t *biases_ptr = biases.data<float16_t>();
        const uint32_t item_mask = (1u << bits) - 1;
        for (int i = 0; i < M; i++) {
            for (int k = 0; k < K; k++) {
                float sum = 0;
                for (int group_idx = 0; group_idx < group_per_row; group_idx++) {
                    int64_t scales_loc =
                        mx::elem_to_loc(k * group_per_row + group_idx, scales.shape(), scales.strides());
                    int64_t biases_loc =
                        mx::elem_to_loc(k * group_per_row + group_idx, biases.shape(), biases.strides());
                    float16_t scale = scales_ptr[scales_loc];
                    float16_t bias = biases_ptr[biases_loc];
                    // First packed word of this group in row k of b, and first element of
                    // the matching slice in row i of a.
                    int64_t b_loc =
                        mx::elem_to_loc((k * N + group_idx * group_size) / items_per_word, b.shape(), b.strides());
                    int64_t a_loc = mx::elem_to_loc(i * N + group_idx * group_size, a.shape(), a.strides());
                    for (int item_idx = 0; item_idx < group_size; item_idx += items_per_word) {
                        const uint32_t packed = b_ptr[b_loc];
                        for (int pack_idx = 0; pack_idx < items_per_word; pack_idx++) {
                            // Shift-and-mask on the word itself: independent of byte order.
                            const uint32_t item_val = (packed >> (pack_idx * bits)) & item_mask;
                            const float b_val = static_cast<float>(item_val) * scale + bias;
                            const float a_val = a_ptr[a_loc];
                            sum += a_val * b_val;
                            a_loc += 1;
                        }
                        b_loc += 1;
                    }
                }
                int64_t out_loc = mx::elem_to_loc(i * K + k, out_shape, out_strides);
                out_ptr[out_loc] = static_cast<float16_t>(sum);
            }
        }
    });
}

// CPU evaluation entry point: forwards the primitive's inputs to
// quantized_matmul_impl together with the stored group size and bit width.
// Input order is scales, biases, a, b; the single output is outputs[0].
void QuantizedMatmul::eval_cpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    // TODO: dispatch to f32, f16, bf16
    quantized_matmul_impl(/*scales=*/inputs[0], /*biases=*/inputs[1], /*a=*/inputs[2], /*b=*/inputs[3],
                          /*out=*/outputs[0], group_size_, bits_, stream());
}

// GPU evaluation entry point: launches the `quantized_matmul_w4a16_g64` Metal
// kernel with one thread per output element.
//
// Inputs are, in order: scales, biases, a, b. `a` is read as an (M, N) matrix
// and `b` contributes K rows; the kernel writes out[i * K + k].
//
// Throws std::runtime_error when the configuration or the input layout is not
// something the kernel can handle. All validation happens before the output is
// allocated or anything is bound to the encoder, so a rejected call leaves no
// half-encoded command behind.
void QuantizedMatmul::eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) {
    auto &scales = inputs[0];
    auto &biases = inputs[1];
    auto &a = inputs[2];
    auto &b = inputs[3];
    auto &out = outputs[0];

    // The kernel hard-codes group_size = 64 and bits = 4; any other setting
    // would silently produce wrong results.
    if (group_size_ != 64 || bits_ != 4) {
        throw std::runtime_error("quantized_matmul: the Metal kernel only supports group_size=64 and bits=4");
    }

    // The kernel indexes every buffer linearly, so all inputs must be row-contiguous.
    if (!a.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: a must be contiguous");
    }
    if (!b.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: b must be contiguous");
    }
    if (!scales.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: scales must be contiguous");
    }
    if (!biases.flags().row_contiguous) {
        throw std::runtime_error("quantized_matmul: biases must be contiguous");
    }

    int M = a.shape()[0];
    int N = a.shape()[1];
    int K = b.shape()[0];

    if (N % group_size_ != 0) {
        throw std::runtime_error("quantized_matmul: N must be divisible by group_size");
    }

    auto &s = stream();
    auto &d = mx::metal::device(s.device);
    out.set_data(mx::allocator::malloc(out.nbytes()));

    // Make a kernel from this metal library
    auto kernel = d.get_kernel("quantized_matmul_w4a16_g64", "tiny_llm_ext_ref");

    // Threadgroup geometry: 32 threads along x (rows of a), the rest along y (rows of b).
    size_t tgp_size = kernel->maxTotalThreadsPerThreadgroup();
    const int x_size = 32;
    const int y_size = static_cast<int>(tgp_size / x_size);
    if (y_size < 1) {
        // Guards the division by y_size below.
        throw std::runtime_error("quantized_matmul: kernel threadgroup limit is smaller than 32 threads");
    }

    // Prepare to encode kernel
    auto &compute_encoder = d.get_command_encoder(s.index);
    compute_encoder.set_compute_pipeline_state(kernel);

    // Encode input arrays to kernel
    compute_encoder.set_input_array(scales, 0);
    compute_encoder.set_input_array(biases, 1);
    compute_encoder.set_input_array(a, 2);
    compute_encoder.set_input_array(b, 3);
    // Encode output arrays to kernel
    compute_encoder.set_output_array(out, 4);

    // Encode matrix parameters
    compute_encoder.set_bytes(M, 5);
    compute_encoder.set_bytes(N, 6);
    compute_encoder.set_bytes(K, 7);

    // Round the grid up so every (i, k) output element is covered; the kernel
    // bounds-checks the overhang.
    MTL::Size num_threadgroups = MTL::Size((M + x_size - 1) / x_size, (K + y_size - 1) / y_size, 1);
    MTL::Size num_threads_per_group = MTL::Size(x_size, y_size, 1);

    // Launch the grid with the given number of threads divided among
    // the given threadgroups
    compute_encoder.dispatch_threadgroups(num_threadgroups, num_threads_per_group);
}

bool QuantizedMatmul::is_equivalent(const Primitive &other) const {
    const QuantizedMatmul &r_other = static_cast<const QuantizedMatmul &>(other);
    return group_size_ == r_other.group_size_ && bits_ == r_other.bits_;
}

}  // namespace tiny_llm_ext_ref

```

## /src/extensions_ref/src/quantized_matmul.metal

```metal path="/src/extensions_ref/src/quantized_matmul.metal" 
// Quantized matmul kernel for 4-bit weights, half activations, group size 64.
// A is M x N (half), B holds K rows of N 4-bit values packed eight per uint32,
// and each group of 64 values along N shares one scale and one bias.
// Each thread produces one output element out[i * K + k].
[[kernel]] void quantized_matmul_w4a16_g64(
    device const half* scales [[buffer(0)]],
    device const half* biases [[buffer(1)]],
    device const half* a [[buffer(2)]],
    device const uint32_t* b [[buffer(3)]],
    device half* out [[buffer(4)]],
    device const int &M [[buffer(5)]],
    device const int &N [[buffer(6)]],
    device const int &K [[buffer(7)]],
    uint3 group_id [[threadgroup_position_in_grid]],
    uint3 thread_id [[thread_position_in_threadgroup]],
    uint3 threads_per_threadgroup [[threads_per_threadgroup]],
    threadgroup char * shmem [[threadgroup(0)]]) {
    const int group_size = 64;
    const int bits = 4;
    const int packs_per_item = 32 / bits;  // 4-bit values per packed uint32
    const int mask = (1 << bits) - 1;

    // Output coordinates handled by this thread.
    const int i = group_id.x * threads_per_threadgroup.x + thread_id.x;
    const int k = group_id.y * threads_per_threadgroup.y + thread_id.y;

    // The grid is rounded up on the host side; threads past the edge do nothing.
    if (i >= M || k >= K) {
        return;
    }

    const int groups_per_row = N / group_size;
    int scales_biases_loc = k * groups_per_row;
    int b_loc = k * N / packs_per_item;
    int a_loc = i * N;
    float acc = 0;

    for (int group_idx = 0; group_idx < groups_per_row; group_idx++, scales_biases_loc++) {
        const float scale = scales[scales_biases_loc];
        const float bias = biases[scales_biases_loc];
        for (int item_idx = 0; item_idx < group_size; item_idx += packs_per_item, b_loc++) {
            const uint32_t packed = b[b_loc];
            // Unpack from the lowest nibble upwards, dequantize, and accumulate.
            for (int p = 0; p < packs_per_item; p++) {
                acc += (static_cast<float>((packed >> (p * bits)) & mask) * scale + bias) *
                       static_cast<float>(a[a_loc + p]);
            }
            a_loc += packs_per_item;
        }
    }
    out[i * K + k] = acc;
}

```

## /src/extensions_ref/src/tiny_llm_ext.h

```h path="/src/extensions_ref/src/tiny_llm_ext.h" 
#pragma once

#include "mlx/ops.h"
#include "mlx/primitives.h"

namespace mx = mlx::core;

namespace tiny_llm_ext_ref {

// Registers the extension's Metal library for device `d`, looking it up at
// `path`. The definition is a no-op when the build has no Metal support.
void load_library(mx::Device d, const char *path);

// Builds a quantized matrix multiplication of `a` with the quantized matrix `b`.
// `scales` and `biases` are the per-group dequantization parameters for `b`;
// `group_size` and `bits` describe how `b` was quantized.
// NOTE(review): the definition is not in this header — confirm the supported
// dtypes, shapes and `transpose_b` values against the implementation.
mx::array quantized_matmul(const mx::array &scales,   // Input array scales
                           const mx::array &biases,   // Input array biases
                           const int group_size,      // Group size
                           const int bits,            // Number of bits
                           const mx::array &a,        // Input array a (not quantized)
                           const mx::array &b,        // Input array b (quantized)
                           const bool transpose_b,    // Whether to transpose b
                           mx::StreamOrDevice s = {}  // Stream on which to schedule the operation
);

// MLX primitive backing quantized_matmul(). It stores only the quantization
// parameters; the arrays arrive through eval_cpu / eval_gpu.
class QuantizedMatmul : public mx::Primitive {
public:
    // `stream` is forwarded to mx::Primitive; `group_size` and `bits` are kept
    // for evaluation and for is_equivalent().
    explicit QuantizedMatmul(mx::Stream stream, const int group_size, const int bits)
        : mx::Primitive(stream), group_size_(group_size), bits_(bits) {};

    // Backend-specific evaluation; both are defined out of line.
    void eval_cpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) override;
    void eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) override;

    // vmap is not supported: always throws std::runtime_error.
    std::pair<std::vector<mx::array>, std::vector<int>> vmap(const std::vector<mx::array> &inputs,
                                                             const std::vector<int> &axes) override {
        throw std::runtime_error("QuantizedMatmul has no vmap implementation.");
    }

    // Writes the primitive's name to `os`.
    void print(std::ostream &os) override { os << "QuantizedMatmul"; }

    // Defined out of line.
    bool is_equivalent(const mx::Primitive &other) const override;

private:
    int group_size_;  // number of quantized values sharing one scale/bias
    int bits_;        // bit width of each quantized value
};

// Builds an attention operation over q, k and v scaled by `scale`, with
// `num_heads` query heads and `num_kv_heads` key/value heads, scheduled on `s`.
// NOTE(review): expected shapes and dtypes are set by the definition, which is
// not in this header — confirm there.
mx::array flash_attention(const mx::array &q, const mx::array &k, const mx::array &v, const float scale,
                          const int num_kv_heads, const int num_heads, mx::StreamOrDevice s = {});

// MLX primitive backing flash_attention(). It stores the scaling factor and
// the two head counts; the arrays arrive through eval_cpu / eval_gpu.
class FlashAttention : public mx::Primitive {
public:
    // `stream` is forwarded to mx::Primitive; the remaining arguments are kept
    // for evaluation and for is_equivalent().
    explicit FlashAttention(mx::Stream stream, const float scale, const int num_kv_heads, const int num_heads)
        : mx::Primitive(stream), scale_(scale), num_kv_heads_(num_kv_heads), num_heads_(num_heads) {};

    // Backend-specific evaluation; both are defined out of line.
    void eval_cpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) override;
    void eval_gpu(const std::vector<mx::array> &inputs, std::vector<mx::array> &outputs) override;

    // vmap is not supported: always throws std::runtime_error.
    std::pair<std::vector<mx::array>, std::vector<int>> vmap(const std::vector<mx::array> &inputs,
                                                             const std::vector<int> &axes) override {
        throw std::runtime_error("FlashAttention has no vmap implementation.");
    }

    // Writes the primitive's name to `os`.
    void print(std::ostream &os) override { os << "FlashAttention"; }

    // Equivalent when scale and both head counts match. `other` is cast
    // without a type check. Note that `scale_` is compared with exact
    // floating-point equality.
    bool is_equivalent(const mx::Primitive &other) const override {
        const FlashAttention &r_other = static_cast<const FlashAttention &>(other);
        return scale_ == r_other.scale_ && num_kv_heads_ == r_other.num_kv_heads_ && num_heads_ == r_other.num_heads_;
    }

private:
    float scale_;       // multiplier passed in by the caller
    int num_kv_heads_;  // number of key/value heads
    int num_heads_;     // number of query heads
};

}  // namespace tiny_llm_ext_ref

```

## /src/extensions_ref/src/utils.cpp

```cpp path="/src/extensions_ref/src/utils.cpp" 
#include "tiny_llm_ext.h"

#ifdef _METAL_
#include "mlx/backend/metal/device.h"
#endif

namespace tiny_llm_ext_ref {

// Registers the Metal library found at `path` under the name
// "tiny_llm_ext_ref" for device `d`. Compiles to a no-op without _METAL_.
void load_library(mx::Device d, const char *path) {
#ifdef _METAL_
    mx::metal::device(d).register_library("tiny_llm_ext_ref", path);
#endif
}

}  // namespace tiny_llm_ext_ref

```

## /src/extensions_ref/test.py

```py path="/src/extensions_ref/test.py" 
from tiny_llm_ext_ref import quantized_matmul
import mlx.core as mx
import numpy as np

# Smoke test: quantize a random weight matrix with MLX and run it through the
# extension's quantized_matmul, printing the result.
GROUP_SIZE = 64
BITS = 4

precision = np.float16
# Named `x` rather than `input` so the builtin is not shadowed.
x = mx.array(np.random.randn(3, 64).astype(precision))
weight = mx.array(np.random.randn(5, 64).astype(precision))
# Quantize with the same parameters that are handed to the kernel below, so the
# two can never drift apart.
w_q, scales, biases = mx.quantize(weight, group_size=GROUP_SIZE, bits=BITS)
user_out = quantized_matmul(
    scales=scales,
    biases=biases,
    group_size=GROUP_SIZE,
    bits=BITS,
    a=x,
    b=w_q,
    transpose_b=True,
)
print(user_out)

```

## /src/extensions_ref/tiny_llm_ext_ref/__init__.py

```py path="/src/extensions_ref/tiny_llm_ext_ref/__init__.py" 
# Copyright © 2023 Apple Inc.

from pathlib import Path

import mlx.core as mx

# Import the compiled extension and register its Metal library, which is
# looked up in the directory containing this package.
try:
    from ._ext import *

    current_path = Path(__file__).parent
    load_library(mx.gpu, str(current_path))
except ImportError as exc:
    # Loading stays best-effort, but the underlying reason is reported so a
    # missing build can be told apart from a broken one.
    print(f"Failed to load C++/Metal extension: {exc}")

```

## /src/tiny_llm/__init__.py

```py path="/src/tiny_llm/__init__.py" 
from .attention import *
from .basics import *
from .embedding import *
from .layer_norm import *
from .positional_encoding import *
from .quantize import *
from .generate import *
from .qwen2_week1 import Qwen2ModelWeek1
from .qwen2_week2 import Qwen2ModelWeek2
from .sampler import *

```

## /src/tiny_llm/attention.py

```py path="/src/tiny_llm/attention.py" 
import mlx.core as mx
from .basics import softmax, linear


def scaled_dot_product_attention_simple(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
    mask: mx.array | None = None,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version (tiny_llm_ref/attention.py) computes
    softmax(query @ key^T * scale + mask) @ value, where scale defaults to
    rsqrt(query.shape[-1]) and mask, when given, is added to the scores.
    """
    pass


class SimpleMultiHeadAttention:
    """
    Exercise stub for multi-head attention; neither method is implemented yet.

    The reference version (tiny_llm_ref/attention.py) projects query/key/value
    with wq/wk/wv, splits them into num_heads heads, applies
    scaled_dot_product_attention_simple, merges the heads and projects with wo.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self,
        query: mx.array,
        key: mx.array,
        value: mx.array,
        mask: mx.array | None = None,
    ) -> mx.array:
        # Stub: currently returns None.
        pass


def causal_mask(L: int, S: int, dtype: mx.Dtype) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version builds an (L, S) additive mask of the given dtype
    that is 0 on and below the diagonal shifted by S - L and -inf elsewhere.
    """
    pass


def scaled_dot_product_attention_grouped(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
    mask: mx.array | str | None = None,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version handles grouped-query attention: the number of query
    heads must be a multiple of the number of key/value heads, and `mask` may
    be an additive array or the string "causal".
    """
    pass


def flash_attention(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version flattens the batch and head dimensions, makes the
    inputs contiguous, and delegates to the C++/Metal extension's
    flash_attention.
    """
    pass

```

## /src/tiny_llm/basics.py

```py path="/src/tiny_llm/basics.py" 
import mlx.core as mx
import math


def softmax(x: mx.array, axis: int) -> mx.array:
    """Return the softmax of `x` along `axis` (currently MLX's built-in)."""
    # TODO: manual implementation
    normalized = mx.softmax(x, axis=axis)
    return normalized


def linear(
    x: mx.array,
    w: mx.array,
    bias: mx.array | None = None,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version returns x @ w.T, plus `bias` when one is given.
    """
    pass


def silu(x: mx.array) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version returns x / (1 + exp(-x)).
    """
    pass

```

## /src/tiny_llm/embedding.py

```py path="/src/tiny_llm/embedding.py" 
import mlx.core as mx


class Embedding:
    """
    Exercise stub for a token embedding table; no method is implemented yet.

    The reference version (tiny_llm_ref/embedding.py) looks rows of `weight`
    up by token id in __call__ and reuses `weight` as a linear layer in
    as_linear.
    """

    def __init__(self, vocab_size: int, embedding_dim: int, weight: mx.array):
        # Stub: stores nothing yet.
        pass

    def __call__(self, x: mx.array) -> mx.array:
        # Stub: currently returns None.
        pass

    def as_linear(self, x: mx.array) -> mx.array:
        # Stub: currently returns None.
        pass

```

## /src/tiny_llm/generate.py

```py path="/src/tiny_llm/generate.py" 
import mlx.core as mx
from mlx_lm.tokenizer_utils import TokenizerWrapper
from .qwen2_week1 import Qwen2ModelWeek1
from .qwen2_week2 import Qwen2ModelWeek2
from typing import Callable


def simple_generate(
    model: Qwen2ModelWeek1,
    tokenizer: TokenizerWrapper,
    prompt: str,
    sampler: Callable[[mx.array], mx.array] | None,
) -> str:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version encodes `prompt`, then repeatedly runs the model on
    all tokens so far, picks the next token with `sampler` (argmax when it is
    None) and streams the decoded text until the EOS token appears.
    """
    pass


def simple_generate_with_kv_cache(
    model: Qwen2ModelWeek2, tokenizer: TokenizerWrapper, prompt: str
) -> str:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    The reference version keeps one key/value cache per model layer, prefills
    the prompt in chunks, and then decodes one token at a time with argmax.
    """
    pass


def batch_generate(
    model: any,
    tokenizer: TokenizerWrapper,
    prompts: list[str],
    max_seq_len=512,
    batch_size=5,
    prefill_step=128,
):
    """
    Exercise stub: not implemented yet, so it currently returns None.

    Presumably generates completions for several prompts at once, decoding up
    to `batch_size` requests together and prefilling `prefill_step` tokens at
    a time — TODO confirm against the reference implementation.
    """
    pass

```

## /src/tiny_llm/kv_cache.py

```py path="/src/tiny_llm/kv_cache.py" 
from typing import Optional

import mlx.core as mx


class TinyKvCache:
    """Base interface for key/value caches; subclasses override update_and_fetch."""

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        # Stub: currently returns None.
        # Presumably appends `key`/`value` and returns the cached keys, cached
        # values and an offset — TODO confirm the meaning of the int.
        pass


class BatchingKvCache(TinyKvCache):
    """
    Exercise stub for a cache shared by several concurrent requests; no method
    is implemented yet.

    Presumably holds up to `max_active_requests` slots of at most
    `max_seq_len` tokens each — TODO confirm against the reference
    implementation.
    """

    def __init__(self, max_active_requests: int, max_seq_len: int):
        # Stub: stores nothing yet.
        pass

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        # Stub: currently returns None.
        pass

    def add_request(self, prefilled: TinyKvCache, id: int):
        # Stub. Presumably installs an already-prefilled cache into slot `id`
        # — TODO confirm.
        pass

    def remove_request(self, id: int):
        # Stub. Presumably frees slot `id` — TODO confirm.
        pass


class TinyKvFullCache(TinyKvCache):
    """
    Exercise stub for a cache that keeps every key/value seen so far; no method
    is implemented yet.

    NOTE(review): the reference generation code reads `key_values[0]` and
    `key_values[1]` from instances of this class — confirm the attribute
    layout when implementing.
    """

    def __init__(self):
        # Stub: stores nothing yet.
        pass

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        # Stub: currently returns None.
        pass


class TinyKvRotatingCache(TinyKvCache):
    """
    Exercise stub for a bounded cache; no method is implemented yet.

    Presumably keeps at most `max_seq_len` positions and overwrites the oldest
    ones — TODO confirm.
    """

    def __init__(self, max_seq_len: int):
        # Stub: stores nothing yet.
        pass

    # NOTE(review): unlike the base class, this override takes an extra
    # `offset` argument and is annotated as returning a 2-tuple.
    def update_and_fetch(
        self, key: mx.array, value: mx.array, offset: int
    ) -> tuple[mx.array, mx.array]:
        # Stub: currently returns None.
        pass

```

## /src/tiny_llm/layer_norm.py

```py path="/src/tiny_llm/layer_norm.py" 
import mlx.core as mx


class RMSNorm:
    """
    Exercise stub; neither method is implemented yet.

    Presumably root-mean-square layer normalization over the last `dim`
    features, scaled by `weight` with `eps` for numerical stability — TODO
    confirm against the course material.
    """

    def __init__(self, dim: int, weight: mx.array, eps: float = 1e-5):
        # Stub: stores nothing yet.
        pass

    def __call__(self, x: mx.array) -> mx.array:
        # Stub: currently returns None.
        pass

```

## /src/tiny_llm/positional_encoding.py

```py path="/src/tiny_llm/positional_encoding.py" 
import mlx.core as mx


class RoPE:
    """
    Exercise stub; neither method is implemented yet.

    Presumably rotary positional encoding over `dims` features for up to
    `seq_len` positions, with frequency base `base` — TODO confirm, including
    what `traditional` selects.
    """

    def __init__(
        self,
        dims: int,
        seq_len: int,
        base: int = 10000,
        traditional: bool = False,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self, x: mx.array, offset: list[slice] | slice | None = None
    ) -> mx.array:
        # Stub: currently returns None.
        # `offset` presumably selects the positions to encode (one slice per
        # batch row when a list is given) — TODO confirm.
        pass

```

## /src/tiny_llm/quantize.py

```py path="/src/tiny_llm/quantize.py" 
import mlx.core as mx
from typing import Any


def dequantize_linear(mx_layer: Any) -> mx.array:
    """
    Return the dequantized weight of a quantized MLX layer.

    `mx_layer` must expose `weight`, `scales`, `biases`, `group_size` and
    `bits`; they are handed to `mx.dequantize` in that order.
    """
    return mx.dequantize(
        mx_layer.weight,
        mx_layer.scales,
        mx_layer.biases,
        mx_layer.group_size,
        mx_layer.bits,
    )


class QuantizedWeights:
    """Plain holder for a quantized weight and its dequantization parameters."""

    def __init__(
        self,
        scales: mx.array,
        biases: mx.array,
        group_size: int,
        bits: int,
        weight: mx.array,
    ):
        # Packed quantized values.
        self.weight = weight
        # Per-group dequantization parameters.
        self.scales, self.biases = scales, biases
        # Quantization layout.
        self.group_size, self.bits = group_size, bits

    @staticmethod
    def from_mlx_layer(mlx_layer: Any) -> "QuantizedWeights":
        """Copy the five quantization attributes off an MLX quantized layer."""
        return QuantizedWeights(
            mlx_layer.scales,
            mlx_layer.biases,
            mlx_layer.group_size,
            mlx_layer.bits,
            mlx_layer.weight,
        )


def quantized_matmul(
    scales: mx.array,
    biases: mx.array,
    group_size: int,
    bits: int,
    a: mx.array,
    b: mx.array,
    transpose_b: bool = False,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    Presumably multiplies `a` by the quantized matrix `b`, dequantizing it
    with `scales`/`biases` — TODO confirm the expected shapes and which
    `transpose_b` values must be supported.
    """
    pass


def quantized_linear(
    x: mx.array,
    w: QuantizedWeights,
    bias: mx.array | None = None,
) -> mx.array:
    """
    Exercise stub: not implemented yet, so it currently returns None.

    Presumably the quantized counterpart of `linear`: x times the transposed
    quantized weight, plus `bias` when given — TODO confirm.
    """
    pass

```

## /src/tiny_llm/qwen2_week1.py

```py path="/src/tiny_llm/qwen2_week1.py" 
import mlx.core as mx
from .basics import linear, silu
from .attention import scaled_dot_product_attention_grouped
from .layer_norm import RMSNorm
from .positional_encoding import RoPE
from typing import Any
from .embedding import Embedding
from .quantize import dequantize_linear


class Qwen2MultiHeadAttention:
    """
    Exercise stub for the week-1 Qwen2 attention layer; neither method is
    implemented yet.

    Takes separate query/key/value/output weights plus query/key/value biases
    and distinct head counts for queries and keys/values. `max_seq_len` and
    `theta` are presumably the positional-encoding settings — TODO confirm.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self,
        x: mx.array,
        offset: int,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2MLP:
    """
    Exercise stub for the week-1 Qwen2 feed-forward block; neither method is
    implemented yet.

    Takes gate, up and down projection weights; presumably a gated MLP —
    TODO confirm against the course material.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        w_gate: mx.array,
        w_up: mx.array,
        w_down: mx.array,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(self, x: mx.array) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2TransformerBlock:
    """
    Exercise stub for one week-1 Qwen2 transformer layer; neither method is
    implemented yet.

    The constructor receives the attention weights and biases, the MLP
    weights and two layer-norm weights; presumably it wires an attention
    block and an MLP with normalization around them — TODO confirm.
    """

    def __init__(
        self,
        num_attention_heads: int,
        num_kv_heads: int,
        hidden_size: int,
        intermediate_size: int,
        rms_norm_eps: float,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        w_gate: mx.array,
        w_up: mx.array,
        w_down: mx.array,
        w_input_layernorm: mx.array,
        w_post_attention_layernorm: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self,
        x: mx.array,
        offset: int,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2ModelWeek1:
    """
    Exercise stub for the week-1 Qwen2 model; neither method is implemented
    yet.

    Built from an already-loaded MLX model object. The reference generation
    code calls it as `model(tokens[None], offset)` and reads logits from the
    result.
    """

    def __init__(self, mlx_model: Any):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self,
        inputs: mx.array,
        offset: int,
    ) -> mx.array:
        # Stub: currently returns None.
        pass

```

## /src/tiny_llm/qwen2_week2.py

```py path="/src/tiny_llm/qwen2_week2.py" 
import mlx.core as mx
from .basics import linear, silu
from .attention import scaled_dot_product_attention_grouped
from .layer_norm import RMSNorm
from .positional_encoding import RoPE
from typing import Any
from .embedding import Embedding
from .quantize import dequantize_linear, QuantizedWeights
from .kv_cache import TinyKvCache


class Qwen2MultiHeadAttention:
    """
    Exercise stub for the week-2 Qwen2 attention layer; neither method is
    implemented yet.

    Differs from the week-1 signature in that the four projection weights are
    QuantizedWeights and __call__ takes per-request offsets plus a key/value
    cache.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        wq: QuantizedWeights,
        wk: QuantizedWeights,
        wv: QuantizedWeights,
        wo: QuantizedWeights,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        # Stub: stores nothing yet.
        pass

    # NOTE(review): this takes `offsets: list[int]`, while the enclosing
    # Qwen2TransformerBlock and Qwen2ModelWeek2 are annotated with
    # `offset: int` — confirm which is intended.
    def __call__(
        self,
        x: mx.array,
        offsets: list[int],
        cache: TinyKvCache,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2MLP:
    """
    Exercise stub for the week-2 Qwen2 feed-forward block; neither method is
    implemented yet.

    Same signature as the week-1 version except that the gate, up and down
    weights are QuantizedWeights.
    """

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        w_gate: QuantizedWeights,
        w_up: QuantizedWeights,
        w_down: QuantizedWeights,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(self, x: mx.array) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2TransformerBlock:
    """
    Exercise stub for one week-2 Qwen2 transformer layer; neither method is
    implemented yet.

    Same signature as the week-1 version except that the projection and MLP
    weights are QuantizedWeights and __call__ also receives this layer's
    key/value cache.
    """

    def __init__(
        self,
        num_attention_heads: int,
        num_kv_heads: int,
        hidden_size: int,
        intermediate_size: int,
        rms_norm_eps: float,
        wq: QuantizedWeights,
        wk: QuantizedWeights,
        wv: QuantizedWeights,
        wo: QuantizedWeights,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        w_gate: QuantizedWeights,
        w_up: QuantizedWeights,
        w_down: QuantizedWeights,
        w_input_layernorm: mx.array,
        w_post_attention_layernorm: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        # Stub: stores nothing yet.
        pass

    def __call__(
        self,
        x: mx.array,
        offset: int,
        cache: TinyKvCache,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        # Stub: currently returns None.
        pass


class Qwen2ModelWeek2:
    """
    Exercise stub for the week-2 Qwen2 model; neither method is implemented
    yet.

    Built from an already-loaded MLX model object. The reference generation
    code reads `num_hidden_layers` from instances and passes one cache per
    layer as `cache`.
    """

    def __init__(self, mlx_model: Any):
        # Stub: stores nothing yet.
        pass

    # NOTE(review): `offset` is annotated as int here, but the reference batch
    # code also passes a list of per-request offsets — confirm.
    def __call__(
        self,
        inputs: mx.array,
        offset: int,
        cache: list[TinyKvCache],
    ) -> mx.array:
        # Stub: currently returns None.
        pass

```

## /src/tiny_llm/sampler.py

```py path="/src/tiny_llm/sampler.py" 
import mlx.core as mx
import copy


def make_sampler(temp: float, top_p: float, top_k: int | None):
    """
    Build a sampling function that maps log-probabilities to token ids.

    Only the greedy case is implemented: with temp == 0 the returned function
    takes the argmax over the last axis. For any other temperature it is still
    an exercise stub and returns None; `top_p` and `top_k` are not used yet.
    """

    def sample(logprobs: mx.array):
        if temp == 0:
            # Greedy decoding.
            return mx.argmax(logprobs, axis=-1)
        # Stub: temperature / top-p / top-k sampling is not implemented.
        pass

    return sample

```

## /src/tiny_llm_ref/__init__.py

```py path="/src/tiny_llm_ref/__init__.py" 
from .attention import *
from .basics import *
from .embedding import *
from .layer_norm import *
from .positional_encoding import *
from .quantize import *
from .generate import *
from .kv_cache import *
from .qwen2_week1 import Qwen2ModelWeek1
from .qwen2_week2 import Qwen2ModelWeek2
from .sampler import *

```

## /src/tiny_llm_ref/attention.py

```py path="/src/tiny_llm_ref/attention.py" 
import mlx.core as mx
from .basics import softmax, linear
from extensions_ref import tiny_llm_ext_ref


def scaled_dot_product_attention_simple(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
    mask: mx.array | None = None,
) -> mx.array:
    """
    Plain scaled dot-product attention.

    Computes softmax(query @ key^T * scale + mask) @ value over the last two
    axes. `scale` defaults to rsqrt of the last query dimension; `mask`, when
    given, is added to the scores, so it is expected to be a float array.
    Q, K and V are assumed to share the same shape.
    """
    if scale is None:
        factor = mx.rsqrt(query.shape[-1])
    else:
        factor = scale
    logits = mx.matmul(query, key.swapaxes(-2, -1)) * factor
    if mask is not None:
        logits = logits + mask
    weights = softmax(logits, axis=-1)
    return mx.matmul(weights, value)


def causal_mask(L: int, S: int, dtype: mx.Dtype) -> mx.array:
    """
    Return an (L, S) additive causal mask of the given dtype.

    Entries on or below the diagonal shifted by S - L are 0; all others are
    -inf.
    """
    visible = mx.tril(mx.ones((L, S)), k=(S - L))
    additive = mx.where(visible, mx.array(0), mx.array(-mx.inf))
    return additive.astype(dtype)


def scaled_dot_product_attention_grouped(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
    mask: mx.array | str | None = None,
) -> mx.array:
    """
    Grouped-query scaled dot-product attention.

    The last three axes of `query` are (H_q, L, D) and those of `key` are
    (H, S, D); H_q must be a multiple of H. Each key/value head is shared by
    H_q // H query heads through broadcasting. `mask` may be None, the string
    "causal", or an additive array that can be reshaped to
    (-1, H, H_q // H, mask.shape[-2], mask.shape[-1]). The result has the
    shape of `query`.
    """
    if scale is None:
        factor = mx.rsqrt(query.shape[-1])
    else:
        factor = mx.array(scale)
    factor = factor.astype(query.dtype)
    out_shape = query.shape

    H_q, L, D = query.shape[-3:]
    H, S, _ = key.shape[-3:]
    assert H_q % H == 0
    n_repeats = H_q // H

    # Give the key/value heads a singleton "repeat" axis so they broadcast
    # against the n_repeats query heads that share them.
    q = query.reshape(-1, H, n_repeats, L, D)
    k = key.reshape(-1, H, 1, S, D)
    v = value.reshape(-1, H, 1, S, D)

    scores = mx.matmul(q, k.swapaxes(-2, -1)) * factor
    if mask is not None:
        if mask == "causal":
            additive = causal_mask(L, S, scores.dtype)
        else:
            additive = mask.reshape(
                -1, H, n_repeats, mask.shape[-2], mask.shape[-1]
            )
        scores = scores + additive
    attention = softmax(scores, axis=-1)
    return mx.matmul(attention, v).reshape(out_shape)


def flash_attention(
    query: mx.array,
    key: mx.array,
    value: mx.array,
    scale: float | None = None,
) -> mx.array:
    """
    Attention computed by the C++/Metal extension.

    Args:
        query: array whose last three axes are (H_q, S, E); leading axes are
            batch dimensions.
        key: array whose last three axes are (H, L, E).
        value: array that can be reshaped like `key`.
        scale: multiplier applied by the kernel. Defaults to E ** -0.5, in
            line with the other attention functions in this module; the
            extension takes a plain float, so None is never forwarded.

    Returns:
        An array of shape (*batch, H_q, S, E).
    """
    *B, H_q, S, E = query.shape
    # Read the head and length axes from the end so key may carry the same
    # batch dimensions as query, whatever their number.
    H, L, _ = key.shape[-3:]
    assert H_q % H == 0
    if scale is None:
        scale = E**-0.5
    # The extension works on contiguous (batch * heads, length, E) arrays.
    query = mx.contiguous(query.reshape(-1, S, E))
    key = mx.contiguous(key.reshape(-1, L, E))
    value = mx.contiguous(value.reshape(-1, L, E))
    result = tiny_llm_ext_ref.flash_attention(
        query, key, value, scale, num_heads=H_q, num_kv_heads=H
    )
    return result.reshape(*B, H_q, S, E)


class SimpleMultiHeadAttention:
    """
    Multi-head attention with equally sized query, key and value inputs.

    The weights follow the `linear` convention (applied as x @ w.T): wq, wk
    and wv have shape (hidden_size, num_heads * head_dim) and wo has shape
    (num_heads * head_dim, hidden_size), where head_dim is
    hidden_size // num_heads.
    """

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
    ):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        assert hidden_size % num_heads == 0
        self.head_dim = hidden_size // num_heads
        self.scale = mx.rsqrt(self.head_dim)

        projected = num_heads * self.head_dim
        assert wq.shape == (hidden_size, projected)
        assert wk.shape == (hidden_size, projected)
        assert wv.shape == (hidden_size, projected)
        assert wo.shape == (projected, hidden_size)
        self.wq, self.wk, self.wv, self.wo = wq, wk, wv, wo

    def __call__(
        self,
        query: mx.array,
        key: mx.array,
        value: mx.array,
        mask: mx.array | None = None,
    ) -> mx.array:
        """Attend over (N, L, hidden_size) inputs and return the same shape."""
        N, L, _ = query.shape
        assert query.shape == key.shape == value.shape

        def project_heads(x: mx.array, w: mx.array) -> mx.array:
            # (N, L, hidden) -> (N, num_heads, L, head_dim)
            heads = linear(x, w).reshape(N, L, self.num_heads, self.head_dim)
            return heads.transpose(0, 2, 1, 3)

        q = project_heads(query, self.wq)
        k = project_heads(key, self.wk)
        v = project_heads(value, self.wv)
        attended = scaled_dot_product_attention_simple(
            q, k, v, scale=self.scale, mask=mask
        )
        # (N, num_heads, L, head_dim) -> (N, L, hidden)
        merged = attended.transpose(0, 2, 1, 3).reshape(N, L, self.hidden_size)
        return linear(merged, self.wo)

```

## /src/tiny_llm_ref/basics.py

```py path="/src/tiny_llm_ref/basics.py" 
import mlx.core as mx
import math


def softmax(x: mx.array, axis: int) -> mx.array:
    """Return the softmax of `x` along `axis` (currently MLX's built-in)."""
    # TODO: manual implementation
    normalized = mx.softmax(x, axis=axis)
    return normalized


def linear(
    x: mx.array,
    w: mx.array,
    bias: mx.array | None = None,
) -> mx.array:
    if bias is not None:
        return mx.matmul(x, w.T) + bias
    else:
        return mx.matmul(x, w.T)


def silu(x: mx.array) -> mx.array:
    """Return the SiLU activation x / (1 + exp(-x)), element-wise."""
    denominator = 1 + mx.exp(-x)
    return x / denominator

```

## /src/tiny_llm_ref/embedding.py

```py path="/src/tiny_llm_ref/embedding.py" 
import mlx.core as mx
from .basics import linear


class Embedding:
    """Token embedding table that can also act as an output projection."""

    def __init__(self, vocab_size: int, embedding_dim: int, weight: mx.array):
        self.weight = weight
        self.vocab_size, self.embedding_dim = vocab_size, embedding_dim

    def __call__(self, x: mx.array) -> mx.array:
        """Look up the rows of `weight` selected by the indices in `x`."""
        rows = self.weight[x, :]
        return rows

    def as_linear(self, x: mx.array) -> mx.array:
        """Apply `weight` as a bias-free linear layer to `x`."""
        return linear(x, self.weight)

```

## /src/tiny_llm_ref/generate.py

```py path="/src/tiny_llm_ref/generate.py" 
import mlx.core as mx
from mlx_lm.tokenizer_utils import TokenizerWrapper
from .kv_cache import *
from .qwen2_week1 import Qwen2ModelWeek1
from .qwen2_week2 import Qwen2ModelWeek2
from typing import Callable


def simple_generate(
    model: Qwen2ModelWeek1,
    tokenizer: TokenizerWrapper,
    prompt: str,
    sampler: Callable[[mx.array], mx.array] | None,
) -> str:
    """
    Generate a completion for `prompt` without a key/value cache.

    Every step feeds all tokens so far through the model, picks the next token
    with `sampler` (argmax when it is None) and prints the newly decoded text.
    Generation stops at the tokenizer's EOS token, which is not emitted.

    Returns:
        The generated text, i.e. everything that was streamed to stdout.
    """

    def _step(model, y, offset):
        logits = model(y[None], offset)
        logits = logits[:, -1, :]
        # Normalizing to log-probabilities is optional for argmax but is what
        # samplers expect.
        logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
        if sampler is None:
            return mx.argmax(logprobs, axis=-1)
        return sampler(logprobs)

    # prefill with the prompt
    tokens = mx.array(tokenizer.encode(prompt, add_special_tokens=False))
    detokenizer = tokenizer.detokenizer
    detokenizer.reset()
    # generate/decode
    while True:
        token = _step(model, tokens, tokens.size)
        mx.eval(token)
        tokens = mx.concat([tokens, token])
        token_id = token.item()
        if token_id == tokenizer.eos_token_id:
            break
        detokenizer.add_token(token_id)
        print(detokenizer.last_segment, end="", flush=True)
    # Flush text the streaming detokenizer was still holding back, so that the
    # printed and the returned output are both complete.
    detokenizer.finalize()
    print(detokenizer.last_segment, end="", flush=True)
    return detokenizer.text


def simple_generate_with_kv_cache(
    model: Qwen2ModelWeek2, tokenizer: TokenizerWrapper, prompt: str
) -> str:
    """
    Generate a completion for `prompt` greedily, using one key/value cache per
    model layer.

    The prompt is prefilled in chunks of 64 tokens; after that, one token is
    decoded per step and the newly decoded text is printed. Generation stops
    at the tokenizer's EOS token, which is not emitted (matching
    `simple_generate`).

    Returns:
        The generated text, i.e. everything that was streamed to stdout after
        the prefill progress lines.
    """
    kv_cache = [TinyKvFullCache() for _ in range(model.num_hidden_layers)]

    def _step(model, y, offset, kv_cache):
        logits = model(y[None], offset, kv_cache)
        logits = logits[:, -1, :]
        logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
        return mx.argmax(logprobs, axis=-1)

    # prefill with the prompt
    tokens = mx.array(tokenizer.encode(prompt, add_special_tokens=False))
    offset = 0
    prefill_max = 64
    total_tokens = tokens.size
    while tokens.size > prefill_max:
        # The sampled token is discarded; the call only fills the cache.
        _step(model, tokens[:prefill_max], offset, kv_cache)
        for layer_cache in kv_cache:
            # Force evaluation so the cache holds real data before the next chunk.
            mx.eval(layer_cache.key_values[0])
            mx.eval(layer_cache.key_values[1])
        offset += prefill_max
        tokens = tokens[prefill_max:]
        print(f"Prefill progress: {offset}/{total_tokens}", flush=True)
    detokenizer = tokenizer.detokenizer
    detokenizer.reset()
    # generate/decode
    while True:
        token = _step(model, tokens, offset, kv_cache)
        mx.eval(token)
        token_id = token.item()
        if token_id == tokenizer.eos_token_id:
            break
        detokenizer.add_token(token_id)
        print(detokenizer.last_segment, end="", flush=True)
        # Only the new token is fed next; everything before it is in the cache.
        offset += tokens.size
        tokens = token
    # Flush text the streaming detokenizer was still holding back.
    detokenizer.finalize()
    print(detokenizer.last_segment, end="", flush=True)
    return detokenizer.text


def _step(model, y, offsets, kv_cache):
    """
    Run one greedy decoding step for a batch.

    Calls `model(y, offsets, kv_cache)`, keeps the logits of the last position
    of every row and returns the argmax token id per row.
    """
    logits = model(y, offsets, kv_cache)
    logits = logits[:, -1, :]
    # Normalize each row on its own: without an axis the reduction would run
    # over the whole batch and the values would not be per-row log-probabilities.
    logprobs = logits - mx.logsumexp(logits, axis=-1, keepdims=True)
    return mx.argmax(logprobs, axis=-1)


class _PrefillRequest:
    """
    Chunked prefill of a single prompt into its own set of per-layer caches.

    Each call to `prefill` pushes at most `max_step` prompt tokens through the
    model and advances `offset` accordingly.
    """

    def __init__(
        self, model: any, tokenizer: TokenizerWrapper, prompt: str, max_step: int = 128
    ):
        self.prompt = prompt
        self.model = model
        self.max_step = max_step
        self.offset = 0
        self.kv_cache = [TinyKvFullCache() for _ in range(model.num_hidden_layers)]
        self.prefill_tokens = mx.array(
            tokenizer.encode(prompt, add_special_tokens=False)
        )

    def prefill(self):
        """
        Prefill the next chunk of the prompt.

        Returns None while prompt tokens remain; once the whole prompt has been
        consumed, returns (next_token, kv_cache, offset).
        """
        start = self.offset
        chunk_len = min(self.max_step, self.prefill_tokens.size - start)
        stop = start + chunk_len
        token = _step(
            self.model,
            self.prefill_tokens[start:stop][None],
            [start],
            self.kv_cache,
        )
        self.offset = stop
        # Force evaluation of every layer's cache before the next chunk.
        for layer_cache in self.kv_cache:
            mx.eval(layer_cache.key_values[0])
            mx.eval(layer_cache.key_values[1])
        if self.offset != self.prefill_tokens.size:
            return None
        mx.eval(token)
        return token, self.kv_cache, self.offset


def _print_progress(
    detokenizers: list[TokenizerWrapper],
    prompt_idx: list[int],
    is_idle: list[bool],
    pending_prefill_requests: _PrefillRequest | None,
):
    """
    Print one status line per decode slot followed by one line for prefill.

    Busy slots show the prompt index and the text decoded so far; idle slots
    and a missing prefill request are reported as idle.
    """
    for i, _ in enumerate(detokenizers):
        if not is_idle[i]:
            print(f"Decode {i}[{prompt_idx[i]}]: {detokenizers[i].text}", flush=True)
            continue
        print(f"Decode {i}: idle", flush=True)

    if pending_prefill_requests is None:
        print("Prefill: idle", flush=True)
        return
    print(
        f"Prefill {pending_prefill_requests.offset}/{pending_prefill_requests.prefill_tokens.size}",
        flush=True,
    )


def batch_generate(
    model: any,
    tokenizer: TokenizerWrapper,
    prompts: list[str],
    max_seq_len=512,
    batch_size=5,
    prefill_step=128,
):
    """Generate completions for many prompts with continuous batching.

    At most one prompt is prefilled at a time, in chunks of ``prefill_step``
    tokens. Once its prefill finishes, the request is moved into a free
    decode slot, and all slots are decoded together one token per iteration.

    Args:
        model: Model exposing ``num_hidden_layers`` and callable through
            ``_step``.
        tokenizer: Tokenizer used to encode prompts, to build one detokenizer
            per slot and to provide ``eos_token_id``.
        prompts: Prompts to process, in order.
        max_seq_len: Size of the batched KV cache along the sequence axis. A
            request also finishes once its offset reaches this value.
        batch_size: Number of decode slots.
        prefill_step: Maximum number of prompt tokens per prefill call.

    Returns:
        A list of ``(prompt index, generated text)`` tuples in completion
        order. A prompt whose first generated token is EOS is skipped and
        produces no entry.
    """
    is_idle = [True] * batch_size
    prompt_idx = [0] * batch_size
    next_tokens = mx.array([0] * batch_size)
    offsets = mx.array([0] * batch_size)
    detokenizers = [None] * batch_size
    kv_cache = [
        BatchingKvCache(max_active_requests=batch_size, max_seq_len=max_seq_len)
        for _ in range(model.num_hidden_layers)
    ]
    result = []
    pending_prefill_requests = None

    print(f"Processing {len(prompts)} prompts")
    prompts = enumerate(prompts)
    more_prompts = True
    while True:
        if not more_prompts and all(is_idle):
            break

        # Start a new prefill only when a decode slot is free and no other
        # prefill is in flight, so a finished prefill always finds a slot.
        if any(is_idle) and more_prompts and pending_prefill_requests is None:
            try:
                idx, prompt = next(prompts)
            except StopIteration:
                more_prompts = False
            else:
                pending_prefill_requests = _PrefillRequest(
                    model, tokenizer, prompt, prefill_step
                )

        if pending_prefill_requests is not None:
            res = pending_prefill_requests.prefill()
            if res is not None:
                pending_prefill_requests = None
                token, prefill_kv_cache, offset = res
                first_token = token.item()

                if first_token == tokenizer.eos_token_id:
                    # if the first token is eos, we skip this prompt
                    continue

                # Move the request into the first idle decode slot.
                for i in range(batch_size):
                    if not is_idle[i]:
                        continue
                    detokenizers[i] = tokenizer.detokenizer.__class__(
                        tokenizer._tokenizer
                    )
                    detokenizers[i].add_token(first_token)
                    prompt_idx[i] = idx
                    is_idle[i] = False
                    for prefill_cache, batch_cache in zip(prefill_kv_cache, kv_cache):
                        batch_cache.add_request(prefill_cache, i)
                    next_tokens[i] = token
                    offsets[i] = offset
                    break

        if not all(is_idle):
            next_tokens = mx.array(next_tokens)
            # decode one token for every slot (idle slots are computed too,
            # but their output is ignored below)
            next_tokens = _step(model, next_tokens.reshape(-1, 1), offsets, kv_cache)
            offsets += 1
            for i in range(batch_size):
                if is_idle[i]:
                    continue
                token_id = next_tokens[i].item()
                detokenizers[i].add_token(token_id)
                if token_id == tokenizer.eos_token_id or offsets[i] >= max_seq_len:
                    print(
                        f"(Finished) {prompt_idx[i]}: " + detokenizers[i].text,
                        flush=True,
                    )
                    result.append((prompt_idx[i], detokenizers[i].text))
                    print(f"Removing request {i}", flush=True)
                    # Release the slot in every layer's cache, not only in
                    # whichever layer a previous loop happened to leave bound.
                    for layer_cache in kv_cache:
                        layer_cache.remove_request(i)
                    is_idle[i] = True
                else:
                    print(
                        f"(In Progress) {prompt_idx[i]}: " + detokenizers[i].text,
                        flush=True,
                    )
        _print_progress(detokenizers, prompt_idx, is_idle, pending_prefill_requests)
    return result

```

## /src/tiny_llm_ref/kv_cache.py

```py path="/src/tiny_llm_ref/kv_cache.py" 
from typing import Optional

import mlx.core as mx


class TinyKvCache:
    """Base type for the KV caches in this module."""

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        """Do nothing and return ``None``; subclasses provide the behaviour."""
        return None


class BatchingKvCache(TinyKvCache):
    """Fixed-size KV cache shared by several concurrently decoded requests.

    Keys and values live in two buffers of shape
    ``(max_active_requests, H, max_seq_len, D)`` that are allocated on the
    first ``add_request`` call. Axis 2 is used as a ring: ``self.head`` is the
    next write position and is shared by every request slot.
    """

    def __init__(self, max_active_requests: int, max_seq_len: int):
        self.max_active_requests = max_active_requests
        self.max_seq_len = max_seq_len
        # (keys, values) buffers; stays None until the first add_request.
        self.key_values = None
        # One running token count per request slot.
        self.head_offsets = mx.array([0] * max_active_requests)
        # Next write position along axis 2, shared by all slots.
        self.head = 0

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        """Write ``L`` new entries for every slot and return the buffers.

        Must be called after ``add_request`` has allocated the buffers;
        otherwise unpacking ``self.key_values`` fails.

        Returns:
            ``(keys, values, head_offsets)``. Note that the third element is
            the ``head_offsets`` array, not an ``int`` as annotated.
        """
        B, H, L, D = key.shape
        assert key.shape == value.shape
        assert L <= self.max_seq_len
        keys, values = self.key_values
        if self.head + L <= self.max_seq_len:
            # The new entries fit before the end of the ring.
            keys[:, :, self.head : self.head + L, :] = key
            values[:, :, self.head : self.head + L, :] = value
            self.head += L
            # Every slot's count advances, including slots with no request.
            self.head_offsets += L
        else:
            # Split the write: fill up to the end, then wrap to the start.
            fill_size = self.max_seq_len - self.head
            keys[:, :, self.head : self.max_seq_len, :] = key[:, :, :fill_size, :]
            values[:, :, self.head : self.max_seq_len, :] = value[:, :, :fill_size, :]
            remaining_size = L - fill_size
            keys[:, :, :remaining_size, :] = key[:, :, fill_size:, :]
            values[:, :, :remaining_size, :] = value[:, :, fill_size:, :]
            self.head = remaining_size
            self.head_offsets += L
        self.key_values = (keys, values)

        # NOTE(review): concatenating [:head] followed by [head:] reproduces
        # the buffer in its stored order, so no rotation happens here.
        # Confirm whether [head:] followed by [:head] was intended.
        before_keys = keys[:, :, self.head :, :]
        before_values = values[:, :, self.head :, :]
        after_keys = keys[:, :, : self.head, :]
        after_values = values[:, :, : self.head, :]
        keys = mx.concat([after_keys, before_keys], axis=2)
        values = mx.concat([after_values, before_values], axis=2)
        return keys, values, self.head_offsets

    def add_request(self, prefilled: TinyKvCache, id: int):
        """Copy a prefilled single-request cache into slot ``id``.

        The prefilled entries are placed so that they end at ``self.head``,
        wrapping around the ring when necessary. If the prefilled cache is
        longer than ``max_seq_len`` only its last ``max_seq_len`` entries are
        kept.

        Raises:
            ValueError: If ``id`` is not below ``max_active_requests``.
        """
        if id >= self.max_active_requests:
            raise ValueError(f"Request id {id} is out of range")
        keys, values = prefilled.key_values
        B, H, L, D = keys.shape
        assert B == 1
        if self.key_values is None:
            # Allocate lazily; H and D are only known from the first request.
            self.key_values = (
                mx.zeros((self.max_active_requests, H, self.max_seq_len, D)),
                mx.zeros((self.max_active_requests, H, self.max_seq_len, D)),
            )
        if L > self.max_seq_len:
            keys = keys[:, :, -self.max_seq_len :, :]
            values = values[:, :, -self.max_seq_len :, :]
            take_size = self.max_seq_len
        else:
            take_size = L
        cached_keys, cached_values = self.key_values
        # Clear whatever a previous request left in this slot.
        cached_keys[id, :, :, :] = 0
        cached_values[id, :, :, :] = 0
        # Then, fill the cache with the prefilled values up to self.head (may wrap)
        start_pos = (self.head - take_size + self.max_seq_len) % self.max_seq_len
        if start_pos + take_size <= self.max_seq_len:
            cached_keys[id, :, start_pos : start_pos + take_size, :] = keys[0, :, :, :]
            cached_values[id, :, start_pos : start_pos + take_size, :] = values[
                0, :, :, :
            ]
        else:
            # First part goes to the end of the ring, the rest to its start.
            cached_keys[id, :, start_pos : self.max_seq_len, :] = keys[
                0, :, : self.max_seq_len - start_pos, :
            ]
            cached_values[id, :, start_pos : self.max_seq_len, :] = values[
                0, :, : self.max_seq_len - start_pos, :
            ]
            cached_keys[id, :, : take_size - (self.max_seq_len - start_pos), :] = keys[
                0, :, self.max_seq_len - start_pos :, :
            ]
            cached_values[id, :, : take_size - (self.max_seq_len - start_pos), :] = (
                values[0, :, self.max_seq_len - start_pos :, :]
            )
        # Records the full prefilled length, even when it was truncated above.
        self.head_offsets[id] = L
        self.key_values = (cached_keys, cached_values)

    def remove_request(self, id: int):
        """Zero the key/value rows of slot ``id``.

        ``head_offsets[id]`` is left unchanged.

        Raises:
            ValueError: If the buffers have not been allocated yet.
        """
        if self.key_values is None:
            raise ValueError(f"Request id {id} is not in the cache")
        cached_keys, cached_values = self.key_values
        cached_keys[id, :, :, :] = 0
        cached_values[id, :, :, :] = 0


class TinyKvFullCache(TinyKvCache):
    """Cache that keeps every key/value by concatenating along axis 2."""

    def __init__(self):
        self.key_values = None
        # Number of positions stored so far.
        self.offset = 0

    def update_and_fetch(
        self, key: mx.array, value: mx.array
    ) -> tuple[mx.array, mx.array, int]:
        """Append ``key``/``value`` and return everything cached so far.

        Returns:
            ``(keys, values, start)`` where ``start`` is the number of
            positions that were cached before this call.
        """
        if self.key_values is not None:
            B, H, S, D = key.shape
            assert key.shape == value.shape
            cached_keys, cached_values = self.key_values
            expected_shape = (B, H, self.offset, D)
            assert cached_keys.shape == expected_shape
            assert cached_values.shape == expected_shape
            merged_keys = mx.concat([cached_keys, key], axis=2)
            merged_values = mx.concat([cached_values, value], axis=2)
            self.key_values = (merged_keys, merged_values)
            start = self.offset
            self.offset = start + S
            return merged_keys, merged_values, start

        # First call: adopt the inputs as the cache contents.
        assert self.offset == 0
        self.key_values = (key, value)
        _, _, seq_len, _ = key.shape
        self.offset = seq_len
        return key, value, 0


class TinyKvRotatingCache(TinyKvCache):
    """KV cache backed by a fixed ``max_seq_len`` ring along axis 2.

    Unlike ``TinyKvCache.update_and_fetch``, this class's method takes an
    extra ``offset`` argument and returns a 2-tuple.
    """

    def __init__(self, max_seq_len: int):
        self.max_seq_len = max_seq_len
        # (keys, values) buffers; allocated on the first update.
        self.key_values = None
        # Next write position inside the ring.
        self.head = 0
        # Total number of positions written so far.
        self.head_offset = 0

    def update_and_fetch(
        self, key: mx.array, value: mx.array, offset: int
    ) -> tuple[mx.array, mx.array]:
        """Write ``L`` new entries at the head and return the cached tensors.

        ``offset`` must be 0 on the first call and equal to the number of
        positions written so far on later calls (both are asserted).
        """
        if self.key_values is None:
            assert offset == 0
            B, H, L, D = key.shape
            assert L <= self.max_seq_len
            # Allocate zeroed buffers and copy the first chunk to the front.
            keys = mx.zeros((B, H, self.max_seq_len, D))
            values = mx.zeros((B, H, self.max_seq_len, D))
            keys[:, :, :L, :] = key
            values[:, :, :L, :] = value
            self.key_values = (keys, values)
            self.head = L
            self.head_offset = L
            return keys[:, :, :L, :], values[:, :, :L, :]
        else:
            B, H, L, D = key.shape
            assert key.shape == value.shape
            assert offset == self.head_offset
            assert L <= self.max_seq_len
            keys, values = self.key_values
            if self.head + L <= self.max_seq_len:
                # The new entries fit before the end of the ring.
                keys[:, :, self.head : self.head + L, :] = key
                values[:, :, self.head : self.head + L, :] = value
                self.head += L
                self.head_offset += L
            else:
                # Split the write: fill up to the end, then wrap to the start.
                fill_size = self.max_seq_len - self.head
                keys[:, :, self.head : self.max_seq_len, :] = key[:, :, :fill_size, :]
                values[:, :, self.head : self.max_seq_len, :] = value[
                    :, :, :fill_size, :
                ]
                remaining_size = L - fill_size
                keys[:, :, :remaining_size, :] = key[:, :, fill_size:, :]
                values[:, :, :remaining_size, :] = value[:, :, fill_size:, :]
                self.head = remaining_size
                self.head_offset += L
            self.key_values = (keys, values)
            if self.head_offset < self.max_seq_len:
                # The ring has not filled up yet: return only the written part.
                return keys[:, :, : self.head_offset, :], values[
                    :, :, : self.head_offset, :
                ]
            else:
                # NOTE(review): head_offset >= max_seq_len in this branch, so
                # the "before" slices are empty and the result is the whole
                # buffer in stored order. BatchingKvCache slices by self.head
                # instead; confirm which index and which order are intended.
                before_keys = keys[:, :, self.head_offset :, :]
                before_values = values[:, :, self.head_offset :, :]
                after_keys = keys[:, :, : self.head_offset, :]
                after_values = values[:, :, : self.head_offset, :]
                keys = mx.concat([after_keys, before_keys], axis=2)
                values = mx.concat([after_values, before_values], axis=2)
                return keys, values

```

## /src/tiny_llm_ref/layer_norm.py

```py path="/src/tiny_llm_ref/layer_norm.py" 
import mlx.core as mx


class RMSNorm:
    """Root-mean-square normalisation over the last axis, computed in float32."""

    def __init__(self, dim: int, weight: mx.array, eps: float = 1e-5):
        self.dim = dim
        self.eps = eps
        # The scale is kept in float32 to match the precision of the maths.
        self.weight = weight.astype(mx.float32)

    def __call__(self, x: mx.array) -> mx.array:
        """Normalise ``x`` and return it in its original dtype."""
        input_dtype = x.dtype
        x32 = x.astype(mx.float32)
        mean_square = mx.mean(mx.square(x32), axis=-1, keepdims=True)
        inv_rms = mx.rsqrt(mean_square + self.eps)
        scaled = self.weight * x32 * inv_rms
        return scaled.astype(input_dtype)

```

## /src/tiny_llm_ref/positional_encoding.py

```py path="/src/tiny_llm_ref/positional_encoding.py" 
import mlx.core as mx


class RoPE:
    """Rotary positional encoding with cos/sin tables built at construction."""

    def __init__(
        self,
        dims: int,
        seq_len: int,
        base: int = 10000,
        traditional: bool = False,
    ):
        assert dims % 2 == 0, "dims must be even"
        half_dims = dims // 2
        self.dims = dims
        self.seq_len = seq_len
        self.base = base
        self.half_dims = half_dims
        self.traditional = traditional
        # angles[p, j] = p * base ** (-j / half_dims)
        exponents = mx.arange(0, half_dims, dtype=mx.float32) / half_dims
        inv_freq = mx.power(base, -exponents)
        positions = mx.arange(seq_len)
        angles = mx.outer(positions, inv_freq)
        self.cos_freqs = mx.cos(angles)
        self.sin_freqs = mx.sin(angles)

    def __call__(
        self, x: mx.array, offset: list[slice] | slice | None = None
    ) -> mx.array:
        """Rotate ``x`` of shape ``(N, S, H, D)`` by its token positions.

        Args:
            x: Input array with four axes.
            offset: ``None`` to use positions ``0..S-1``, a slice of length
                ``S`` shared by the batch, or a list of ``N`` such slices
                (one per batch row).
        """
        N, S, H, D = x.shape
        out_dtype = x.dtype

        if isinstance(offset, slice):
            assert offset.stop - offset.start == S, f"offset must be of length {S}"
        elif isinstance(offset, list):
            assert len(offset) == N, (
                f"offsets must have the same length as batch size {N}"
            )
            for o in offset:
                assert o.stop - o.start == S, f"offset must be of length {S}"
            # Expand each slice into explicit row indices: shape (N, S).
            offset = mx.array([list(range(i.start, i.stop)) for i in offset])

        if offset is None:
            cos_basis = self.cos_freqs[:S, :]
            sin_basis = self.sin_freqs[:S, :]
        else:
            cos_basis = self.cos_freqs[offset, :]
            sin_basis = self.sin_freqs[offset, :]
        # basis: (1 or N, S, 1, half_dims) so it broadcasts over the heads
        cos_basis = cos_basis.reshape(-1, S, 1, self.half_dims)
        sin_basis = sin_basis.reshape(-1, S, 1, self.half_dims)

        if self.traditional:
            # Pairs are interleaved: (N, S, H, half_dims, 2).
            pairs = x.reshape(N, S, H, self.half_dims, 2)
            x1 = pairs[..., 0]
            x2 = pairs[..., 1]
        else:
            # Pairs are the two halves of the last axis.
            x1 = x[..., 0 : self.half_dims]
            x2 = x[..., self.half_dims : self.dims]

        # (x1 + i*x2) * (cos + i*sin), written out by hand
        real = x1 * cos_basis - x2 * sin_basis
        imag = x2 * cos_basis + x1 * sin_basis

        if self.traditional:
            y = mx.stack([real, imag], axis=-1)
        else:
            y = mx.concat([real, imag], axis=-1)
        return y.reshape(N, S, H, D).astype(out_dtype)

```

## /src/tiny_llm_ref/quantize.py

```py path="/src/tiny_llm_ref/quantize.py" 
import mlx.core as mx
from typing import Any
from extensions_ref import tiny_llm_ext_ref


class QuantizedWeights:
    """Plain container for a quantized weight and its quantization parameters."""

    def __init__(
        self,
        scales: mx.array,
        biases: mx.array,
        group_size: int,
        bits: int,
        weight: mx.array,
    ):
        self.scales, self.biases = scales, biases
        self.group_size, self.bits = group_size, bits
        self.weight = weight

    @staticmethod
    def from_mlx_layer(mlx_layer: Any) -> "QuantizedWeights":
        """Build an instance from the same-named attributes of ``mlx_layer``."""
        field_names = ("scales", "biases", "group_size", "bits", "weight")
        values = {name: getattr(mlx_layer, name) for name in field_names}
        return QuantizedWeights(**values)


def quantized_linear(
    x: mx.array,
    w: QuantizedWeights,
    bias: mx.array | None = None,
) -> mx.array:
    """Apply a quantized linear layer: ``x`` times the transposed weight, plus ``bias``.

    Args:
        x: Input activations.
        w: Quantized weight and its quantization parameters.
        bias: Optional bias added to the product.
    """
    product = quantized_matmul(
        w.scales, w.biases, w.group_size, w.bits, x, w.weight, True
    )
    if bias is None:
        return product
    return product + bias


def dequantize_linear(mx_layer: Any) -> mx.array:
    """Return the dequantized weight of ``mx_layer``.

    Reads ``weight``, ``scales``, ``biases``, ``group_size`` and ``bits``
    from the layer and passes them to ``mx.dequantize``.
    """
    layer = mx_layer
    return mx.dequantize(
        layer.weight, layer.scales, layer.biases, layer.group_size, layer.bits
    )


def quantized_matmul(
    scales: mx.array,
    biases: mx.array,
    group_size: int,
    bits: int,
    a: mx.array,
    b: mx.array,
    transpose_b: bool = False,
) -> mx.array:
    """Multiply ``a`` by the quantized matrix ``b`` using the extension kernel.

    ``a`` is flattened to two dimensions for the kernel call, and the result
    is reshaped back to ``a``'s leading dimensions.
    """
    *leading_dims, inner_dim = a.shape
    flat_a = mx.contiguous(a.reshape(-1, inner_dim))
    packed_b = mx.contiguous(b)
    flat_out = tiny_llm_ext_ref.quantized_matmul(
        scales, biases, group_size, bits, flat_a, packed_b, transpose_b
    )
    return flat_out.reshape(*leading_dims, -1)

```

## /src/tiny_llm_ref/qwen2_week1.py

```py path="/src/tiny_llm_ref/qwen2_week1.py" 
import mlx.core as mx
from .basics import linear, silu
from .attention import scaled_dot_product_attention_grouped
from .layer_norm import RMSNorm
from .positional_encoding import RoPE
from typing import Any
from .embedding import Embedding
from .quantize import dequantize_linear


class Qwen2MultiHeadAttention:
    """Grouped-query self-attention with RoPE on dense (dequantized) weights."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        assert hidden_size % num_heads == 0, (
            f"hidden_size {hidden_size} must be divisible by num_heads {num_heads}"
        )
        assert num_heads % num_kv_heads == 0, (
            f"num_heads {num_heads} must be divisible by num_kv_heads {num_kv_heads}"
        )
        self.head_dim = hidden_size // num_heads
        # Attention scores are scaled by 1 / sqrt(head_dim).
        self.scale = mx.rsqrt(self.head_dim)
        self.wq, self.wk, self.wv, self.wo = wq, wk, wv, wo
        self.bq, self.bk, self.bv = bq, bk, bv
        self.rope = RoPE(self.head_dim, max_seq_len, theta)

    def __call__(
        self,
        x: mx.array,
        offset: int,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        """Attend over ``x`` of shape ``(B, L, hidden_size)``.

        ``offset`` is the position of the first token of ``x``; ``mask`` is
        forwarded to the attention kernel.
        """
        B, L, _ = x.shape
        positions = slice(offset, offset + L)

        q = linear(x, self.wq, bias=self.bq).reshape(
            B, L, self.num_heads, self.head_dim
        )
        k = linear(x, self.wk, bias=self.bk).reshape(
            B, L, self.num_kv_heads, self.head_dim
        )
        v = linear(x, self.wv, bias=self.bv).reshape(
            B, L, self.num_kv_heads, self.head_dim
        )

        # Only queries and keys are rotated. Then move heads ahead of the
        # sequence axis: (B, heads, L, head_dim).
        q = self.rope(q, offset=positions).transpose(0, 2, 1, 3)
        k = self.rope(k, offset=positions).transpose(0, 2, 1, 3)
        v = v.transpose(0, 2, 1, 3)

        # The attention itself runs in float32.
        attended = scaled_dot_product_attention_grouped(
            q.astype(mx.float32),
            k.astype(mx.float32),
            v.astype(mx.float32),
            scale=self.scale,
            mask=mask,
        ).astype(x.dtype)
        merged = attended.transpose(0, 2, 1, 3).reshape(B, L, self.hidden_size)
        return linear(merged, self.wo)


class Qwen2MLP:
    """Gated feed-forward block: ``down(silu(gate(x)) * up(x))``."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        w_gate: mx.array,
        w_up: mx.array,
        w_down: mx.array,
    ):
        self.dim, self.hidden_dim = dim, hidden_dim
        self.w_gate, self.w_up, self.w_down = w_gate, w_up, w_down

    def __call__(self, x: mx.array) -> mx.array:
        """Apply the gated projection to ``x``."""
        gate = silu(linear(x, self.w_gate))
        up = linear(x, self.w_up)
        return linear(gate * up, self.w_down)


class Qwen2TransformerBlock:
    """One pre-norm transformer layer: attention and MLP, each with a residual."""

    def __init__(
        self,
        num_attention_heads: int,
        num_kv_heads: int,
        hidden_size: int,
        intermediate_size: int,
        rms_norm_eps: float,
        wq: mx.array,
        wk: mx.array,
        wv: mx.array,
        wo: mx.array,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        w_gate: mx.array,
        w_up: mx.array,
        w_down: mx.array,
        w_input_layernorm: mx.array,
        w_post_attention_layernorm: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.mlp = Qwen2MLP(hidden_size, intermediate_size, w_gate, w_up, w_down)
        self.input_layernorm = RMSNorm(hidden_size, w_input_layernorm, eps=rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            hidden_size, w_post_attention_layernorm, eps=rms_norm_eps
        )
        attention_kwargs = dict(
            num_heads=num_attention_heads,
            hidden_size=hidden_size,
            num_kv_heads=num_kv_heads,
            wq=wq,
            wk=wk,
            wv=wv,
            wo=wo,
            bq=bq,
            bk=bk,
            bv=bv,
            max_seq_len=max_seq_len,
            theta=theta,
        )
        self.self_attn = Qwen2MultiHeadAttention(**attention_kwargs)

    def __call__(
        self,
        x: mx.array,
        offset: int,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        """Run the layer on ``x``; ``offset`` and ``mask`` go to the attention."""
        attn_out = self.self_attn(self.input_layernorm(x), offset, mask)
        hidden = x + attn_out
        mlp_out = self.mlp(self.post_attention_layernorm(hidden))
        return hidden + mlp_out


class Qwen2ModelWeek1:
    """Qwen2 model built from an MLX model by dequantizing its weights to float16."""

    def __init__(
        self,
        mlx_model: Any,
    ):
        args = mlx_model.args
        precision = mx.float16

        def dense(proj):
            # Dequantize an MLX layer and cast it to the working precision.
            return dequantize_linear(proj).astype(precision)

        self.num_hidden_layers = args.num_hidden_layers
        self.hidden_size = args.hidden_size
        self.vocab_size = args.vocab_size
        self.precision = precision

        self.embedding = Embedding(
            vocab_size=self.vocab_size,
            embedding_dim=self.hidden_size,
            weight=dense(mlx_model.model.embed_tokens),
        )

        self.layers_inner = []
        for i in range(args.num_hidden_layers):
            source = mlx_model.model.layers[i]
            attn = source.self_attn
            mlp = source.mlp
            block = Qwen2TransformerBlock(
                num_attention_heads=args.num_attention_heads,
                num_kv_heads=args.num_key_value_heads,
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                rms_norm_eps=args.rms_norm_eps,
                wq=dense(attn.q_proj),
                wk=dense(attn.k_proj),
                wv=dense(attn.v_proj),
                wo=dense(attn.o_proj),
                bq=attn.q_proj.bias.astype(precision),
                bk=attn.k_proj.bias.astype(precision),
                bv=attn.v_proj.bias.astype(precision),
                w_gate=dense(mlp.gate_proj),
                w_up=dense(mlp.up_proj),
                w_down=dense(mlp.down_proj),
                w_input_layernorm=source.input_layernorm.weight.astype(precision),
                w_post_attention_layernorm=(
                    source.post_attention_layernorm.weight.astype(precision)
                ),
                max_seq_len=args.max_position_embeddings,
                theta=args.rope_theta,
            )
            self.layers_inner.append(block)

        self.norm = RMSNorm(
            args.hidden_size,
            weight=mlx_model.model.norm.weight.astype(precision),
            eps=args.rms_norm_eps,
        )
        # With tied embeddings the output projection reuses the embedding.
        # The separate head is dequantized but not cast to ``precision``.
        if args.tie_word_embeddings:
            self.w_lm_head = None
        else:
            self.w_lm_head = dequantize_linear(mlx_model.lm_head)
        self.mlx_model = mlx_model

    def __call__(
        self,
        inputs: mx.array,
        offset: int,
    ) -> mx.array:
        """Return logits for ``inputs``, whose first token sits at ``offset``."""
        h = self.embedding(inputs)
        # A causal mask is only requested when more than one token is processed.
        mask = "causal" if h.shape[1] > 1 else None
        for block in self.layers_inner:
            h = block(h, offset, mask=mask)
        h = self.norm(h)
        if self.w_lm_head is None:
            return self.embedding.as_linear(h)
        return linear(h, self.w_lm_head)

```

## /src/tiny_llm_ref/qwen2_week2.py

```py path="/src/tiny_llm_ref/qwen2_week2.py" 
import mlx.core as mx
from .basics import silu
from .attention import scaled_dot_product_attention_grouped
from .layer_norm import RMSNorm
from .positional_encoding import RoPE
from typing import Any
from .embedding import Embedding
from .quantize import dequantize_linear, QuantizedWeights, quantized_linear
from .kv_cache import TinyKvCache


class Qwen2MultiHeadAttention:
    """Grouped-query self-attention with RoPE, quantized weights and a KV cache."""

    def __init__(
        self,
        hidden_size: int,
        num_heads: int,
        num_kv_heads: int,
        wq: QuantizedWeights,
        wk: QuantizedWeights,
        wv: QuantizedWeights,
        wo: QuantizedWeights,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        self.hidden_size = hidden_size
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        assert hidden_size % num_heads == 0, (
            f"hidden_size {hidden_size} must be divisible by num_heads {num_heads}"
        )
        assert num_heads % num_kv_heads == 0, (
            f"num_heads {num_heads} must be divisible by num_kv_heads {num_kv_heads}"
        )
        self.head_dim = hidden_size // num_heads
        # Attention scores are scaled by 1 / sqrt(head_dim).
        self.scale = mx.rsqrt(self.head_dim)
        self.wq = wq
        self.wk = wk
        self.wv = wv
        self.wo = wo
        self.bq = bq
        self.bk = bk
        self.bv = bv
        self.rope = RoPE(self.head_dim, max_seq_len, theta)

    def __call__(
        self,
        x: mx.array,
        offsets: list[int],
        cache: TinyKvCache,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        """Attend over ``x`` of shape ``(B, L, hidden_size)`` using ``cache``.

        Args:
            x: Input activations.
            offsets: Start position of each batch row, as an iterable with
                one entry per row. A single ``int`` is also accepted and is
                applied to every row.
            cache: KV cache; the new keys/values are pushed into it and the
                tensors it returns are used for attention.
            mask: Forwarded to the attention kernel.
        """
        B, L, _ = x.shape
        projection_q = quantized_linear(x, self.wq, bias=self.bq).reshape(
            B, L, self.num_heads, self.head_dim
        )
        projection_k = quantized_linear(x, self.wk, bias=self.bk).reshape(
            B, L, self.num_kv_heads, self.head_dim
        )
        projection_v = quantized_linear(x, self.wv, bias=self.bv).reshape(
            B, L, self.num_kv_heads, self.head_dim
        )
        if isinstance(offsets, int):
            # RoPE requires one slice per batch row, so a shared int offset
            # is replicated B times (a single slice only worked for B == 1).
            offset_slice = [slice(offsets, offsets + L)] * B
        else:
            offset_slice = [slice(int(i), int(i + L)) for i in offsets]
        # Only queries and keys are rotated.
        projection_q = self.rope(projection_q, offset=offset_slice)
        projection_k = self.rope(projection_k, offset=offset_slice)
        # Move heads ahead of the sequence axis: (B, heads, L, head_dim).
        projection_q = projection_q.transpose(0, 2, 1, 3)
        projection_k = projection_k.transpose(0, 2, 1, 3)
        projection_v = projection_v.transpose(0, 2, 1, 3)
        # Replace the fresh keys/values with whatever the cache returns.
        projection_k, projection_v, _ = cache.update_and_fetch(
            projection_k, projection_v
        )
        # The attention itself runs in float32.
        x = scaled_dot_product_attention_grouped(
            projection_q.astype(mx.float32),
            projection_k.astype(mx.float32),
            projection_v.astype(mx.float32),
            scale=self.scale,
            mask=mask,
        ).astype(x.dtype)
        x = x.transpose(0, 2, 1, 3).reshape(B, L, self.hidden_size)
        return quantized_linear(x, self.wo)


class Qwen2MLP:
    """Gated feed-forward block on quantized weights: ``down(silu(gate(x)) * up(x))``."""

    def __init__(
        self,
        dim: int,
        hidden_dim: int,
        w_gate: QuantizedWeights,
        w_up: QuantizedWeights,
        w_down: QuantizedWeights,
    ):
        self.dim, self.hidden_dim = dim, hidden_dim
        self.w_gate, self.w_up, self.w_down = w_gate, w_up, w_down

    def __call__(self, x: mx.array) -> mx.array:
        """Apply the gated projection to ``x``."""
        gate = silu(quantized_linear(x, self.w_gate))
        up = quantized_linear(x, self.w_up)
        return quantized_linear(gate * up, self.w_down)


class Qwen2TransformerBlock:
    """One pre-norm transformer layer with quantized weights and a KV cache."""

    def __init__(
        self,
        num_attention_heads: int,
        num_kv_heads: int,
        hidden_size: int,
        intermediate_size: int,
        rms_norm_eps: float,
        wq: QuantizedWeights,
        wk: QuantizedWeights,
        wv: QuantizedWeights,
        wo: QuantizedWeights,
        bq: mx.array,
        bk: mx.array,
        bv: mx.array,
        w_gate: QuantizedWeights,
        w_up: QuantizedWeights,
        w_down: QuantizedWeights,
        w_input_layernorm: mx.array,
        w_post_attention_layernorm: mx.array,
        max_seq_len: int = 32768,
        theta: int = 1000000,
    ):
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.mlp = Qwen2MLP(hidden_size, intermediate_size, w_gate, w_up, w_down)
        self.input_layernorm = RMSNorm(hidden_size, w_input_layernorm, eps=rms_norm_eps)
        self.post_attention_layernorm = RMSNorm(
            hidden_size, w_post_attention_layernorm, eps=rms_norm_eps
        )
        attention_kwargs = dict(
            num_heads=num_attention_heads,
            hidden_size=hidden_size,
            num_kv_heads=num_kv_heads,
            wq=wq,
            wk=wk,
            wv=wv,
            wo=wo,
            bq=bq,
            bk=bk,
            bv=bv,
            max_seq_len=max_seq_len,
            theta=theta,
        )
        self.self_attn = Qwen2MultiHeadAttention(**attention_kwargs)

    def __call__(
        self,
        x: mx.array,
        offset: int,
        cache: TinyKvCache,
        mask: mx.array | str | None = None,
    ) -> mx.array:
        """Run the layer on ``x``; ``offset``, ``cache`` and ``mask`` go to the attention."""
        attn_out = self.self_attn(self.input_layernorm(x), offset, cache, mask)
        hidden = x + attn_out
        mlp_out = self.mlp(self.post_attention_layernorm(hidden))
        return hidden + mlp_out


class Qwen2ModelWeek2:
    """Qwen2 decoder assembled from a loaded MLX model, keeping weights quantized.

    Projection weights are wrapped with `QuantizedWeights.from_mlx_layer`;
    the embedding table is dequantized; biases and norm weights are cast to
    float16.
    """

    def __init__(
        self,
        mlx_model: Any,
    ):
        """Copy configuration and weights out of `mlx_model`, layer by layer."""
        args = mlx_model.args
        self.num_hidden_layers = args.num_hidden_layers
        self.hidden_size = args.hidden_size
        self.vocab_size = args.vocab_size
        self.precision = mx.float16
        precision = self.precision

        embed_weight = dequantize_linear(mlx_model.model.embed_tokens)
        self.embedding = Embedding(
            vocab_size=self.vocab_size,
            embedding_dim=self.hidden_size,
            weight=embed_weight.astype(precision),
        )

        wrap = QuantizedWeights.from_mlx_layer
        self.layers_inner = []
        for i in range(args.num_hidden_layers):
            src_attn = mlx_model.model.layers[i].self_attn
            src_mlp = mlx_model.model.layers[i].mlp

            # Attention projections first, then the MLP projections.
            wq = wrap(src_attn.q_proj)
            wk = wrap(src_attn.k_proj)
            wv = wrap(src_attn.v_proj)
            wo = wrap(src_attn.o_proj)
            w_gate = wrap(src_mlp.gate_proj)
            w_up = wrap(src_mlp.up_proj)
            w_down = wrap(src_mlp.down_proj)

            block = Qwen2TransformerBlock(
                num_attention_heads=args.num_attention_heads,
                num_kv_heads=args.num_key_value_heads,
                hidden_size=args.hidden_size,
                intermediate_size=args.intermediate_size,
                rms_norm_eps=args.rms_norm_eps,
                wq=wq,
                wk=wk,
                wv=wv,
                wo=wo,
                bq=src_attn.q_proj.bias.astype(precision),
                bk=src_attn.k_proj.bias.astype(precision),
                bv=src_attn.v_proj.bias.astype(precision),
                w_gate=w_gate,
                w_up=w_up,
                w_down=w_down,
                w_input_layernorm=mlx_model.model.layers[
                    i
                ].input_layernorm.weight.astype(precision),
                w_post_attention_layernorm=mlx_model.model.layers[
                    i
                ].post_attention_layernorm.weight.astype(precision),
                max_seq_len=args.max_position_embeddings,
                theta=args.rope_theta,
            )
            self.layers_inner.append(block)

        self.norm = RMSNorm(
            args.hidden_size,
            weight=mlx_model.model.norm.weight.astype(precision),
            eps=args.rms_norm_eps,
        )
        # With tied embeddings there is no separate output head; __call__
        # then projects through the embedding table instead.
        self.w_lm_head = (
            None if args.tie_word_embeddings else wrap(mlx_model.lm_head)
        )
        self.mlx_model = mlx_model

    def __call__(
        self,
        inputs: mx.array,
        offset: int,
        cache: list[TinyKvCache],
    ) -> mx.array:
        """Embed `inputs`, run every layer with its own cache entry, and project to logits.

        A "causal" mask is requested whenever axis 1 of the hidden state is
        longer than one; otherwise no mask is passed.
        """
        h = self.embedding(inputs)
        for idx in range(self.num_hidden_layers):
            layer_mask = "causal" if h.shape[1] > 1 else None
            h = self.layers_inner[idx](h, offset, cache[idx], mask=layer_mask)
        h = self.norm(h)
        if self.w_lm_head is None:
            return self.embedding.as_linear(h)
        return quantized_linear(h, self.w_lm_head)

```

## /src/tiny_llm_ref/sampler.py

```py path="/src/tiny_llm_ref/sampler.py" 
import mlx.core as mx
import copy


def make_sampler(temp: float, top_p: float, top_k: int | None):
    """Build a sampling function over the last axis of a log-probability array.

    Args:
        temp: Sampling temperature. ``0`` selects greedy argmax decoding and
            skips all filtering.
        top_p: Nucleus threshold. When ``0 < top_p < 1`` only the smallest set
            of highest-probability tokens whose cumulative probability reaches
            ``top_p`` is kept. ``None``, ``<= 0`` or ``>= 1`` disables it.
        top_k: Keep only the ``top_k`` most likely tokens. ``None``, ``<= 0``
            or a value not smaller than the vocabulary size disables it.

    Returns:
        A function mapping ``logprobs`` to sampled token ids, reducing the
        last axis.
    """

    def sample(logprobs: mx.array):
        if temp == 0:
            return mx.argmax(logprobs, axis=-1)

        vocab_size = logprobs.shape[-1]
        if top_k is not None and 0 < top_k < vocab_size:
            # Indices of everything outside the k largest entries, per row.
            dropped = mx.argpartition(-logprobs, kth=top_k - 1, axis=-1)[
                ..., top_k:
            ]
            # put_along_axis masks each row with its own indices and returns
            # a new array, so the caller's array is never modified.
            logprobs = mx.put_along_axis(
                logprobs, dropped, mx.array(-mx.inf, logprobs.dtype), axis=-1
            )

        if top_p is not None and 0 < top_p < 1:
            order = mx.argsort(-logprobs, axis=-1)
            sorted_logprobs = mx.take_along_axis(logprobs, order, axis=-1)
            # The threshold is a probability, so accumulate probabilities
            # (renormalized over the tokens that survived top-k), not
            # log-probabilities.
            sorted_probs = mx.softmax(sorted_logprobs, axis=-1)
            # Mass strictly before each token: the most likely token always
            # has zero mass before it and is therefore always kept.
            mass_before = mx.cumsum(sorted_probs, axis=-1) - sorted_probs
            kept = mx.where(mass_before < top_p, sorted_logprobs, -mx.inf)
            logprobs = mx.put_along_axis(logprobs, order, kept, axis=-1)

        return mx.random.categorical(logprobs / temp, axis=-1)

    return sample

```

## /tests/.gitignore

```gitignore path="/tests/.gitignore" 
test_*.py

```

## /tests/tiny_llm_base.py

```py path="/tests/tiny_llm_base.py" 
from tiny_llm import *

```

## /tests/utils.py

```py path="/tests/utils.py" 
import numpy as np
import mlx.core as mx
import huggingface_hub

# MLX devices and their short labels; the two lists are index-aligned so they
# can be passed together as parametrize values and ids.
AVAILABLE_STREAMS = [mx.cpu, mx.gpu]
AVAILABLE_STREAMS_IDS = ["cpu", "gpu"]
# Floating-point dtypes and their short labels, index-aligned in the same way.
PRECISIONS = [mx.float32, mx.float16]
PRECISION_IDS = ["f32", "f16"]


def assert_allclose(
    a: mx.array,
    b: mx.array,
    precision: mx.Dtype,
    rtol: float | None = None,
    atol: float | None = None,
):
    """Assert that `a` and `b` have the same shape and are element-wise close.

    Args:
        a: First array (converted with `np.array`).
        b: Second array (converted with `np.array`).
        precision: Selects the default tolerances: float32 uses
            rtol=1e-5 / atol=1e-8, float16 uses rtol=3e-2 / atol=1e-5.
        rtol: Relative tolerance; `None` means "use the default". An explicit
            `0.0` is honored.
        atol: Absolute tolerance; `None` means "use the default". An explicit
            `0.0` is honored.

    Raises:
        ValueError: `precision` has no default tolerance and `rtol` or `atol`
            was not given.
        AssertionError: The shapes differ or the values are not close. On a
            value mismatch both arrays and the mismatching entries are
            printed first.
    """
    a = np.array(a)
    b = np.array(b)

    if precision == mx.float32:
        default_rtol, default_atol = 1.0e-5, 1.0e-8
    elif precision == mx.float16:
        default_rtol, default_atol = 3.0e-2, 1.0e-5
    else:
        default_rtol = default_atol = None
    # `is None` rather than `or`, so that a caller-supplied 0.0 is kept.
    if rtol is None:
        rtol = default_rtol
    if atol is None:
        atol = default_atol
    if rtol is None or atol is None:
        raise ValueError(
            f"No default tolerance for precision {precision}; "
            "pass rtol and atol explicitly"
        )

    if a.shape != b.shape:
        raise AssertionError(f"shape mismatch: {a.shape} vs {b.shape}")
    if not np.allclose(a, b, rtol=rtol, atol=atol):
        with np.printoptions(precision=3, suppress=True):
            print("a=", a)
            print("b=", b)
            mismatch = ~np.isclose(a, b, rtol=rtol, atol=atol)
            # np.where instead of multiplying by the mask: inf * 0 is NaN.
            print("diff_a=", np.where(mismatch, a, 0))
            print("diff_b=", np.where(mismatch, b, 0))
        raise AssertionError("result mismatch")


def np_type_to_mx_type(np_type: np.dtype) -> mx.Dtype:
    """Translate a NumPy float32/float16 type to the matching MLX dtype.

    Raises:
        ValueError: For any other NumPy type.
    """
    supported = ((np.float32, mx.float32), (np.float16, mx.float16))
    for numpy_type, mlx_type in supported:
        if np_type == numpy_type:
            return mlx_type
    raise ValueError(f"Unsupported numpy type: {np_type}")


def qwen_2_05b_model_exists() -> bool:
    """Return True if a local-only snapshot lookup of Qwen2-0.5B-Instruct-MLX succeeds.

    Any exception from the lookup is printed and reported as False.
    """
    repo_id = "Qwen/Qwen2-0.5B-Instruct-MLX"
    try:
        huggingface_hub.snapshot_download(repo_id, local_files_only=True)
    except Exception as err:
        print(f"Cannot find the Qwen2-0.5B-Instruct-MLX model: {err}")
        return False
    else:
        return True


def qwen_2_15b_model_exists() -> bool:
    """Return True if a local-only snapshot lookup of Qwen2-1.5B-Instruct-MLX succeeds.

    Any exception from the lookup is printed and reported as False.
    """
    repo_id = "Qwen/Qwen2-1.5B-Instruct-MLX"
    try:
        huggingface_hub.snapshot_download(repo_id, local_files_only=True)
    except Exception as err:
        print(f"Cannot find the Qwen2-1.5B-Instruct-MLX model: {err}")
        return False
    else:
        return True


def qwen_2_7b_model_exists() -> bool:
    """Return True if a local-only snapshot lookup of Qwen2-7B-Instruct-MLX succeeds.

    Any exception from the lookup is printed and reported as False.
    """
    repo_id = "Qwen/Qwen2-7B-Instruct-MLX"
    try:
        huggingface_hub.snapshot_download(repo_id, local_files_only=True)
    except Exception as err:
        print(f"Cannot find the Qwen2-7B-Instruct-MLX model: {err}")
        return False
    else:
        return True

```

## /tests_refsol/test_rope.py

```py path="/tests_refsol/test_rope.py" 
import pytest
import mlx.core as mx
from .tiny_llm_base import *
from .utils import *


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("traditional", [True, False])
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_rope_week2_batch_offset(
    stream: mx.Stream, traditional: bool, precision: mx.Dtype
):
    """Smoke test: call RoPE with a list of per-batch slices as the offset.

    The output is not compared against anything; the test only checks that
    the call completes.
    """
    batch_size, num_heads, head_dim = 1, 8, 4
    max_seq_len, seq_len, base = 20, 10, 10000
    x_shape = (batch_size, seq_len, num_heads, head_dim)
    with mx.stream(stream):
        for _ in range(100):
            rope = RoPE(head_dim, max_seq_len, base, traditional=traditional)
            x = mx.random.uniform(shape=x_shape, dtype=precision)
            # One slice per batch row; row b starts at position b.
            offsets = [slice(b, b + seq_len) for b in range(batch_size)]
            rope(x, offsets)

```

## /tests_refsol/test_week_1_day_1.py

```py path="/tests_refsol/test_week_1_day_1.py" 
import pytest
import mlx.core as mx
import mlx.nn as nn
from .tiny_llm_base import *
from .utils import *


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_1_softmax(stream: mx.Stream, precision: mx.Dtype):
    """Compare the user `softmax` with `mx.softmax` on random (10, 10) inputs."""
    input_shape = (10, 10)
    with mx.stream(stream):
        for _ in range(100):
            x = mx.random.uniform(shape=input_shape, dtype=precision)
            actual = softmax(x, axis=-1)
            expected = mx.softmax(x, axis=-1)
            assert_allclose(actual, expected, precision=precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
@pytest.mark.parametrize(
    "batch_dimension", [0, 1, 2], ids=["batch_0", "batch_1", "batch_2"]
)
def test_task_1_simple_attention(
    stream: mx.Stream, precision: mx.Dtype, batch_dimension: int
):
    """
    Compare `scaled_dot_product_attention_simple` (default scale, no mask) with
    `mx.fast.scaled_dot_product_attention`.
    Q/K/V share one shape; `batch_dimension` selects the leading batch dims.
    """
    leading_dims = {0: (), 1: (2, 3), 2: (2, 3, 3)}[batch_dimension]
    seq_len, head_dim = 4, 5
    qkv_shape = (*leading_dims, seq_len, head_dim)
    # All leading dims are folded into one axis so the reference call gets 4-D input.
    folded_shape = (1, -1, seq_len, head_dim)
    with mx.stream(stream):
        for _ in range(100):
            query = mx.random.uniform(shape=qkv_shape, dtype=precision)
            key = mx.random.uniform(shape=qkv_shape, dtype=precision)
            value = mx.random.uniform(shape=qkv_shape, dtype=precision)
            expected = mx.fast.scaled_dot_product_attention(
                q=query.reshape(*folded_shape),
                k=key.reshape(*folded_shape),
                v=value.reshape(*folded_shape),
                scale=1.0 / (head_dim**0.5),
            ).reshape(*qkv_shape)
            actual = scaled_dot_product_attention_simple(query, key, value)
            assert_allclose(actual, expected, precision=precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
@pytest.mark.parametrize(
    "batch_dimension", [0, 1, 2], ids=["batch_0", "batch_1", "batch_2"]
)
def test_task_1_simple_attention_scale_mask(
    stream: mx.Stream, precision: mx.Dtype, batch_dimension: int
):
    """
    Compare `scaled_dot_product_attention_simple` with an explicit scale (0.5)
    and a random additive mask against `mx.fast.scaled_dot_product_attention`.
    """
    leading_dims = {0: (), 1: (2, 3), 2: (2, 3, 3)}[batch_dimension]
    seq_len, head_dim = 4, 5
    qkv_shape = (*leading_dims, seq_len, head_dim)
    mask_shape = (*leading_dims, seq_len, seq_len)
    # All leading dims are folded into one axis so the reference call gets 4-D input.
    folded_qkv = (1, -1, seq_len, head_dim)
    folded_mask = (1, -1, seq_len, seq_len)
    scale = 0.5
    with mx.stream(stream):
        for _ in range(100):
            query = mx.random.uniform(shape=qkv_shape, dtype=precision)
            key = mx.random.uniform(shape=qkv_shape, dtype=precision)
            value = mx.random.uniform(shape=qkv_shape, dtype=precision)
            mask = mx.random.uniform(shape=mask_shape, dtype=precision)
            expected = mx.fast.scaled_dot_product_attention(
                q=query.reshape(*folded_qkv),
                k=key.reshape(*folded_qkv),
                v=value.reshape(*folded_qkv),
                scale=scale,
                mask=mask.reshape(*folded_mask),
            ).reshape(*qkv_shape)
            actual = scaled_dot_product_attention_simple(
                query, key, value, scale=scale, mask=mask
            )
            assert_allclose(actual, expected, precision=precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_2_linear(stream: mx.Stream, precision: mx.Dtype):
    """Compare the user `linear` with `mx.addmm(b, x, w.T)`.

    For float16 on the CPU stream the reference is marked unsupported: the
    user function is still called once, then the loop stops without comparing.
    """
    batch, out_dim, in_dim = 10, 10, 12
    with mx.stream(stream):
        reference_unsupported = precision == mx.float16 and stream == mx.cpu
        for _ in range(100):
            x = mx.random.uniform(shape=(batch, in_dim), dtype=precision)
            w = mx.random.uniform(shape=(out_dim, in_dim), dtype=precision)
            b = mx.random.uniform(shape=(out_dim,), dtype=precision)
            actual = linear(x, w, b)
            if reference_unsupported:
                # unsupported
                break
            expected = mx.addmm(b, x, w.T)
            assert_allclose(actual, expected, precision=precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_2_simple_multi_head_attention(stream: mx.Stream, precision: mx.Dtype):
    """
    Compare `SimpleMultiHeadAttention` with `nn.MultiHeadAttention` when both
    use the same projection weights. Q/K/V share one shape.
    """
    seq_len, head_dim, num_heads, batch = 11, 9, 3, 10
    embed_dim = num_heads * head_dim
    qkv_shape = (batch, seq_len, embed_dim)
    weight_shape = (embed_dim, embed_dim)
    with mx.stream(stream):
        for _ in range(100):
            query = mx.random.uniform(shape=qkv_shape, dtype=precision)
            key = mx.random.uniform(shape=qkv_shape, dtype=precision)
            value = mx.random.uniform(shape=qkv_shape, dtype=precision)
            wq = mx.random.uniform(shape=weight_shape, dtype=precision)
            wk = mx.random.uniform(shape=weight_shape, dtype=precision)
            wv = mx.random.uniform(shape=weight_shape, dtype=precision)
            wo = mx.random.uniform(shape=weight_shape, dtype=precision)
            mask = mx.random.uniform(shape=(seq_len, seq_len), dtype=precision)

            # Reference: the MLX built-in layer with its weights overwritten
            # by the ones generated above.
            reference = nn.MultiHeadAttention(embed_dim, num_heads)
            reference.query_proj.weight = wq
            reference.key_proj.weight = wk
            reference.value_proj.weight = wv
            reference.out_proj.weight = wo
            expected = reference(query, key, value, mask=mask)

            user_layer = SimpleMultiHeadAttention(
                embed_dim, num_heads, wq, wk, wv, wo
            )
            actual = user_layer(query, key, value, mask=mask)
            assert_allclose(actual, expected, precision=precision)

```

## /tests_refsol/test_week_1_day_2.py

```py path="/tests_refsol/test_week_1_day_2.py" 
import pytest
import mlx.core as mx
from .tiny_llm_base import *
import numpy as np
from .utils import *


def rope_helper(
    stream: mx.Stream,
    traditional: bool,
    precision: mx.Dtype,
    with_offset: bool,
):
    """Compare the user `RoPE` layer with `mx.fast.rope`.

    With `with_offset` a random start position is drawn each iteration; the
    user layer receives it as a slice and the reference as an integer offset.
    Without it the user layer receives `None` and the reference offset 0.
    """
    batch_size, num_heads, head_dim = 1, 8, 4
    max_seq_len, seq_len, base = 20, 10, 10000
    x_shape = (batch_size, seq_len, num_heads, head_dim)
    atol = 5e-6 if precision == mx.float32 else 1e-3
    with mx.stream(stream):
        for _ in range(100):
            rope = RoPE(head_dim, max_seq_len, base, traditional=traditional)
            x = mx.random.uniform(shape=x_shape, dtype=precision)

            if with_offset:
                start = np.random.randint(0, max_seq_len - seq_len)
                reference_offset = start
                user_offset = slice(start, start + seq_len)
            else:
                reference_offset = 0
                user_offset = None

            # mx.fast.rope is called with axes 1 and 2 swapped; swap back afterwards.
            expected = mx.fast.rope(
                x.transpose(0, 2, 1, 3),
                dims=head_dim,
                traditional=traditional,
                base=base,
                scale=1.0,
                offset=reference_offset,
            ).transpose(0, 2, 1, 3)
            actual = rope(x, user_offset)
            assert_allclose(actual, expected, precision, atol=atol)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize(
    "with_offset", [True, False], ids=["with_offset", "without_offset"]
)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_1_rope_mlx_traditional(
    stream: mx.Stream, with_offset: bool, precision: mx.Dtype
):
    """Run the RoPE comparison with `traditional=True`."""
    rope_helper(
        stream=stream,
        traditional=True,
        precision=precision,
        with_offset=with_offset,
    )


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize(
    "with_offset", [True, False], ids=["with_offset", "without_offset"]
)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_2_rope_mlx_non_traditional(
    stream: mx.Stream, with_offset: bool, precision: mx.Dtype
):
    """Run the RoPE comparison with `traditional=False`."""
    rope_helper(
        stream=stream,
        traditional=False,
        precision=precision,
        with_offset=with_offset,
    )

```

## /tests_refsol/test_week_1_day_3.py

```py path="/tests_refsol/test_week_1_day_3.py" 
import pytest
import mlx.core as mx
from .tiny_llm_base import *
from .utils import *


def grouped_attention_helper(
    stream: mx.Stream,
    precision: mx.Dtype,
    batch_dimension: int,
    scale: float | None,
    is_causal_mask: bool,
):
    """Compare `scaled_dot_product_attention_grouped` with the MLX fast kernel.

    Queries use 18 heads and keys/values 6. `batch_dimension` selects zero,
    one or two leading batch dims. With `is_causal_mask` both sides receive
    the string "causal"; otherwise both receive the same random mask.
    """
    q_heads, kv_heads = 18, 6
    q_len, kv_len, head_dim = 3, 7, 5
    leading_dims = {0: (), 1: (10,), 2: (2, 10)}[batch_dimension]
    q_shape = (*leading_dims, q_heads, q_len, head_dim)
    kv_shape = (*leading_dims, kv_heads, kv_len, head_dim)
    mask_shape = (*leading_dims, q_heads, q_len, kv_len)
    with mx.stream(stream):
        for _ in range(100):
            query = mx.random.uniform(shape=q_shape, dtype=precision)
            key = mx.random.uniform(shape=kv_shape, dtype=precision)
            value = mx.random.uniform(shape=kv_shape, dtype=precision)
            mask = mx.random.uniform(shape=mask_shape, dtype=precision)

            if is_causal_mask:
                reference_mask = "causal"
                user_mask = "causal"
            else:
                reference_mask = mask.reshape(-1, q_heads, q_len, kv_len)
                user_mask = mask
            reference_scale = scale if scale is not None else (1.0 / (head_dim**0.5))

            # Leading dims are folded into one axis for the reference call
            # and restored on its output.
            expected = mx.fast.scaled_dot_product_attention(
                q=query.reshape(-1, q_heads, q_len, head_dim),
                k=key.reshape(-1, kv_heads, kv_len, head_dim),
                v=value.reshape(-1, kv_heads, kv_len, head_dim),
                scale=reference_scale,
                mask=reference_mask,
            ).reshape(query.shape)
            actual = scaled_dot_product_attention_grouped(
                query,
                key,
                value,
                scale=scale,
                mask=user_mask,
            )

            assert_allclose(actual, expected, precision=precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
@pytest.mark.parametrize(
    "batch_dimension", [0, 1, 2], ids=["batch_0", "batch_1", "batch_2"]
)
@pytest.mark.parametrize("scale", [None, 0.8])
def test_task_1_grouped_attention(
    stream: mx.Stream, precision: mx.Dtype, batch_dimension: int, scale: float | None
):
    """Run the grouped-attention comparison with a random array mask."""
    grouped_attention_helper(
        stream=stream,
        precision=precision,
        batch_dimension=batch_dimension,
        scale=scale,
        is_causal_mask=False,
    )


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
def test_task_2_mask_only_same_dim(
    stream: mx.Stream,
):
    """Check `causal_mask(3, 3, float32)`: zeros on and below the diagonal, -inf above."""
    with mx.stream(stream):
        rows, cols = 3, 3
        actual = causal_mask(rows, cols, mx.float32)
        blocked = -mx.inf
        expected = mx.array(
            [
                [0, blocked, blocked],
                [0, 0, blocked],
                [0, 0, 0],
            ]
        )
        assert_allclose(actual, expected, precision=mx.float32)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
def test_task_2_mask_only_different_dim(
    stream: mx.Stream,
):
    """Check `causal_mask(3, 5, float32)`: the last row is fully open and each
    earlier row blocks one more trailing column."""
    with mx.stream(stream):
        rows, cols = 3, 5
        actual = causal_mask(rows, cols, mx.float32)
        blocked = -mx.inf
        expected = mx.array(
            [
                [0, 0, 0, blocked, blocked],
                [0, 0, 0, 0, blocked],
                [0, 0, 0, 0, 0],
            ]
        )
        assert_allclose(actual, expected, precision=mx.float32)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
@pytest.mark.parametrize(
    "batch_dimension", [0, 1, 2], ids=["batch_0", "batch_1", "batch_2"]
)
@pytest.mark.parametrize("scale", [None, 0.8])
def test_task_2_grouped_attention_causal_mask(
    stream: mx.Stream, precision: mx.Dtype, batch_dimension: int, scale: float | None
):
    """Run the grouped-attention comparison with the "causal" mask."""
    grouped_attention_helper(
        stream=stream,
        precision=precision,
        batch_dimension=batch_dimension,
        scale=scale,
        is_causal_mask=True,
    )


def test_task_3_qwen2_grouped_query_attention():
    """Placeholder: no checks are implemented for this task yet."""

```

## /tests_refsol/test_week_1_day_4.py

```py path="/tests_refsol/test_week_1_day_4.py" 
import pytest
import mlx.core as mx
import mlx.nn as nn
from .tiny_llm_base import *
from .utils import *
from mlx_lm.models import qwen2


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_1_rms_norm(
    stream: mx.Stream,
    precision: mx.Dtype,
):
    """Compare the user `RMSNorm` with `mx.fast.rms_norm` on random (100, 111) inputs."""
    rows, width = 100, 111
    with mx.stream(stream):
        for _ in range(100):
            data = mx.random.uniform(shape=(rows, width), dtype=precision)
            weight = mx.random.uniform(shape=(width,), dtype=precision)
            # Machine epsilon of the dtype under test is used as the norm's eps.
            eps = mx.finfo(precision).eps
            expected = mx.fast.rms_norm(data, weight, eps=eps)
            actual = RMSNorm(width, weight, eps=eps)(data)
            assert_allclose(actual, expected, precision)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
def test_task_1_rms_norm_cast_to_float32(stream: mx.Stream):
    """Compare `RMSNorm` with `mx.fast.rms_norm` on float16 inputs drawn from [-1000, 1000).

    NOTE(review): judging by the test name, this presumably targets
    implementations that do not accumulate in float32 - confirm.
    """
    dtype = mx.float16
    rows, width = 32, 64

    # Inputs are generated before the stream context is entered.
    data = mx.random.uniform(-1000, 1000, shape=(rows, width), dtype=dtype)
    weight = mx.random.uniform(-1000, 1000, shape=(width,), dtype=dtype)
    eps = mx.finfo(dtype).eps

    with mx.stream(stream):
        actual = RMSNorm(width, weight, eps=eps)(data)
        expected = mx.fast.rms_norm(data, weight, eps=eps)

    assert_allclose(actual, expected, dtype)


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
def test_task_2_silu(stream: mx.Stream, precision: mx.Dtype):
    """Compare the user `silu` with `nn.silu` on random (10, 10) inputs."""
    input_shape = (10, 10)
    with mx.stream(stream):
        for _ in range(100):
            x = mx.random.uniform(shape=input_shape, dtype=precision)
            actual = silu(x)
            expected = nn.silu(x)
            assert_allclose(actual, expected, precision=precision)


# Dimension sets for the MLP test; each entry's "id" is reused as its pytest id.
DIM_PARAMS = [
    dict(batch_size=1, seq_len=5, dim=4, hidden_dim=8, id="small_dims"),
    dict(batch_size=2, seq_len=16, dim=32, hidden_dim=64, id="large_dims"),
    dict(batch_size=1, seq_len=1, dim=128, hidden_dim=256, id="single_token"),
]
DIM_PARAMS_IDS = [params["id"] for params in DIM_PARAMS]


@pytest.mark.parametrize("stream", AVAILABLE_STREAMS, ids=AVAILABLE_STREAMS_IDS)
@pytest.mark.parametrize("precision", PRECISIONS, ids=PRECISION_IDS)
@pytest.mark.parametrize("dims", DIM_PARAMS, ids=DIM_PARAMS_IDS)
def test_task_2_qwen_mlp(stream: mx.Stream, precision: mx.Dtype, dims: dict):
    """Compare `qwen2_week1.Qwen2MLP` with mlx_lm's `qwen2.MLP` using shared weights."""
    batch_size = dims["batch_size"]
    seq_len = dims["seq_len"]
    dim = dims["dim"]
    hidden_dim = dims["hidden_dim"]

    with mx.stream(stream):
        x = mx.random.uniform(shape=(batch_size, seq_len, dim), dtype=precision)
        w_gate = mx.random.uniform(shape=(hidden_dim, dim), dtype=precision)
        w_up = mx.random.uniform(shape=(hidden_dim, dim), dtype=precision)
        w_down = mx.random.uniform(shape=(dim, hidden_dim), dtype=precision)

        user_mlp = qwen2_week1.Qwen2MLP(
            dim=dim, hidden_dim=hidden_dim, w_gate=w_gate, w_up=w_up, w_down=w_down
        )
        actual = user_mlp(x)

        # The reference layer has its projection weights replaced with the
        # ones given to the user layer.
        reference = qwen2.MLP(dim=dim, hidden_dim=hidden_dim)
        reference.gate_proj.weight = w_gate
        reference.up_proj.weight = w_up
        reference.down_proj.weight = w_down
        expected = reference(x)

        assert_allclose(actual, expected, precision)

```

## /tests_refsol/test_week_1_day_5.py

```py path="/tests_refsol/test_week_1_day_5.py" 
import pytest
from .utils import *
from .tiny_llm_base import Qwen2ModelWeek1, Embedding, dequantize_linear, qwen2_week1
from mlx_lm import load

# TODO: task 1 tests


@pytest.mark.skipif(
    not qwen_2_05b_model_exists(), reason="Qwen2-0.5B-Instruct-MLX model not found"
)
def test_utils_qwen_2_05b():
    """Empty test: reported as skipped when the 0.5B model lookup fails."""


@pytest.mark.skipif(
    not qwen_2_7b_model_exists(), reason="Qwen2-7B-Instruct-MLX model not found"
)
def test_utils_qwen_2_7b():
    """Empty test: reported as skipped when the 7B model lookup fails."""


@pytest.mark.skipif(
    not qwen_2_15b_model_exists(), reason="Qwen2-1.5B-Instruct-MLX model not found"
)
def test_utils_qwen_2_15b():
    """Empty test: reported as skipped when the 1.5B model lookup fails."""


def helper_test_task_3(model_name: str, iters: int = 10):
    """Compare `Qwen2ModelWeek1` logits with the loaded MLX model on random token ids.

    Both outputs have their logsumexp (taken with no axis argument)
    subtracted before the comparison.
    """
    mlx_model, tokenizer = load(model_name)
    user_model = Qwen2ModelWeek1(mlx_model)
    for _ in range(iters):
        token_ids = mx.random.randint(
            low=0, high=tokenizer.vocab_size, shape=(1, 10)
        )
        user_logits = user_model(token_ids, 0)
        user_logits = user_logits - mx.logsumexp(user_logits, keepdims=True)
        ref_logits = mlx_model(token_ids)
        ref_logits = ref_logits - mx.logsumexp(ref_logits, keepdims=True)
        assert_allclose(user_logits, ref_logits, precision=mx.float16, rtol=1e-1)


@pytest.mark.skipif(
    not qwen_2_05b_model_exists(), reason="Qwen2-0.5B-Instruct-MLX model not found"
)
def test_task_2_embedding_call():
    """Compare `Embedding.__call__` with the MLX model's `embed_tokens` lookup."""
    mlx_model, _ = load("Qwen/Qwen2-0.5B-Instruct-MLX")
    vocab_size = mlx_model.args.vocab_size
    table = dequantize_linear(mlx_model.model.embed_tokens).astype(mx.float16)
    embedding = Embedding(vocab_size, mlx_model.args.hidden_size, table)
    for _ in range(50):
        token_ids = mx.random.randint(
            low=0, high=mlx_model.args.vocab_size, shape=(1, 10)
        )
        actual = embedding(token_ids)
        expected = mlx_model.model.embed_tokens(token_ids)
        assert_allclose(actual, expected, precision=mx.float16)


@pytest.mark.skipif(
    not qwen_2_05b_model_exists(), reason="Qwen2-0.5B-Instruct-MLX model not found"
)
def test_task_2_embedding_as_linear():
    """Compare `Embedding.as_linear` with the MLX model's `embed_tokens.as_linear`."""
    mlx_model, _ = load("Qwen/Qwen2-0.5B-Instruct-MLX")
    vocab_size = mlx_model.args.vocab_size
    table = dequantize_linear(mlx_model.model.embed_tokens).astype(mx.float16)
    embedding = Embedding(vocab_size, mlx_model.args.hidden_size, table)
    for _ in range(50):
        hidden = mx.random.uniform(shape=(1, 10, mlx_model.args.hidden_size))
        actual = embedding.as_linear(hidden)
        expected = mlx_model.model.embed_tokens.as_linear(hidden)
        assert_allclose(actual, expected, precision=mx.float16, atol=1e-1)


@pytest.mark.skipif(
    not qwen_2_05b_model_exists(), reason="Qwen2-0.5B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_05b():
    """Run the full-model comparison on the 0.5B model for 5 iterations."""
    helper_test_task_3(model_name="Qwen/Qwen2-0.5B-Instruct-MLX", iters=5)


@pytest.mark.skipif(
    not qwen_2_7b_model_exists(), reason="Qwen2-7B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_7b():
    """Run the full-model comparison on the 7B model for 1 iteration."""
    helper_test_task_3(model_name="Qwen/Qwen2-7B-Instruct-MLX", iters=1)


@pytest.mark.skipif(
    not qwen_2_15b_model_exists(), reason="Qwen2-1.5B-Instruct-MLX model not found"
)
def test_task_3_qwen_2_15b():
    """Run the full-model comparison on the 1.5B model for 3 iterations."""
    helper_test_task_3(model_name="Qwen/Qwen2-1.5B-Instruct-MLX", iters=3)

```

## /tests_refsol/test_week_1_day_6.py

```py path="/tests_refsol/test_week_1_day_6.py" 
import pytest


@pytest.mark.skip(reason="No unit tests for week 1 day 6: use main.py instead")
def test_task_1():
    """Always skipped; the skip reason points to main.py."""

```

## /tests_refsol/test_week_1_day_7.py

```py path="/tests_refsol/test_week_1_day_7.py" 
import pytest


@pytest.mark.skip(reason="No unit tests for week 1 day 7: use main.py instead")
def test_task_1():
    """Always skipped; the skip reason points to main.py."""

```

## /tests_refsol/test_week_2_day_2.py

```py path="/tests_refsol/test_week_2_day_2.py" 
import pytest
import mlx.core as mx
from .tiny_llm_base import *
from .utils import *


def quantized_matmul_helper(
    stream: mx.Stream, identity_matrix: bool, precision: mx.Dtype
):
    """Compare the user `quantized_matmul` with `mx.quantized_matmul`.

    A random (5, 64) weight is quantized with `mx.quantize`; the left operand
    is a 64x64 identity when `identity_matrix` is set, otherwise a random
    (3, 64) matrix. Both calls use group_size=64, bits=4 and a transposed
    right operand.
    """
    group_size, bits = 64, 4
    with mx.stream(stream):
        lhs = (
            mx.eye(64, dtype=precision)
            if identity_matrix
            else mx.random.normal(shape=(3, 64), dtype=precision)
        )
        weight = mx.random.normal(shape=(5, 64), dtype=precision)
        w_q, scales, biases = mx.quantize(weight)
        actual = quantized_matmul(
            scales=scales,
            biases=biases,
            group_size=group_size,
            bits=bits,
            a=lhs,
            b=w_q,
            transpose_b=True,
        )
        expected = mx.quantized_matmul(
            lhs,
            w_q,
            scales,
            biases,
            group_size=group_size,
            bits=bits,
            transpose=True,
        )
        assert_allclose(actual, expected, precision)


def test_task_1_quantized_matmul_simple_f16_cpu():
    """Identity left operand, float16, CPU stream."""
    quantized_matmul_helper(
        stream=mx.cpu, identity_matrix=True, precision=mx.float16
    )


def test_task_1_quantized_matmul_complex_f16_cpu():
    """Random left operand, float16, CPU stream."""
    quantized_matmul_helper(
        stream=mx.cpu, identity_matrix=False, precision=mx.float16
    )


def test_task_2_quantized_matmul_simple_f16_gpu():
    """Identity left operand, float16, GPU stream."""
    quantized_matmul_helper(
        stream=mx.gpu, identity_matrix=True, precision=mx.float16
    )


def test_task_2_quantized_matmul_complex_f16_gpu():
    """Random left operand, float16, GPU stream."""
    quantized_matmul_helper(
        stream=mx.gpu, identity_matrix=False, precision=mx.float16
    )

```

## /tests_refsol/test_week_2_day_3.py

```py path="/tests_refsol/test_week_2_day_3.py" 
import pytest
import mlx.core as mx
from .tiny_llm_base import *
from .utils import *


def attention_helper(stream: mx.Stream, H_q, H, L, E, S, BATCH):
    """Compare the user `flash_attention` with `mx.fast.scaled_dot_product_attention`.

    Queries have shape (BATCH, H_q, L, E) and keys/values (BATCH, H, S, E),
    all float32, with scale 1.0 and no mask.
    """
    dtype = mx.float32
    scale = 1.0
    q_shape = (BATCH, H_q, L, E)
    kv_shape = (BATCH, H, S, E)
    with mx.stream(stream):
        for _ in range(100):
            query = mx.random.uniform(shape=q_shape, dtype=dtype)
            key = mx.random.uniform(shape=kv_shape, dtype=dtype)
            value = mx.random.uniform(shape=kv_shape, dtype=dtype)

            expected = mx.fast.scaled_dot_product_attention(
                q=query, k=key, v=value, scale=scale
            )
            actual = flash_attention(query, key, value, scale=scale)
            # Force evaluation here so that any error will be caught at this point.
            mx.eval(actual)
            assert_allclose(actual, expected, precision=dtype)


def test_flash_attention_cpu_small():
    """CPU stream, small shapes."""
    attention_helper(mx.cpu, H_q=6, H=3, L=2, E=5, S=3, BATCH=1)


def test_flash_attention_cpu():
    """CPU stream, medium shapes with a batch of 10."""
    attention_helper(mx.cpu, H_q=18, H=6, L=7, E=5, S=3, BATCH=10)


def test_flash_attention_cpu_large():
    """CPU stream, large shapes (last dimension 128)."""
    attention_helper(mx.cpu, H_q=28, H=4, L=16, E=128, S=16, BATCH=3)


def test_flash_attention_gpu_extra_small():
    """GPU stream, a single head and a single batch entry."""
    attention_helper(mx.gpu, H_q=1, H=1, L=5, E=7, S=4, BATCH=1)


def test_flash_attention_gpu_small():
    """GPU stream, small shapes."""
    attention_helper(mx.gpu, H_q=6, H=3, L=2, E=5, S=3, BATCH=1)


def test_flash_attention_gpu():
    """GPU stream, medium shapes with a batch of 10."""
    attention_helper(mx.gpu, H_q=18, H=6, L=7, E=5, S=3, BATCH=10)


def test_flash_attention_gpu_large():
    """GPU stream, large shapes (last dimension 128)."""
    attention_helper(mx.gpu, H_q=28, H=4, L=16, E=128, S=16, BATCH=3)

```

## /tests_refsol/tiny_llm_base.py

```py path="/tests_refsol/tiny_llm_base.py" 
from tiny_llm_ref import *

```

## /tests_refsol/utils.py

```py path="/tests_refsol/utils.py" 
../tests/utils.py
```


The better and more specific the context, the better the LLM can follow instructions. If the context seems verbose, the user can refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.