``` ├── .codestyle/ ├── copyright.hook ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── example/ ├── 24B/ ├── 24B_config.json ├── run.sh ├── 4.5B/ ├── 4.5B_config.json ├── run.sh ├── assets/ ├── image.jpeg ├── prefix_video.mp4 ├── special_tokens.npz ├── figures/ ├── algorithm.png ├── dit_architecture.png ├── inhouse_human_evaluation.png ├── logo_black.png ├── inference/ ├── common/ ├── __init__.py ├── common_utils.py ├── config.py ├── dataclass.py ├── logger.py ├── timer.py ├── infra/ ├── checkpoint/ ├── __init__.py ├── checkpointing.py ├── distributed/ ├── __init__.py ├── dist_utils.py ├── parallel_state.py ├── parallelism/ ├── __init__.py ├── context_parallel.py ├── pipeline_parallel.py ├── tile_parallel.py ├── model/ ├── dit/ ├── __init__.py ├── dit_model.py ├── dit_module.py ├── t5/ ├── __init__.py ``` ## /.codestyle/copyright.hook ```hook path="/.codestyle/copyright.hook" from __future__ import absolute_import from __future__ import print_function from __future__ import unicode_literals import argparse import io import re import sys import os import datetime COPYRIGHT = '''Copyright (c) 2025 SandAI. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.''' def _generate_copyright(comment_mark): copyright=COPYRIGHT.split(os.linesep) header = copyright[0].rstrip() p = re.search('(\d{4})', header).group(0) now = datetime.datetime.now() header = header.replace(p,str(now.year)) ans=[comment_mark + " " + header + os.linesep] for idx, line in enumerate(copyright[1:]): ans.append(comment_mark + " " + line.rstrip() + os.linesep) return ans def _get_comment_mark(path): lang_type=re.compile(r"\.(py|sh)$") if lang_type.search(path) is not None: return "#" lang_type=re.compile(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto)$") if lang_type.search(path) is not None: return "//" return None RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE) RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE) RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!") def _check_copyright(path): head=[] try: with open(path) as f: head = [next(f) for x in range(4)] except StopIteration: pass for idx, line in enumerate(head): if RE_COPYRIGHT.search(line) is not None: return True return False def generate_copyright(path, comment_mark): original_contents = io.open(path, encoding="utf-8").readlines() head = original_contents[0:4] insert_line_no=0 for i, line in enumerate(head): if RE_ENCODE.search(line) or RE_SHEBANG.search(line): insert_line_no=i+1 copyright = _generate_copyright(comment_mark) if insert_line_no == 0: new_contents = copyright if len(original_contents) > 0 and len(original_contents[0].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents) else: new_contents=original_contents[0:insert_line_no] new_contents.append(os.linesep) new_contents.extend(copyright) if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0: new_contents.append(os.linesep) new_contents.extend(original_contents[insert_line_no:]) 
new_contents="".join(new_contents) with io.open(path, 'w') as output_file: output_file.write(new_contents) def main(argv=None): parser = argparse.ArgumentParser( description='Checker for copyright declaration.') parser.add_argument('filenames', nargs='*', help='Filenames to check') args = parser.parse_args(argv) retv = 0 for path in args.filenames: comment_mark = _get_comment_mark(path) if comment_mark is None: print("warning:Unsupported file", path, file=sys.stderr) continue if _check_copyright(path): continue generate_copyright(path, comment_mark) if __name__ == '__main__': exit(main()) ``` ## /.gitignore ```gitignore path="/.gitignore" __pycache__ *.pyc *.log *.pt *.mp4 ckpt downloads ``` ## /.pre-commit-config.yaml ```yaml path="/.pre-commit-config.yaml" exclude: \.patch$ repos: - repo: local hooks: - id: copyright_checker name: copyright_checker entry: python3 ./.codestyle/copyright.hook language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ - repo: https://github.com/pre-commit/pre-commit-hooks rev: v4.4.0 hooks: - id: check-added-large-files args: - --maxkb=30720 - id: check-merge-conflict - id: check-symlinks - id: detect-private-key files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer - id: trailing-whitespace - id: requirements-txt-fixer - id: sort-simple-yaml - repo: https://github.com/Lucas-C/pre-commit-hooks.git rev: v1.5.1 hooks: - id: remove-crlf files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: remove-tabs name: Tabs remover (C++) files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|xpu|kps)$ args: [--whitespaces-count, '2'] - id: remove-tabs name: Tabs remover (Python) files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ args: [--whitespaces-count, '4'] - repo: https://github.com/psf/black.git rev: 23.3.0 hooks: - id: black args: [--line-length=127, --skip-string-normalization, --skip-magic-trailing-comma] files: (.*\.(py|pyi|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pre-commit/mirrors-isort rev: v5.10.1 hooks: - id: isort args: [--profile=black, --line-length=127, --multi-line=3, --force-grid-wrap=0] files: \.py$ - repo: https://github.com/PyCQA/autoflake rev: v2.3.1 hooks: - id: autoflake args: [--remove-all-unused-imports, --remove-unused-variables, --in-place, --ignore-init-module-imports, --ignore-pass-after-docstring] files: \.py$ - repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks.git rev: v2.9.0 hooks: - id: pretty-format-yaml args: [--autofix, --indent, '4'] ``` ## /LICENSE ``` path="/LICENSE" Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ``` ## /README.md ![magi-logo](figures/logo_black.png) -----


# MAGI-1: Autoregressive Video Generation at Scale

This repository contains the pre-trained weights and inference code for the MAGI-1 model. You can find more details in our [technical report](https://static.magi.world/static/files/MAGI_1.pdf), or create magic with MAGI-1 directly [here](http://sand.ai). 🚀✨

## 🔥🔥🔥 Latest News

- Apr 22, 2025: We plan to release our 4.5B model by the end of April. Final touches are still underway, so stay tuned for the official drop.
- Apr 21, 2025: MAGI-1 is here 🎉. We've released the model weights and inference code. Check it out!

## 1. About

We present MAGI-1, a world model that generates videos by ***autoregressively*** predicting a sequence of video chunks, defined as fixed-length segments of consecutive frames. Trained to denoise per-chunk noise that increases monotonically over time, MAGI-1 enables causal temporal modeling and naturally supports streaming generation. It achieves strong performance on image-to-video (I2V) tasks conditioned on text instructions, providing the high temporal consistency and scalability made possible by several algorithmic innovations and a dedicated infrastructure stack. MAGI-1 further supports controllable generation via chunk-wise prompting, enabling smooth scene transitions, long-horizon synthesis, and fine-grained text-driven control. We believe MAGI-1 offers a promising direction for unifying high-fidelity video generation with flexible instruction control and real-time deployment.
## 2. Model Summary

### Transformer-based VAE

- Variational autoencoder (VAE) with a transformer-based architecture, 8x spatial and 4x temporal compression.
- Fastest average decoding time and highly competitive reconstruction quality.

### Auto-Regressive Denoising Algorithm

MAGI-1 is an autoregressive denoising video generation model that generates videos chunk by chunk instead of as a whole. Each chunk (24 frames) is denoised holistically, and generation of the next chunk begins as soon as the current one reaches a certain level of denoising. This pipelined design enables concurrent processing of up to four chunks for efficient video generation.

![auto-regressive denoising algorithm](figures/algorithm.png)

### Diffusion Model Architecture

MAGI-1 is built upon the Diffusion Transformer and incorporates several key innovations that improve training efficiency and stability at scale, including Block-Causal Attention, the Parallel Attention Block, QK-Norm and GQA, Sandwich Normalization in the FFN, SwiGLU, and Softcap Modulation. For more details, please refer to the [technical report](https://static.magi.world/static/files/MAGI_1.pdf).
![diffusion model architecture](figures/dit_architecture.png)
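The chunk-wise schedule described above can be pictured with a short, self-contained sketch. This is not the repository's pipeline code: `denoise_one_step` is a hypothetical stand-in for a real denoiser call, and `WINDOW_SIZE` / `NUM_STEPS` only loosely mirror the `window_size` / `num_steps` config entries. New chunks enter a sliding window fully noised, every active chunk takes one denoising step per iteration, and the oldest chunk (always the least noisy) is emitted once clean.

```python
# Schematic sketch of chunk-wise autoregressive denoising (illustration only).
from collections import deque

WINDOW_SIZE = 4   # up to four chunks are denoised concurrently
NUM_STEPS = 8     # denoising steps per chunk (placeholder value)


def denoise_one_step(chunk_id: int, noise_level: float) -> float:
    """Placeholder: one denoising step lowers the chunk's noise level."""
    return max(noise_level - 1.0 / NUM_STEPS, 0.0)


def generate(num_chunks: int):
    """Emit chunks in order; older chunks are always less noisy than newer ones."""
    active = deque()   # entries are [chunk_id, noise_level], oldest first
    next_chunk = 0
    finished = []
    while len(finished) < num_chunks:
        # Admit a new fully-noised chunk whenever the window has room.
        if next_chunk < num_chunks and len(active) < WINDOW_SIZE:
            active.append([next_chunk, 1.0])
            next_chunk += 1
        # One concurrent denoising step over every active chunk.
        for entry in active:
            entry[1] = denoise_one_step(entry[0], entry[1])
        # The oldest chunk is always closest to clean; emit it when done.
        while active and active[0][1] == 0.0:
            finished.append(active.popleft()[0])
    return finished


print(generate(num_chunks=6))  # -> [0, 1, 2, 3, 4, 5]
```

Because the oldest chunk in the window is always the closest to clean, the per-chunk noise levels stay monotonically ordered in time, which is what enables causal temporal modeling and streaming generation.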
### Distillation Algorithm

We adopt a shortcut distillation approach that trains a single velocity-based model to support variable inference budgets. By enforcing a self-consistency constraint that equates one large step with two smaller steps, the model learns to approximate flow-matching trajectories across multiple step sizes. During training, step sizes are cyclically sampled from {64, 32, 16, 8}, and classifier-free guidance distillation is incorporated to preserve conditional alignment. This enables efficient inference with minimal loss in fidelity.

## 3. Model Zoo

We provide the pre-trained weights for MAGI-1, including the 24B and 4.5B models as well as the corresponding distilled and distilled+quantized variants. The model weight links are shown in the table below.

| Model | Link | Recommended Machine |
| ----- | ---- | ------------------- |
| T5 | [T5](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/t5) | - |
| MAGI-1-VAE | [MAGI-1-VAE](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/vae) | - |
| MAGI-1-24B | [MAGI-1-24B](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_base) | H100/H800 \* 8 |
| MAGI-1-24B-distill | [MAGI-1-24B-distill](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_distill) | H100/H800 \* 8 |
| MAGI-1-24B-distill+fp8_quant | [MAGI-1-24B-distill+quant](https://huggingface.co/sand-ai/MAGI-1/tree/main/ckpt/magi/24B_distill_quant) | H100/H800 \* 4 or RTX 4090 \* 8 |
| MAGI-1-4.5B | MAGI-1-4.5B | RTX 4090 \* 1 |

## 4. Evaluation

### In-house Human Evaluation

MAGI-1 achieves state-of-the-art performance among open-source models, outperforming Wan-2.1 and HunyuanVideo as well as the closed-source Hailuo (i2v-01). It particularly excels in instruction following and motion quality, positioning it as a strong potential competitor to closed-source commercial models such as Kling.

![inhouse human evaluation](figures/inhouse_human_evaluation.png)

### Physical Evaluation

Thanks to the natural advantages of its autoregressive architecture, MAGI-1 achieves far superior precision in predicting physical behavior through video continuation on the [Physics-IQ benchmark](https://github.com/google-deepmind/physics-IQ-benchmark), significantly outperforming all existing models.

| Model | Phys. IQ Score ↑ | Spatial IoU ↑ | Spatio Temporal ↑ | Weighted Spatial IoU ↑ | MSE ↓ |
| ----- | ---------------- | ------------- | ----------------- | ---------------------- | ----- |
| **V2V Models** | | | | | |
| **Magi (V2V)** | **56.02** | **0.367** | **0.270** | **0.304** | **0.005** |
| VideoPoet (V2V) | 29.50 | 0.204 | 0.164 | 0.137 | 0.010 |
| **I2V Models** | | | | | |
| **Magi (I2V)** | **30.23** | **0.203** | **0.151** | **0.154** | **0.012** |
| Kling1.6 (I2V) | 23.64 | 0.197 | 0.086 | 0.144 | 0.025 |
| VideoPoet (I2V) | 20.30 | 0.141 | 0.126 | 0.087 | 0.012 |
| Gen 3 (I2V) | 22.80 | 0.201 | 0.115 | 0.116 | 0.015 |
| Wan2.1 (I2V) | 20.89 | 0.153 | 0.100 | 0.112 | 0.023 |
| Sora (I2V) | 10.00 | 0.138 | 0.047 | 0.063 | 0.030 |
| **GroundTruth** | **100.0** | **0.678** | **0.535** | **0.577** | **0.002** |

## 5. How to run

### Environment Preparation

We provide two ways to run MAGI-1, with the Docker environment being the recommended option.
**Run with Docker Environment (Recommend)** ```bash docker pull sandai/magi:latest docker run -it --gpus all --privileged --shm-size=32g --name magi --net=host --ipc=host --ulimit memlock=-1 --ulimit stack=6710886 sandai/magi:latest /bin/bash ``` **Run with Source Code** ```bash # Create a new environment conda create -n magi python==3.10.12 # Install pytorch conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.4 -c pytorch -c nvidia # Install other dependencies pip install -r requirements.txt # Install ffmpeg conda install -c conda-forge ffmpeg=4.4 # Install MagiAttention, for more information, please refer to https://github.com/SandAI-org/MagiAttention# git clone git@github.com:SandAI-org/MagiAttention.git cd MagiAttention git submodule update --init --recursive pip install --no-build-isolation . ``` ### Inference Command To run the `MagiPipeline`, you can control the input and output by modifying the parameters in the `example/24B/run.sh` or `example/4.5B/run.sh` script. Below is an explanation of the key parameters: #### Parameter Descriptions - `--config_file`: Specifies the path to the configuration file, which contains model configuration parameters, e.g., `example/24B/24B_config.json`. - `--mode`: Specifies the mode of operation. Available options are: - `t2v`: Text to Video - `i2v`: Image to Video - `v2v`: Video to Video - `--prompt`: The text prompt used for video generation, e.g., `"Good Boy"`. - `--image_path`: Path to the image file, used only in `i2v` mode. - `--prefix_video_path`: Path to the prefix video file, used only in `v2v` mode. - `--output_path`: Path where the generated video file will be saved. #### Bash Script ```bash #!/bin/bash # Run 24B MAGI-1 model bash example/24B/run.sh # Run 4.5B MAGI-1 model bash example/4.5B/run.sh ``` #### Customizing Parameters You can modify the parameters in `run.sh` as needed. For example: - To use the Image to Video mode (`i2v`), set `--mode` to `i2v` and provide `--image_path`: ```bash --mode i2v \ --image_path example/assets/image.jpeg \ ``` - To use the Video to Video mode (`v2v`), set `--mode` to `v2v` and provide `--prefix_video_path`: ```bash --mode v2v \ --prefix_video_path example/assets/prefix_video.mp4 \ ``` By adjusting these parameters, you can flexibly control the input and output to meet different requirements. ### Some Useful Configs (for config.json) > NOTE: If you are running 24B model with RTX 4090 \* 8, please set `pp_size:2 cp_size: 4`. | Config | Help | | -------------- | ------------------------------------------------------------ | | seed | Random seed used for video generation | | video_size_h | Height of the video | | video_size_w | Width of the video | | num_frames | Controls the duration of generated video | | fps | Frames per second, 4 video frames correspond to 1 latent_frame | | cfg_number | Base model uses cfg_number==3, distill and quant model uses cfg_number=1 | | load | Directory containing a model checkpoint. | | t5_pretrained | Path to load pretrained T5 model | | vae_pretrained | Path to load pretrained VAE model | ## 6. License This project is licensed under the Apache License 2.0 - see the [LICENSE](LICENSE) file for details. ## 7. Citation If you find our code or model useful in your research, please cite: ```bibtex @misc{magi1, title={MAGI-1: Autoregressive Video Generation at Scale}, author={Sand-AI}, year={2025}, url={https://static.magi.world/static/files/MAGI_1.pdf}, } ``` ## 8. 
Contact If you have any questions, please feel free to raise an issue or contact us at [research@sand.ai](mailto:research@sand.ai) . ## /example/24B/24B_config.json ```json path="/example/24B/24B_config.json" { "model_config": { "model_name": "videodit_ardf", "num_layers": 48, "hidden_size": 6144, "ffn_hidden_size": 16384, "num_attention_heads": 48, "num_query_groups": 8, "kv_channels": 128, "layernorm_epsilon": 1e-06, "apply_layernorm_1p": true, "x_rescale_factor": 0.1, "half_channel_vae": true, "params_dtype": "torch.bfloat16", "patch_size": 2, "t_patch_size": 1, "in_channels": 32, "out_channels": 32, "cond_hidden_ratio": 0.25, "caption_channels": 4096, "caption_max_length": 800, "xattn_cond_hidden_ratio": 1.0, "cond_gating_ratio": 1.0, "gated_linear_unit": true }, "runtime_config": { "cfg_number": 1, "cfg_t_range": [ 0.0, 0.0217, 0.1, 0.3, 0.999 ], "prev_chunk_scales": [ 1.5, 1.5, 1.5, 1.0, 1.0 ], "text_scales": [ 7.5, 7.5, 7.5, 0.0, 0.0 ], "noise2clean_kvrange": [ 5, 4, 3, 2 ], "clean_chunk_kvrange": 1, "clean_t": 0.9999, "seed": 1234, "num_frames": 96, "video_size_h": 720, "video_size_w": 1280, "num_steps": 8, "window_size": 4, "fps": 24, "chunk_width": 6, "load": "./downloads/24B_base", "t5_pretrained": "./downloads/t5_pretrained", "t5_device": "cuda", "vae_pretrained": "./downloads/vae", "scale_factor": 0.18215, "temporal_downsample_factor": 4 }, "engine_config": { "distributed_backend": "nccl", "distributed_timeout_minutes": 15, "pp_size": 1, "cp_size": 8, "cp_strategy": "cp_ulysses", "ulysses_overlap_degree": 1, "fp8_quant": true, "distill_nearly_clean_chunk_threshold": 0.3, "shortcut_mode": "8,16,16", "distill": true, "kv_offload": true, "enable_cuda_graph": false } } ``` ## /example/24B/run.sh ```sh path="/example/24B/run.sh" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
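# This launcher sets inference environment variables, detects the number of local GPUs
# via nvidia-smi, and starts inference/pipeline/entry.py with torchrun in image-to-video
# (i2v) mode. Adjust --mode, --prompt, --image_path, and --output_path in the command
# below to change the task.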
export CUDA_DEVICE_MAX_CONNECTIONS=1 export NCCL_ALGO=^NVLS export PAD_HQ=1 export PAD_DURATION=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True export OFFLOAD_T5_CACHE=true export OFFLOAD_VAE_CACHE=true export TORCH_CUDA_ARCH_LIST="8.9;9.0" GPUS_PER_NODE=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l) DISTRIBUTED_ARGS=" --rdzv-backend=c10d \ --rdzv-endpoint=localhost:6009 \ --nnodes=1 \ --nproc_per_node=$GPUS_PER_NODE " MAGI_ROOT=$(git rev-parse --show-toplevel) LOG_DIR=log_$(date "+%Y-%m-%d_%H:%M:%S").log export PYTHONPATH="$MAGI_ROOT:$PYTHONPATH" torchrun $DISTRIBUTED_ARGS inference/pipeline/entry.py \ --config_file example/24B/24B_config.json \ --mode i2v \ --prompt "Good Boy" \ --image_path example/assets/image.jpeg \ --output_path example/assets/output_i2v.mp4 \ 2>&1 | tee $LOG_DIR ``` ## /example/4.5B/4.5B_config.json ```json path="/example/4.5B/4.5B_config.json" { "model_config": { "model_name": "videodit_ardf", "num_layers": 34, "hidden_size": 3072, "ffn_hidden_size": 12288, "num_attention_heads": 24, "num_query_groups": 8, "kv_channels": 128, "layernorm_epsilon": 1e-06, "apply_layernorm_1p": true, "x_rescale_factor": 1, "half_channel_vae": false, "params_dtype": "torch.bfloat16", "patch_size": 2, "t_patch_size": 1, "in_channels": 16, "out_channels": 16, "cond_hidden_ratio": 0.25, "caption_channels": 4096, "caption_max_length": 800, "xattn_cond_hidden_ratio": 1.0, "cond_gating_ratio": 1.0, "gated_linear_unit": false }, "runtime_config": { "cfg_number": 3, "cfg_t_range": [ 0.0, 0.0217, 0.1, 0.3, 0.999 ], "prev_chunk_scales": [ 1.5, 1.5, 1.5, 1.0, 1.0 ], "text_scales": [ 7.5, 7.5, 7.5, 0.0, 0.0 ], "noise2clean_kvrange": [ 5, 4, 3, 2 ], "clean_chunk_kvrange": 1, "clean_t": 0.9999, "seed": 1234, "num_frames": 192, "video_size_h": 720, "video_size_w": 1280, "num_steps": 64, "window_size": 4, "fps": 24, "chunk_width": 6, "load": "./downloads/4.5B_base", "t5_pretrained": "./downloads/t5_pretrained", "t5_device": "cpu", "vae_pretrained": "./downloads/vae", "scale_factor": 0.18215, "temporal_downsample_factor": 4 }, "engine_config": { "distributed_backend": "nccl", "distributed_timeout_minutes": 15, "pp_size": 1, "cp_size": 1, "cp_strategy": "cp_ulysses", "ulysses_overlap_degree": 1, "fp8_quant": false, "distill_nearly_clean_chunk_threshold": 0.3, "shortcut_mode": "8,16,16", "distill": false, "kv_offload": true, "enable_cuda_graph": false } } ``` ## /example/4.5B/run.sh ```sh path="/example/4.5B/run.sh" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
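# This launcher configures a single-process, single-GPU run (CUDA_VISIBLE_DEVICES selects
# one device) and calls inference/pipeline/entry.py directly with python3 in text-to-video
# (t2v) mode. Adjust --mode, --prompt, and --output_path in the command below to change
# the task.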
export MASTER_ADDR=localhost export MASTER_PORT=6009 export GPUS_PER_NODE=1 export NNODES=1 export WORLD_SIZE=1 export CUDA_VISIBLE_DEVICES=1 export PAD_HQ=1 export PAD_DURATION=1 export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True export OFFLOAD_T5_CACHE=true export OFFLOAD_VAE_CACHE=true export TORCH_CUDA_ARCH_LIST="8.9;9.0" MAGI_ROOT=$(git rev-parse --show-toplevel) LOG_DIR=log_$(date "+%Y-%m-%d_%H:%M:%S").log export PYTHONPATH="$MAGI_ROOT:$PYTHONPATH" python3 inference/pipeline/entry.py \ --config_file example/4.5B/4.5B_config.json \ --mode t2v \ --prompt "Good Boy" \ --output_path example/assets/output_t2v.mp4 \ 2>&1 | tee $LOG_DIR ``` ## /example/assets/image.jpeg Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/example/assets/image.jpeg ## /example/assets/prefix_video.mp4 Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/example/assets/prefix_video.mp4 ## /example/assets/special_tokens.npz Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/example/assets/special_tokens.npz ## /figures/algorithm.png Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/figures/algorithm.png ## /figures/dit_architecture.png Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/figures/dit_architecture.png ## /figures/inhouse_human_evaluation.png Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/figures/inhouse_human_evaluation.png ## /figures/logo_black.png Binary file available at https://raw.githubusercontent.com/SandAI-org/MAGI-1/refs/heads/main/figures/logo_black.png ## /inference/common/__init__.py ```py path="/inference/common/__init__.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .common_utils import divide, env_is_true, set_random_seed from .config import EngineConfig, MagiConfig, ModelConfig, RuntimeConfig from .dataclass import InferenceParams, ModelMetaArgs, PackedCoreAttnParams, PackedCrossAttnParams from .logger import magi_logger, print_per_rank, print_rank_0 from .timer import event_path_timer __all__ = [ "MagiConfig", "ModelConfig", "EngineConfig", "RuntimeConfig", "magi_logger", "print_per_rank", "print_rank_0", "event_path_timer", "divide", "env_is_true", "set_random_seed", "PackedCoreAttnParams", "PackedCrossAttnParams", "ModelMetaArgs", "InferenceParams", ] ``` ## /inference/common/common_utils.py ```py path="/inference/common/common_utils.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os import random import numpy as np import torch def env_is_true(env_name: str) -> bool: return str(os.environ.get(env_name, "0")).lower() in {"1", "true", "yes", "y", "on", "enabled"} def divide(numerator, denominator): assert numerator % denominator == 0, "{} is not divisible by {}".format(numerator, denominator) return numerator // denominator def set_random_seed(seed): """Set random seed. Args: seed (int): Seed to be used. If not provided or set to 0, a random seed will be generated. """ if not seed or seed == 0: seed = random.randint(0, 2**32 - 1) random.seed(seed) np.random.seed(seed) torch.manual_seed(seed) torch.cuda.manual_seed_all(seed) return seed ``` ## /inference/common/config.py ```py path="/inference/common/config.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import dataclasses import json import os import torch @dataclasses.dataclass class ModelConfig: model_name: str # Transformer num_layers: int = None # Number of transformer layers. hidden_size: int = None # Transformer hidden size. ffn_hidden_size: int = None # Transformer Feed-Forward Network hidden size num_attention_heads: int = None # Number of transformer attention heads. num_query_groups: int = 1 # Number of query groups, which used for GQA kv_channels: int = None # Projection weights dimension in multi-head attention layernorm_epsilon: float = 1e-6 # Epsilon for layer norm and RMS norm. apply_layernorm_1p: bool = False # Adjust LayerNorm weights which improves numerical stability. x_rescale_factor: float = 1.0 half_channel_vae: bool = False params_dtype: torch.dtype = None # Embedding patch_size: int = 2 # (latent) patch size for DiT patch embedding layer t_patch_size: int = 1 # (latent) patch size for t dim patch embedding layer in_channels: int = 4 # latent input channel for DiT out_channels: int = 4 # latent output channel for DiT cond_hidden_ratio: float = 0.25 caption_channels: int = 4096 caption_max_length: int = 800 xattn_cond_hidden_ratio: float = 1.0 cond_gating_ratio: float = 1.0 gated_linear_unit: bool = False @dataclasses.dataclass class RuntimeConfig: # Inference settings such as cfg, kv range, clean t, etc. 
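    # The fields below fall into three groups: CFG / denoising-schedule settings,
    # video and sampling settings (seed, frames, resolution, fps, steps, chunk width),
    # and checkpoint paths for the T5, VAE, and DiT components.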
cfg_number: int = None # Number of CFG cfg_t_range: list = dataclasses.field( default_factory=lambda: [0, 0.0217, 0.1000, 0.3, 0.999] ) # CFG t-range of each scales prev_chunk_scales: list = dataclasses.field( default_factory=lambda: [1.5, 1.5, 1.5, 1.5, 1.5] ) # CFG scales of previous chunks text_scales: list = dataclasses.field(default_factory=lambda: [7.5, 7.5, 7.5, 7.5, 7.5]) # CFG scales of text noise2clean_kvrange: list = dataclasses.field(default_factory=list) # Range of kv for noise2clean chunks clean_chunk_kvrange: int = -1 # Range of kv for clean chunks clean_t: float = 1.0 # timestep for clean chunks # Video settings seed: int = 1234 # Random seed used for python, numpy, pytorch, and cuda. num_frames: int = 128 video_size_h: int = None video_size_w: int = None num_steps: int = 64 # Number of steps for the diffusion model window_size: int = 4 # Window size for the diffusion model fps: int = 24 # Frames per second chunk_width: int = 6 # Clip width for the diffusion model # Checkpoint, includes t5, vae, dit, etc. t5_pretrained: str = None # Path to load pretrained T5 model. t5_device: str = "cuda" # Device for T5 model to run on. vae_pretrained: str = None # Path to load pretrained VAE model. scale_factor: float = 0.18215 # Scale factor for the vae temporal_downsample_factor: int = 4 # Temporal downsample factor for the vae load: str = None # Directory containing a model checkpoint. @dataclasses.dataclass class EngineConfig: # Parallism strategy distributed_backend: str = "nccl" # Choices: ["nccl", "gloo"] distributed_timeout_minutes: int = 10 # Timeout minutes for torch.distributed. pp_size: int = 1 # Degree of pipeline model parallelism. cp_size: int = 1 # Degree of context parallelism. cp_strategy: str = "none" # Choices: ["none", "cp_ulysses", "cp_shuffle_overlap"] ulysses_overlap_degree: int = 1 # Overlap degree for Ulysses # Quantization fp8_quant: bool = False # Enable 8-bit floating point quantization for model weights. 
# Distillation distill_nearly_clean_chunk_threshold: float = 0.3 # Threshold for distilling nearly clean chunks shortcut_mode: str = "8,16,16" # Parameters for shortcut mode distill: bool = False # Use distill mode # Optimization kv_offload: bool = False # Use kv-offload algorithm enable_cuda_graph: bool = False # Enable CUDA graph for video generation @dataclasses.dataclass class MagiConfig: model_config: ModelConfig runtime_config: RuntimeConfig engine_config: EngineConfig @classmethod def _check_missing_fields(cls, config_dict: dict, required_fields: list): actual_fields = set(config_dict.keys()) missing_fields = set(required_fields) - actual_fields if missing_fields: raise ValueError(f"Missing fields in the configuration file: {', '.join(missing_fields)}") @classmethod def _create_nested_config(cls, config_dict: dict, config_name: str, config_cls): nested_config_dict = config_dict.get(config_name, {}) cls._check_missing_fields(nested_config_dict, config_cls.__dataclass_fields__.keys()) return config_cls(**nested_config_dict) @classmethod def _create_config_from_dict(cls, config_dict: dict): cls._check_missing_fields(config_dict, cls.__dataclass_fields__.keys()) # Create nested configs model_config = cls._create_nested_config(config_dict, "model_config", ModelConfig) runtime_config = cls._create_nested_config(config_dict, "runtime_config", RuntimeConfig) engine_config = cls._create_nested_config(config_dict, "engine_config", EngineConfig) return cls(model_config=model_config, runtime_config=runtime_config, engine_config=engine_config) @classmethod def from_json(cls, json_path: str): def simple_json_decoder(dct): dtype_map = {"torch.bfloat16": torch.bfloat16, "torch.float16": torch.float16, "torch.float32": torch.float32} if 'params_dtype' in dct: dct['params_dtype'] = dtype_map[dct['params_dtype']] return dct with open(json_path, "r") as f: config_dict = json.load(f, object_hook=simple_json_decoder) magi_config = cls._create_config_from_dict(config_dict) def post_validation(magi_config): if magi_config.engine_config.fp8_quant or magi_config.engine_config.distill: assert ( magi_config.runtime_config.cfg_number == 1 ), "Please set `cfg_number: 1` in config.json for distill or quant model" else: assert magi_config.runtime_config.cfg_number == 3, "Please set `cfg_number: 3` in config.json for base model" post_validation(magi_config) return magi_config def to_json(self, json_path: str): class SimpleJSONEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, torch.dtype): return str(obj) return super().default(obj) # Ensure the directory exists os.makedirs(os.path.dirname(json_path), exist_ok=True) config_dict = { "model_config": dataclasses.asdict(self.model_config), "runtime_config": dataclasses.asdict(self.runtime_config), "engine_config": dataclasses.asdict(self.engine_config), } with open(json_path, "w") as f: json.dump(config_dict, f, indent=4, cls=SimpleJSONEncoder) ``` ## /inference/common/dataclass.py ```py path="/inference/common/dataclass.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and # limitations under the License. from dataclasses import dataclass from typing import List import numpy as np import torch @dataclass(frozen=True) class PackedCoreAttnParams: # Packed sequence parameters for core_attn q_range: torch.Tensor k_range: torch.Tensor np_q_range: np.ndarray np_k_range: np.ndarray max_seqlen_q: int max_seqlen_k: int @dataclass(frozen=True) class PackedCrossAttnParams: # Packed sequence parameters for cross_attn q_ranges: torch.Tensor = None kv_ranges: torch.Tensor = None cu_seqlens_q: torch.Tensor = None cu_seqlens_kv: torch.Tensor = None max_seqlen_q: int = None max_seqlen_kv: int = None @dataclass(frozen=True) class ModelMetaArgs: H: int W: int cp_pad_size: int cp_split_sizes: List[int] slice_point: int denoising_range_num: int range_num: int extract_prefix_video_feature: bool fwd_extra_1st_chunk: bool distill_nearly_clean_chunk: bool clip_token_nums: int enable_cuda_graph: bool core_attn_params: PackedCoreAttnParams cross_attn_params: PackedCrossAttnParams class InferenceParams: """Inference parameters that are passed to the main model in order to efficienly calculate and store the context during inference.""" def __init__(self, max_batch_size, max_sequence_length): self.max_sequence_length = max_sequence_length self.max_batch_size = max_batch_size self.sequence_len_offset = 0 self.key_value_memory_dict = {} self.update_kv_cache = False def swap_key_value_dict(self, batch_idx): "swap between batches" if len(self.key_value_memory_dict) == 0: raise ValueError("should not swap when dict in empty") for layer_number in self.key_value_memory_dict.keys(): inference_key_memory, inference_value_memory = self.key_value_memory_dict[layer_number] assert len(batch_idx) == inference_key_memory.shape[1] # make sure batch size is the same new_inference_key_memory = inference_key_memory[:, batch_idx] new_inference_value_memory = inference_value_memory[:, batch_idx] self.key_value_memory_dict[layer_number] = (new_inference_key_memory, new_inference_value_memory) ``` ## /inference/common/logger.py ```py path="/inference/common/logger.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
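# Logging utilities for MAGI inference: GlobalLogger lazily creates a single
# "magi_logger" with a StreamHandler; print_per_rank logs on every rank, while
# print_rank_0 logs only on rank 0 when torch.distributed is initialized
# (and unconditionally otherwise).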
import logging import torch class GlobalLogger: _logger = None @classmethod def get_logger(cls, name=__name__, level=logging.INFO): if cls._logger is None: cls._logger = logging.getLogger("magi_logger") cls._logger.setLevel(logging.INFO) cls._logger.propagate = False cls._logger.handlers.clear() formatter = logging.Formatter("[%(asctime)s - %(levelname)s] %(message)s") handler = logging.StreamHandler() handler.setFormatter(formatter) cls._logger.addHandler(handler) return cls._logger magi_logger = GlobalLogger.get_logger() def print_per_rank(message): magi_logger.info(message) def print_rank_0(message): if torch.distributed.is_initialized(): if torch.distributed.get_rank() == 0: magi_logger.info(message) else: magi_logger.info(message) ``` ## /inference/common/timer.py ```py path="/inference/common/timer.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from datetime import datetime import torch from .logger import print_rank_0 class EventPathTimer: """ A lightweight class for recording time without any distributed barrier. This class allows for recording elapsed time between events without requiring synchronization across distributed processes. It maintains the previous message and time to calculate the duration between consecutive records. """ def __init__(self): """ Initialize the EventPathTimer. This constructor sets the previous message and time to None, preparing the instance for recording events. """ self.prev_message: str = None self.prev_time: datetime = None def reset(self): """ Reset the recorded message and time. This method clears the previous message and time, allowing for a fresh start in recording new events. """ self.prev_message = None self.prev_time = None def synced_record(self, message): """ Record the current time with a message. Args: message (str): A message to log along with the current time. This method synchronizes the CUDA operations, records the current time, and calculates the elapsed time since the last recorded message, if any. It then logs the elapsed time along with the previous and current messages. """ torch.cuda.synchronize() current_time = datetime.now() if self.prev_message is not None: print_rank_0( f"\nTime Elapsed: [{current_time - self.prev_time}] From [{self.prev_message} ({self.prev_time})] To [{message} ({current_time})]" ) self.prev_message = message self.prev_time = current_time _GLOBAL_LIGHT_TIMER = EventPathTimer() def event_path_timer() -> EventPathTimer: """Get the current EventPathTimer instance. Returns: EventPathTimer: The current EventPathTimer instance. Raises: AssertionError: If the EventPathTimer has not been initialized. """ assert _GLOBAL_LIGHT_TIMER is not None, "light time recorder is not initialized" return _GLOBAL_LIGHT_TIMER ``` ## /inference/infra/checkpoint/__init__.py ```py path="/inference/infra/checkpoint/__init__.py" # Copyright (c) 2025 SandAI. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .checkpointing import load_checkpoint __all__ = ["load_checkpoint"] ``` ## /inference/infra/checkpoint/checkpointing.py ```py path="/inference/infra/checkpoint/checkpointing.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import io import json import os import re import subprocess from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor from datetime import datetime import numpy as np import torch import torch.distributed from safetensors.torch import load as load_from_bytes from safetensors.torch import load_file from tqdm.auto import tqdm import inference.infra.distributed.parallel_state as mpu from inference.common import EngineConfig, ModelConfig, RuntimeConfig, print_per_rank, print_rank_0 def _load_shard(shard_path, param_names, num_threads=None): zstd_path = shard_path + ".zst" if os.path.exists(zstd_path): start_time = datetime.now() print_per_rank(f"Decompressing {zstd_path} with {num_threads} threads") cmd = ["zstd", "-d"] if num_threads: cmd.extend(["-T", str(num_threads)]) process = subprocess.Popen(cmd + ["-c", zstd_path], stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=-1) decompressed_data = process.stdout.read() process.stdout.close() retcode = process.wait() if retcode != 0: raise RuntimeError(f"Decompression failed: {process.stderr.read().decode()}") print_per_rank( f"Decompressed {zstd_path} with {num_threads} threads, duration: {(datetime.now() - start_time).total_seconds()}s" ) buffer = io.BytesIO(decompressed_data) start_time = datetime.now() print_per_rank(f"Loading {shard_path} from zstd file, start time: {start_time}") weights = load_from_bytes(buffer.getvalue()) print_per_rank(f"Loaded {shard_path} from zstd file, duration: {(datetime.now() - start_time).total_seconds()}s") buffer.close() else: weights = load_file(shard_path) return {name: weights[name] for name in param_names} def load_sharded_safetensors_parallel_with_progress(checkpoint_dir): index_path = os.path.join(checkpoint_dir, "model.safetensors.index.json") with open(index_path, "r") as f: index = json.load(f) state_dict = {} shard_map = {} # Group parameters by shard file for param_name, shard_file in index["weight_map"].items(): shard_path = os.path.join(checkpoint_dir, shard_file) if shard_path not in shard_map: shard_map[shard_path] = [] shard_map[shard_path].append(param_name) # Load shards in parallel with a progress bar with ThreadPoolExecutor() as executor: futures = { 
executor.submit(_load_shard, shard_path, param_names): shard_path for shard_path, param_names in shard_map.items() } pbar = tqdm(futures, desc="Loading shards", total=len(futures)) for future in pbar: result = future.result() state_dict.update(result) return state_dict def unwrap_model(model): return_list = True if not isinstance(model, list): model = [model] return_list = False unwrapped_model = [] for model_module in model: while hasattr(model_module, "module"): model_module = model_module.module unwrapped_model.append(model_module) if not return_list: return unwrapped_model[0] return unwrapped_model def _split_state_dict_for_pp(weight_dict: OrderedDict, model_config: ModelConfig): num_layers = model_config.num_layers partition = mpu.get_pp_world_size() ## use partition and num_layers to get current rank layer order layers_for_each_stage = np.array_split(range(num_layers), partition) current_stage = mpu.get_pp_rank() allow_layer_num = layers_for_each_stage[current_stage] layer_offset = allow_layer_num[0] new_weight_dict = {} for k, v in weight_dict.items(): if "videodit_blocks.layers" in k: layer_num = int(re.search(r"videodit_blocks\.layers\.(\d+)", k).group(1)) if layer_num not in allow_layer_num: continue ## replace the old key name by new layer number new_layer_num = layer_num - layer_offset new_k = k.replace(f"videodit_blocks.layers.{layer_num}", f"videodit_blocks.layers.{new_layer_num}") new_weight_dict[new_k] = v else: new_weight_dict[k] = v return new_weight_dict def load_state_dict(runtime_config: RuntimeConfig, engine_config: EngineConfig): load_dir = runtime_config.load default_subdir = "inference_weight" if engine_config.fp8_quant: default_subdir = f"{default_subdir}.fp8" if engine_config.distill: default_subdir = f"{default_subdir}.distill" inference_weight_dir = os.path.join(load_dir, default_subdir) assert os.path.exists(inference_weight_dir) print_rank_0(f"load {default_subdir} weight from {inference_weight_dir}") assert ( os.path.exists(inference_weight_dir) and len(os.listdir(inference_weight_dir)) > 0 ), f"Ckpt directory {inference_weight_dir} does not exist or empty. If you are using fp8_quant, please run calibration first." state_dict = load_sharded_safetensors_parallel_with_progress(inference_weight_dir) return state_dict def load_checkpoint(model): state_dict = load_state_dict(model.runtime_config, model.engine_config) model = unwrap_model(model) # if we use pipeline parallelism, we need to load the state dict for each stage # as it always record layer from 0 -> num_layers//pipeline_parallel_size # so we need to choose correct layer weight when load_state_dict if mpu.get_pp_world_size() > 1: state_dict = _split_state_dict_for_pp(state_dict, model.model_config) missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False, assign=True) model.cuda(torch.cuda.current_device()) if mpu.get_pp_world_size() > 1: rank_msg = f"CP_rank={mpu.get_cp_rank()} PP_rank={mpu.get_pp_rank()}" print_per_rank( f"""[{rank_msg}] Load Weight Missing Keys: {missing_keys} Load Weight Unexpected Keys: {unexpected_keys} You should see message [missing fianl layer norm weight] except the final pipeline stage""" ) else: print_rank_0(f"Load Weight Missing Keys: {missing_keys}") print_rank_0(f"Load Weight Unexpected Keys: {unexpected_keys}") return model ``` ## /inference/infra/distributed/__init__.py ```py path="/inference/infra/distributed/__init__.py" # Copyright (c) 2025 SandAI. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .dist_utils import dist_init, get_device, get_world_size, is_last_rank, is_last_tp_cp_rank from .parallel_state import ( destroy_model_parallel, get_cp_group, get_cp_rank, get_cp_world_size, get_dp_group, get_dp_group_gloo, get_dp_rank, get_dp_world_size, get_pipeline_model_parallel_first_rank, get_pipeline_model_parallel_last_rank, get_pipeline_model_parallel_next_rank, get_pipeline_model_parallel_prev_rank, get_pp_group, get_pp_rank, get_pp_world_size, get_tensor_model_parallel_last_rank, get_tensor_model_parallel_ranks, get_tensor_model_parallel_src_rank, get_tp_group, get_tp_rank, get_tp_world_size, is_initialized, is_pipeline_first_stage, is_pipeline_last_stage, ) __all__ = [ "dist_init", "is_initialized", "get_tp_group", "get_pp_group", "get_dp_group", "get_dp_group_gloo", "get_cp_group", "get_tp_world_size", "get_pp_world_size", "get_dp_world_size", "get_cp_world_size", "get_tp_rank", "get_pp_rank", "get_dp_rank", "get_cp_rank", "is_pipeline_first_stage", "is_pipeline_last_stage", "get_tensor_model_parallel_src_rank", "get_tensor_model_parallel_ranks", "get_tensor_model_parallel_last_rank", "get_pipeline_model_parallel_first_rank", "get_pipeline_model_parallel_last_rank", "get_pipeline_model_parallel_next_rank", "get_pipeline_model_parallel_prev_rank", "destroy_model_parallel", "is_last_rank", "is_last_tp_cp_rank", "get_world_size", "get_device", ] ``` ## /inference/infra/distributed/dist_utils.py ```py path="/inference/infra/distributed/dist_utils.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import os from datetime import timedelta import torch import inference.infra.distributed.parallel_state as mpu from inference.common import print_rank_0 from inference.infra.parallelism.pipeline_parallel import init_pp_scheduler from . import parallel_state as mpu def dist_init(config): """Initialize torch.distributed and core model parallel.""" assert torch.cuda.is_available() device_count = torch.cuda.device_count() if torch.distributed.is_initialized(): print_rank_0("Torch distribution already initialized, skipping initialization ...") else: rank = int(os.getenv("RANK", "0")) world_size = int(os.getenv("WORLD_SIZE", "1")) # Manually set the device ids. 
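        # Each rank binds to cuda:(rank % device_count) before the process group is initialized.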
if device_count > 0: device = rank % device_count torch.cuda.set_device(device) # Call the init process torch.distributed.init_process_group( backend=config.engine_config.distributed_backend, world_size=world_size, rank=rank, timeout=timedelta(minutes=config.engine_config.distributed_timeout_minutes), ) assert config.engine_config.cp_size * config.engine_config.pp_size == torch.distributed.get_world_size() if device_count > 0: if mpu.model_parallel_is_initialized(): print_rank_0("Model parallel is already initialized") else: mpu.initialize_model_parallel( cp_size=config.engine_config.cp_size, pp_size=config.engine_config.pp_size, nccl_communicator_config_path=None, distributed_timeout_minutes=config.engine_config.distributed_timeout_minutes, order="tp-cp-pp-dp", ) if mpu.get_pp_world_size() > 1: init_pp_scheduler() print_rank_0("Initialize torch distribution and model parallel successfully") def is_last_rank(): return torch.distributed.get_rank() == (torch.distributed.get_world_size() - 1) def is_last_tp_cp_rank(): return mpu.get_tp_rank(with_context_parallel=True) == mpu.get_tp_world_size(with_context_parallel=True) - 1 def get_world_size(): if torch.distributed.is_available() and torch.distributed.is_initialized(): world_size = torch.distributed.get_world_size() else: world_size = 1 return world_size def get_device(local_rank=None): backend = torch.distributed.get_backend() if backend == "nccl": if local_rank is None: device = torch.device("cuda") else: device = torch.device(f"cuda:{local_rank}") elif backend == "gloo": device = torch.device("cpu") else: raise RuntimeError return device ``` ## /inference/infra/distributed/parallel_state.py ```py path="/inference/infra/distributed/parallel_state.py" # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Model and data parallel groups.""" import warnings from datetime import timedelta from typing import List, Optional import torch # Intra-layer model parallel group that the current rank belongs to. _TENSOR_MODEL_PARALLEL_GROUP = None # Tensor parallel group information with context parallel combined. _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = None _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = None # Inter-layer model parallel group that the current rank belongs to. _PIPELINE_MODEL_PARALLEL_GROUP = None # Model parallel group (both intra- and pipeline) that the current rank belongs to. _MODEL_PARALLEL_GROUP = None # Data parallel group that the current rank belongs to. _DATA_PARALLEL_GROUP = None _DATA_PARALLEL_GROUP_GLOO = None # tensor model parallel group and data parallel group combined # used for fp8 and moe training _TENSOR_AND_DATA_PARALLEL_GROUP = None # A list of global ranks for each pipeline group to ease calculation of the source # rank when broadcasting from the first or last pipeline stage. 
_PIPELINE_GLOBAL_RANKS = None # A list of global ranks for each data parallel group to ease calculation of the source # rank when broadcasting weights from src to all other data parallel ranks _DATA_PARALLEL_GLOBAL_RANKS = None # A list of global ranks for each tensor model parallel group to ease calculation of # the first local rank in the tensor model parallel group _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None # Context parallel group that the current rank belongs to _CONTEXT_PARALLEL_GROUP = None # A list of global ranks for each context parallel group to ease calculation of the # destination rank when exchanging KV/dKV between context parallel_ranks _CONTEXT_PARALLEL_GLOBAL_RANKS = None # Data parallel group information with context parallel combined. _DATA_PARALLEL_GROUP_WITH_CP = None _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None # combined parallel group of TP, DP, and CP used for fp8 _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None def get_nccl_options(pg_name, nccl_comm_cfgs): """Set the NCCL process group options. Args: pg_name (str): process group name nccl_comm_cfgs (dict): nccl communicator configurations When an option (e.g., max_ctas) is not found in the config, use the NCCL default setting. """ if pg_name in nccl_comm_cfgs: nccl_options = torch.distributed.ProcessGroupNCCL.Options() nccl_options.config.cga_cluster_size = nccl_comm_cfgs[pg_name].get("cga_cluster_size", 4) nccl_options.config.max_ctas = nccl_comm_cfgs[pg_name].get("max_ctas", 32) nccl_options.config.min_ctas = nccl_comm_cfgs[pg_name].get("min_ctas", 1) return nccl_options else: return None def generate_masked_orthogonal_rank_groups(world_size: int, parallel_size: List[int], mask: List[bool]) -> List[List[int]]: """Generate orthogonal parallel groups based on the parallel size and mask. Arguments: world_size (int): world size parallel_size (List[int]): The parallel size of each orthogonal parallel type. For example, if tensor_parallel_size = 2, pipeline_model_parallel_group = 3, data_parallel_size = 4, and the parallel mapping order is tp-pp-dp, then the parallel_size = [2, 3, 4]. mask (List[bool]): The mask controls which parallel methods the generated groups represent. If mask[i] is True, it means the generated group contains the i-th parallelism method. For example, if parallel_size = [tp_size, pp_size, dp_size], and mask = [True, False , True], then the generated group is the `tp-dp` group, if the mask = [False, True, False], then the generated group is the `pp` group. Algorithm: For orthogonal parallelism, such as tp/dp/pp/cp, the global_rank and local_rank satisfy the following equation: global_rank = tp_rank + dp_rank * tp_size + pp_rank * tp_size * dp_size (1) tp_rank \in [0, tp_size) dp_rank \in [0, dp_size) pp_rank \in [0, pp_size) If we want to get the `dp_group` (tp_size * pp_size groups of dp_size ranks each. For example, if the gpu size is 8 and order is 'tp-pp-dp', size is '2-2-2', and the dp_group here is [[0, 4], [1, 5], [2, 6], [3, 7]].) The tp_rank and pp_rank will be combined to form the `dp_group_index`. dp_group_index = tp_rank + pp_rank * tp_size (2) So, Given that tp_rank and pp_rank satisfy equation (2), and dp_rank in range(0, dp_size), the ranks in dp_group[dp_group_index] satisfies the equation (1). This function solve this math problem. For example, if the parallel_size = [tp_size, dp_size, pp_size] = [2, 3, 4], and the mask = [False, True, False]. 
Then, dp_group_index(0) = tp_rank(0) + pp_rank(0) * 2 dp_group_index(1) = tp_rank(1) + pp_rank(0) * 2 ... dp_group_index(7) = tp_rank(1) + pp_rank(3) * 2 dp_group[0] = 0 + range(0, 3) * 2 + 0 = [0, 2, 4] dp_group[1] = 1 + range(0, 3) * 2 + 0 = [1, 3, 5] ... dp_group[7] = 1 + range(0, 3) * 2 + 3 * 2 * 3 = [19, 21, 23] """ def prefix_product(a: List[int], init=1) -> List[int]: r = [init] for v in a: init = init * v r.append(init) return r def inner_product(a: List[int], b: List[int]) -> int: return sum([x * y for x, y in zip(a, b)]) def decompose(index, shape, stride=None): """ This function solve the math problem below: There is an equation: index = sum(idx[i] * stride[i]) And given the value of index, stride. Return the idx. This function will used to get the pp/dp/pp_rank from group_index and rank_in_group. """ if stride is None: stride = prefix_product(shape) idx = [(index // d) % s for s, d in zip(shape, stride)] # stride is a prefix_product result. And the value of stride[-1] # is not used. assert ( sum([x * y for x, y in zip(idx, stride[:-1])]) == index ), "idx {} with shape {} mismatch the return idx {}".format(index, shape, idx) return idx masked_shape = [s for s, m in zip(parallel_size, mask) if m] unmasked_shape = [s for s, m in zip(parallel_size, mask) if not m] global_stride = prefix_product(parallel_size) masked_stride = [d for d, m in zip(global_stride, mask) if m] unmasked_stride = [d for d, m in zip(global_stride, mask) if not m] group_size = prefix_product(masked_shape)[-1] num_of_group = world_size // group_size ranks = [] for group_index in range(num_of_group): # get indices from unmaksed for group_index. decomposed_group_idx = decompose(group_index, unmasked_shape) rank = [] for rank_in_group in range(group_size): # get indices from masked for rank_in_group. decomposed_rank_idx = decompose(rank_in_group, masked_shape) rank.append( inner_product(decomposed_rank_idx, masked_stride) + inner_product(decomposed_group_idx, unmasked_stride) ) ranks.append(rank) return ranks class RankGenerator(object): def __init__(self, tp: int, dp: int, pp: int, cp: int, order: str) -> None: self.tp = tp self.dp = dp self.pp = pp self.cp = cp self.world_size = tp * dp * pp * cp self.name_to_size = {"tp": self.tp, "pp": self.pp, "dp": self.dp, "cp": self.cp} order = order.lower() for name in self.name_to_size.keys(): if name not in order and self.name_to_size[name] != 1: raise RuntimeError( f"The size of ({name}) is ({self.name_to_size[name]}), but you haven't specified the order ({self.order})." ) elif name not in order: order = order + "-" + name self.order = order self.ordered_size = [self.name_to_size[token] for token in order.split("-")] def get_mask(self, order: str, token: str): ordered_token = order.split("-") token = token.split("-") mask = [False] * len(ordered_token) for t in token: mask[ordered_token.index(t)] = True return mask def get_ranks(self, token): """Get rank group by input token. Arguments: token (str): Specify the ranks type that want to get. If we want to obtain multiple parallel types, we can use a hyphen '-' to separate them. For example, if we want to obtain the TP_DP group, the token should be 'tp-dp'. 
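Example (hypothetical sizes, not taken from a real config): with tp=1, cp=2,
pp=2, dp=1 and order 'tp-cp-pp-dp' on 4 ranks, get_ranks('cp') returns
[[0, 1], [2, 3]], get_ranks('pp') returns [[0, 2], [1, 3]], and
get_ranks('dp') returns [[0], [1], [2], [3]].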
""" mask = self.get_mask(self.order, token) ranks = generate_masked_orthogonal_rank_groups(self.world_size, self.ordered_size, mask) return ranks def initialize_model_parallel( tp_size: int = 1, pp_size: int = 1, cp_size: int = 1, nccl_communicator_config_path: Optional[str] = None, distributed_timeout_minutes: int = 30, order: str = "tp-cp-pp-dp", ) -> None: """Initialize model data parallel groups. Borrow from: https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py Args: tp_size (int, default = 1): The number of GPUs to split individual tensors across. pp_size (int, default = 1): The number of tensor parallel GPU groups to split the Transformer layers across. For example, if tp_size is 4 and pp_size is 2, the model will be split into 2 groups of 4 GPUs. cp_size (int, default = 1): The number of tensor parallel GPU groups to split the network input sequence length across. Compute of attention module requires tokens of full sequence length, so GPUs in a context parallel group need to communicate with each other to exchange information of other sequence chunks. Each GPU and its counterparts in other tensor parallel groups compose a context parallel group. For example, assume we have 8 GPUs, if tensor model parallel size is 4 and context parallel size is 2, the network input will be split into two sequence chunks, which are processed by 2 different groups of 4 GPUs. One chunk is processed by GPU0-3, the other chunk is processed by GPU4-7. Four groups are build to do context parallel communications: [GPU0, GPU4], [GPU1, GPU5], [GPU2, GPU6], and [GPU3, GPU7]. Context parallelism partitions sequence length, so it has no impact on weights, which means weights are duplicated among GPUs in a context parallel group. Hence, weight gradients all-reduce is required in backward. For simplicity, we piggyback GPUs of context parallelism on data parallel group for weight gradient all-reduce. nccl_communicator_config_path (str, default = None): Path to the yaml file of NCCL communicator configurations. `min_ctas`, `max_ctas`, and `cga_cluster_size` can be set for each communicator. distributed_timeout_minutes (int, default = 30): Timeout, in minutes,for operations executed against distributed process groups. See PyTorch documentation at https://pytorch.org/docs/stable/distributed.html for caveats. order (str, default=tp-dp-pp): The rank initialization order of parallelism. Now we support tp-dp-pp and tp-pp-dp orders. Let's say we have a total of 16 GPUs denoted by g0 ... g15 and we use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize the model pipeline. The present function will create 8 tensor model-parallel groups, 4 pipeline model-parallel groups and 8 data-parallel groups as: 8 data_parallel groups: [g0, g2], [g1, g3], [g4, g6], [g5, g7], [g8, g10], [g9, g11], [g12, g14], [g13, g15] 8 tensor model-parallel groups: [g0, g1], [g2, g3], [g4, g5], [g6, g7], [g8, g9], [g10, g11], [g12, g13], [g14, g15] 4 pipeline model-parallel groups: [g0, g4, g8, g12], [g1, g5, g9, g13], [g2, g6, g10, g14], [g3, g7, g11, g15] Note that for efficiency, the caller should make sure adjacent ranks are on the same DGX box. For example if we are using 2 DGX-1 boxes with a total of 16 GPUs, rank 0 to 7 belong to the first box and ranks 8 to 15 belong to the second box. """ # Get world size and rank. Ensure some consistencies. 
assert torch.distributed.is_initialized() world_size: int = torch.distributed.get_world_size() if world_size % (tp_size * pp_size * cp_size) != 0: raise RuntimeError( f"world_size ({world_size}) is not divisible by tp_size " f"({tp_size}) x pp_size ({pp_size}) " f"x cp_size ({cp_size})" ) nccl_comm_cfgs = {} if nccl_communicator_config_path is not None: try: import yaml except ImportError: raise RuntimeError("Cannot import `yaml`. Setting custom nccl communicator configs " "requires the yaml package.") with open(nccl_communicator_config_path, "r") as stream: nccl_comm_cfgs = yaml.safe_load(stream) dp_size: int = world_size // (tp_size * pp_size * cp_size) rank = torch.distributed.get_rank() rank_generator = RankGenerator(tp=tp_size, dp=dp_size, pp=pp_size, cp=cp_size, order=order) timeout = timedelta(minutes=distributed_timeout_minutes) # Build the data-parallel groups. global _DATA_PARALLEL_GROUP global _DATA_PARALLEL_GROUP_GLOO global _DATA_PARALLEL_GLOBAL_RANKS global _DATA_PARALLEL_GROUP_WITH_CP global _DATA_PARALLEL_GROUP_WITH_CP_GLOO global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP assert _DATA_PARALLEL_GROUP is None, "data parallel group is already initialized" for ranks in rank_generator.get_ranks("dp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("dp", nccl_comm_cfgs)) group_gloo = torch.distributed.new_group(ranks, timeout=timeout, backend="gloo") if rank in ranks: _DATA_PARALLEL_GROUP = group _DATA_PARALLEL_GROUP_GLOO = group_gloo _DATA_PARALLEL_GLOBAL_RANKS = ranks for ranks_with_cp in rank_generator.get_ranks("dp-cp"): group_with_cp = torch.distributed.new_group( ranks_with_cp, timeout=timeout, pg_options=get_nccl_options("dp_cp", nccl_comm_cfgs) ) group_with_cp_gloo = torch.distributed.new_group(ranks_with_cp, timeout=timeout, backend="gloo") if rank in ranks_with_cp: _DATA_PARALLEL_GROUP_WITH_CP = group_with_cp _DATA_PARALLEL_GROUP_WITH_CP_GLOO = group_with_cp_gloo _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks_with_cp # Build the context-parallel groups. global _CONTEXT_PARALLEL_GROUP global _CONTEXT_PARALLEL_GLOBAL_RANKS assert _CONTEXT_PARALLEL_GROUP is None, "context parallel group is already initialized" for ranks in rank_generator.get_ranks("cp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("cp", nccl_comm_cfgs)) if rank in ranks: _CONTEXT_PARALLEL_GROUP = group _CONTEXT_PARALLEL_GLOBAL_RANKS = ranks # Build the model-parallel groups. global _MODEL_PARALLEL_GROUP assert _MODEL_PARALLEL_GROUP is None, "model parallel group is already initialized" for ranks in rank_generator.get_ranks("tp-pp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("mp", nccl_comm_cfgs)) if rank in ranks: _MODEL_PARALLEL_GROUP = group # Build the tensor model-parallel groups. global _TENSOR_MODEL_PARALLEL_GROUP global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS assert _TENSOR_MODEL_PARALLEL_GROUP is None, "tensor model parallel group is already initialized" for ranks in rank_generator.get_ranks("tp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("tp", nccl_comm_cfgs)) if rank in ranks: _TENSOR_MODEL_PARALLEL_GROUP = group _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = ranks # Build the tensor + context parallel groups. 
global _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP assert ( _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP is None ), "tensor model parallel group with context parallel is already initialized" for ranks in rank_generator.get_ranks("tp-cp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("tp_cp", nccl_comm_cfgs)) if rank in ranks: _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = group _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = ranks # Build the pipeline model-parallel groups global _PIPELINE_MODEL_PARALLEL_GROUP global _PIPELINE_GLOBAL_RANKS assert _PIPELINE_MODEL_PARALLEL_GROUP is None, "pipeline model parallel group is already initialized" for ranks in rank_generator.get_ranks("pp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("pp", nccl_comm_cfgs)) if rank in ranks: _PIPELINE_MODEL_PARALLEL_GROUP = group _PIPELINE_GLOBAL_RANKS = ranks # Build the tensor + data parallel groups. global _TENSOR_AND_DATA_PARALLEL_GROUP global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP assert _TENSOR_AND_DATA_PARALLEL_GROUP is None, "Tensor + data parallel group is already initialized" for ranks in rank_generator.get_ranks("tp-cp-dp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("tp_cp_dp", nccl_comm_cfgs)) if rank in ranks: _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = group for ranks in rank_generator.get_ranks("tp-dp"): group = torch.distributed.new_group(ranks, timeout=timeout, pg_options=get_nccl_options("tp_dp", nccl_comm_cfgs)) if rank in ranks: _TENSOR_AND_DATA_PARALLEL_GROUP = group def is_initialized(): """Useful for code segments that may be accessed with or without mpu initialization""" return _DATA_PARALLEL_GROUP is not None def is_unitialized() -> bool: """Check if parallel state has been initialized Deprecated. Use is_initialized instead. 
""" warnings.warn("is_unitialized is deprecated, use is_initialized instead", DeprecationWarning) return not is_initialized() def model_parallel_is_initialized(): """Check if model and data parallel groups are initialized.""" if _TENSOR_MODEL_PARALLEL_GROUP is None or _PIPELINE_MODEL_PARALLEL_GROUP is None or _DATA_PARALLEL_GROUP is None: return False return True def get_model_parallel_group(): """Get the model parallel group the caller rank belongs to.""" assert _MODEL_PARALLEL_GROUP is not None, "model parallel group is not initialized" return _MODEL_PARALLEL_GROUP def get_tp_group(check_initialized=True, with_context_parallel=False): """Get the tensor model parallel group the caller rank belongs to.""" if check_initialized: assert _TENSOR_MODEL_PARALLEL_GROUP is not None, "tensor model parallel group is not initialized" if with_context_parallel: assert ( _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP is not None ), "tensor model parallel group with context parallel combined is not initialized" return _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP else: assert _TENSOR_MODEL_PARALLEL_GROUP is not None, "tensor model parallel group is not initialized" return _TENSOR_MODEL_PARALLEL_GROUP def get_pp_group(): """Get the pipeline model parallel group the caller rank belongs to.""" assert _PIPELINE_MODEL_PARALLEL_GROUP is not None, "pipeline_model parallel group is not initialized" return _PIPELINE_MODEL_PARALLEL_GROUP def get_dp_group(with_context_parallel=False): """Get the data parallel group the caller rank belongs to.""" if with_context_parallel: assert ( _DATA_PARALLEL_GROUP_WITH_CP is not None ), "data parallel group with context parallel combined is not initialized" return _DATA_PARALLEL_GROUP_WITH_CP else: assert _DATA_PARALLEL_GROUP is not None, "data parallel group is not initialized" return _DATA_PARALLEL_GROUP def get_dp_group_gloo(with_context_parallel=False): """Get the data parallel group-gloo the caller rank belongs to.""" if with_context_parallel: assert ( _DATA_PARALLEL_GROUP_WITH_CP_GLOO is not None ), "data parallel group-gloo with context parallel combined is not initialized" return _DATA_PARALLEL_GROUP_WITH_CP_GLOO else: assert _DATA_PARALLEL_GROUP_GLOO is not None, "data parallel group-gloo is not initialized" return _DATA_PARALLEL_GROUP_GLOO def get_cp_group(check_initialized=True): """Get the context parallel group the caller rank belongs to.""" if check_initialized: assert _CONTEXT_PARALLEL_GROUP is not None, "context parallel group is not initialized" return _CONTEXT_PARALLEL_GROUP def get_tp_world_size(with_context_parallel=False): """Return world size for the tensor model parallel group.""" return torch.distributed.get_world_size(group=get_tp_group(with_context_parallel=with_context_parallel)) def get_pp_world_size(): """Return world size for the pipeline model parallel group.""" return torch.distributed.get_world_size(group=get_pp_group()) def get_tp_rank(with_context_parallel=False): """Return my rank for the tensor model parallel group.""" return torch.distributed.get_rank(group=get_tp_group(with_context_parallel=with_context_parallel)) def get_pp_rank(): """Return my rank for the pipeline model parallel group.""" return torch.distributed.get_rank(group=get_pp_group()) def is_pipeline_first_stage(): """Return True if in the first pipeline model-parallel stage, False otherwise.""" return get_pp_rank() == 0 def is_pipeline_last_stage(): """Return True if in the last pipeline model-parallel stage, False otherwise.""" return get_pp_rank() == (get_pp_world_size() - 1) def 
get_tensor_model_parallel_src_rank(with_context_parallel=False): """Calculate the global rank corresponding to the first local rank in the tensor model parallel group.""" assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized" if with_context_parallel: assert ( _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None ), "Tensor model parallel group with context parallel combined is not initialized" return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP[0] else: return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[0] def get_tensor_model_parallel_ranks(with_context_parallel=False): """Return all global ranks for the tensor model parallel group.""" if with_context_parallel: assert ( _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None ), "Tensor model parallel group with context parallel combined is not initialized" return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP else: assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized" return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS def get_tensor_model_parallel_last_rank(with_context_parallel=False): """Calculate the global rank corresponding to the first local rank in the tensor model parallel group.""" assert _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS is not None, "Tensor model parallel group is not initialized" if with_context_parallel: assert ( _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP is not None ), "Tensor model parallel group with context parallel combined is not initialized" return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP[-1] else: return _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS[-1] def get_pipeline_model_parallel_first_rank(): """Return the global rank of the first process in the pipeline for the current tensor parallel group""" assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" return _PIPELINE_GLOBAL_RANKS[0] def get_pipeline_model_parallel_last_rank(): """Return the global rank of the last process in the pipeline for the current tensor parallel group""" assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" last_rank_local = get_pp_world_size() - 1 return _PIPELINE_GLOBAL_RANKS[last_rank_local] def get_pipeline_model_parallel_next_rank(): """Return the global rank that follows the caller in the pipeline""" assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" rank_in_pipeline = get_pp_rank() world_size = get_pp_world_size() return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline + 1) % world_size] def get_pipeline_model_parallel_prev_rank(): """Return the global rank that preceeds the caller in the pipeline""" assert _PIPELINE_GLOBAL_RANKS is not None, "Pipeline parallel group is not initialized" rank_in_pipeline = get_pp_rank() world_size = get_pp_world_size() return _PIPELINE_GLOBAL_RANKS[(rank_in_pipeline - 1) % world_size] def get_dp_world_size(with_context_parallel=False): """Return world size for the data parallel group.""" if torch.distributed.is_available() and torch.distributed.is_initialized(): return torch.distributed.get_world_size(group=get_dp_group(with_context_parallel=with_context_parallel)) else: return 0 def get_dp_rank(with_context_parallel=False): """Return my rank for the data parallel group.""" if torch.distributed.is_available() and torch.distributed.is_initialized(): return torch.distributed.get_rank(group=get_dp_group(with_context_parallel=with_context_parallel)) else: return 0 def get_cp_world_size(): """Return world size for the context 
parallel group.""" if torch.distributed.is_available() and torch.distributed.is_initialized(): return torch.distributed.get_world_size(group=get_cp_group()) else: return 0 def get_cp_rank(): """Return my rank for the context parallel group.""" if torch.distributed.is_available() and torch.distributed.is_initialized(): return torch.distributed.get_rank(group=get_cp_group()) else: return 0 def destroy_model_parallel(): """Set the groups to none.""" global _MODEL_PARALLEL_GROUP _MODEL_PARALLEL_GROUP = None global _TENSOR_MODEL_PARALLEL_GROUP _TENSOR_MODEL_PARALLEL_GROUP = None global _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP _TENSOR_MODEL_PARALLEL_GROUP_WITH_CP = None global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS_WITH_CP = None global _PIPELINE_MODEL_PARALLEL_GROUP _PIPELINE_MODEL_PARALLEL_GROUP = None global _DATA_PARALLEL_GROUP _DATA_PARALLEL_GROUP = None global _DATA_PARALLEL_GROUP_GLOO _DATA_PARALLEL_GROUP_GLOO = None global _TENSOR_AND_DATA_PARALLEL_GROUP _TENSOR_AND_DATA_PARALLEL_GROUP = None global _PIPELINE_GLOBAL_RANKS _PIPELINE_GLOBAL_RANKS = None global _DATA_PARALLEL_GLOBAL_RANKS _DATA_PARALLEL_GLOBAL_RANKS = None global _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS _TENSOR_MODEL_PARALLEL_GLOBAL_RANKS = None global _CONTEXT_PARALLEL_GROUP _CONTEXT_PARALLEL_GROUP = None global _CONTEXT_PARALLEL_GLOBAL_RANKS _CONTEXT_PARALLEL_GLOBAL_RANKS = None global _DATA_PARALLEL_GROUP_WITH_CP _DATA_PARALLEL_GROUP_WITH_CP = None global _DATA_PARALLEL_GROUP_WITH_CP_GLOO _DATA_PARALLEL_GROUP_WITH_CP_GLOO = None global _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP _DATA_PARALLEL_GLOBAL_RANKS_WITH_CP = None global _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP _TENSOR_AND_DATA_PARALLEL_GROUP_WITH_CP = None ``` ## /inference/infra/parallelism/__init__.py ```py path="/inference/infra/parallelism/__init__.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .context_parallel import CSOHelper, UlyssesScheduler, cp_post_process, cp_pre_process, cso_communication from .pipeline_parallel import pp_scheduler from .tile_parallel import TileProcessor __all__ = [ "CSOHelper", "cso_communication", "UlyssesScheduler", "pp_scheduler", "TileProcessor", "cp_pre_process", "cp_post_process", ] ``` ## /inference/infra/parallelism/context_parallel.py ```py path="/inference/infra/parallelism/context_parallel.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
import math from typing import Callable, List, Tuple, Union import torch import torch.distributed from einops import rearrange from inference.common import ModelMetaArgs, PackedCoreAttnParams, PackedCrossAttnParams, divide from inference.infra.distributed import parallel_state as mpu ##################################################### # Common Primitives ##################################################### def scatter_to_context_parallel_region(input_, cp_split_sizes, cp_shuffle_num=1, cp_pad_size=0): """Split the tensor along its first dimension and keep the corresponding slice.""" world_size = mpu.get_cp_world_size() # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ # Split along first dimension with padding. rank = mpu.get_cp_rank() if cp_shuffle_num > 1: cp_pad_size = divide(cp_pad_size, cp_shuffle_num) cp_split_sizes = [divide(s, cp_shuffle_num) for s in cp_split_sizes] dim_offset = sum(cp_split_sizes[:rank]) xs = [] for x in torch.chunk(input_, cp_shuffle_num, dim=0): x = torch.nn.functional.pad(x, [0, 0] * (x.dim() - 1) + [0, cp_pad_size], mode="constant", value=0) xs.append(x[dim_offset : dim_offset + cp_split_sizes[rank]]) output = torch.concat(xs, dim=0) else: dim_offset = sum(cp_split_sizes[:rank]) x = torch.nn.functional.pad(input_, [0, 0] * (input_.dim() - 1) + [0, cp_pad_size], mode="constant", value=0) output = x[dim_offset : dim_offset + cp_split_sizes[rank]].contiguous() return output def gather_from_context_parallel_region(input_, cp_split_sizes, cp_shuffle_num=1, cp_pad_size=0): """Gather tensors and concatinate along the first dimension.""" world_size = mpu.get_cp_world_size() # Bypass the function if we are using only 1 GPU. if world_size == 1: return input_ input_ = input_.contiguous() total_seq_len = sum(cp_split_sizes) dim_size = list(input_.size()) dim_size[0] = total_seq_len output = torch.empty(dim_size, dtype=input_.dtype, device=input_.device) outputs = list(torch.split(output, cp_split_sizes, dim=0)) torch.distributed.all_gather(outputs, input_, group=mpu.get_cp_group()) if cp_shuffle_num > 1: total_seq_len = divide(total_seq_len, cp_shuffle_num) cp_pad_size = divide(cp_pad_size, cp_shuffle_num) chunks = [torch.chunk(o, cp_shuffle_num, dim=0) for o in outputs] output = torch.concat( [ torch.concat([chunk[i] for chunk in chunks], dim=0)[: total_seq_len - cp_pad_size] for i in range(cp_shuffle_num) ], dim=0, ) else: output = torch.concat(outputs, dim=0)[: total_seq_len - cp_pad_size] return output class FakeHandle: def __init__(self): pass def wait(self): pass ##################################################### # Context Parallel Process ##################################################### def update_packed_seq_params_for_cuda_graph(cross_attn_params: PackedCrossAttnParams, xattn_mask: torch.Tensor): assert xattn_mask is not None # xattn_mask: (N * denoising_range_num, L, 1, 1) xattn_mask = xattn_mask.reshape(xattn_mask.shape[0], -1) batch_size, static_caption_length = xattn_mask.shape # Get index_map for kv_range injection, map y_index to static_caption_length y_index = torch.sum(xattn_mask, dim=-1) cu_seqlens_k = torch.cat([y_index.new_tensor([0]), y_index]).to(torch.int32).to(xattn_mask.device) cu_seqlens_k = cu_seqlens_k.cumsum(-1).to(torch.int32) static_cu_seqlens_k = torch.arange(0, (batch_size + 1) * static_caption_length, static_caption_length) assert cu_seqlens_k.shape[0] == batch_size + 1 == static_cu_seqlens_k.shape[0] start_index_map = dict(zip(cu_seqlens_k.flatten().tolist(), 
static_cu_seqlens_k.flatten().tolist())) # Move kv_range to the right position kv_range_start_list = cross_attn_params.kv_ranges[:, 0].flatten().tolist() static_kv_range_start = [start_index_map[kv_range_start_list[i]] for i in range(len(kv_range_start_list))] static_kv_range_start = torch.tensor(static_kv_range_start, dtype=torch.int32, device=xattn_mask.device) assert static_kv_range_start.shape[0] == cross_attn_params.kv_ranges.shape[0] static_kv_range_diff = cross_attn_params.kv_ranges[:, 1] - cross_attn_params.kv_ranges[:, 0] static_kv_range_end = static_kv_range_start + static_kv_range_diff static_kv_range = torch.stack((static_kv_range_start, static_kv_range_end), dim=1) assert static_kv_range.shape == cross_attn_params.kv_ranges.shape return PackedCrossAttnParams( q_ranges=cross_attn_params.q_ranges, kv_ranges=static_kv_range, cu_seqlens_q=cross_attn_params.cu_seqlens_q, cu_seqlens_kv=cross_attn_params.cu_seqlens_kv, max_seqlen_q=cross_attn_params.max_seqlen_q, max_seqlen_kv=cross_attn_params.max_seqlen_kv, ) def cp_update_cross_attn_qkv_range( cross_attn_params: PackedCrossAttnParams, batch_size: int, cp_split_sizes: List[int], device: torch.device, cp_shuffle_num: int = 1, cp_pad_size: int = 0, ): """ Update cross_attn_params for cross_attn in context parallel. Input: cross_attn_params: PackedCrossAttnParams. Packed sequence parameters for cross_atten batch_size: int. Batch size cp_split_sizes: List[int]. Split sizes for each rank device: torch.device. Device Output: cross_attn_params: PackedCrossAttnParams. Updated packed parameters for cross_atten """ # Update cu_seqlens_q and max_seqlen_q because split x maybe unbalanced cp_rank = mpu.get_cp_rank() seq_len_cur_rank = cp_split_sizes[cp_rank] cp_split_sizes = [divide(x, cp_shuffle_num) for x in cp_split_sizes] cp_split_sizes = torch.tensor(cp_split_sizes, dtype=torch.int32, device=device) base_cp_boundaries = torch.cat((torch.zeros(1, dtype=torch.int32, device=device), cp_split_sizes.cumsum(0))) total_seq_len = base_cp_boundaries[-1] cu_seqlens_q = cross_attn_params.cu_seqlens_q cu_seqlens_k = cross_attn_params.cu_seqlens_kv cu_seqlens_pad = torch.arange(cu_seqlens_q.shape[0], dtype=torch.int32, device=device) * divide( cp_pad_size, cp_shuffle_num ) cu_seqlens_q = cu_seqlens_q + cu_seqlens_pad q_seg_starts, q_seg_ends = cu_seqlens_q[:-1], cu_seqlens_q[1:] xattn_q_ranges, xattn_k_ranges = [], [] for i in range(batch_size): inner_xattn_q_ranges, inner_xattn_k_ranges = [], [] for j in range(cp_shuffle_num): global_offset = i * total_seq_len * cp_shuffle_num + j * total_seq_len cp_boundaries = base_cp_boundaries + global_offset this_cp_start, this_cp_end = (cp_boundaries[cp_rank], cp_boundaries[cp_rank + 1]) q_inter_starts = torch.maximum(this_cp_start, q_seg_starts) q_inter_ends = torch.minimum(this_cp_end, q_seg_ends) q_mask = q_inter_starts < q_inter_ends valid_q_starts = q_inter_starts[q_mask] valid_q_ends = q_inter_ends[q_mask] k_seg_starts, k_seg_ends = cu_seqlens_k[:-1], cu_seqlens_k[1:] valid_indices = torch.nonzero(q_mask, as_tuple=True)[0] valid_k_starts = k_seg_starts[valid_indices] valid_k_ends = k_seg_ends[valid_indices] part_xattn_q_rangs = torch.stack((valid_q_starts, valid_q_ends), dim=1) offset = part_xattn_q_rangs[:, 0].min() part_xattn_q_rangs = part_xattn_q_rangs - offset inner_xattn_q_ranges.append(part_xattn_q_rangs) inner_xattn_k_ranges.append(torch.stack((valid_k_starts, valid_k_ends), dim=1)) inner_end_values = torch.tensor([ranges[-1, -1] for ranges in inner_xattn_q_ranges], dtype=torch.int32) inner_offsets 
= torch.cat((torch.zeros(1, dtype=inner_end_values.dtype), torch.cumsum(inner_end_values[:-1], dim=0))) inner_xattn_q_ranges = [tensor + int(offset) for tensor, offset in zip(inner_xattn_q_ranges, inner_offsets)] xattn_q_ranges.append(torch.cat(inner_xattn_q_ranges, dim=0)) xattn_k_ranges.append(torch.cat(inner_xattn_k_ranges, dim=0)) end_values = torch.tensor([ranges[-1, -1].item() for ranges in xattn_q_ranges], dtype=torch.int32) offsets = torch.cat((torch.zeros(1, dtype=end_values.dtype), torch.cumsum(end_values[:-1], dim=0))) shifted_tensors = [tensor + int(offset) for tensor, offset in zip(xattn_q_ranges, offsets)] xattn_q_ranges_ts = torch.cat(shifted_tensors, dim=0) xattn_k_ranges_ts = torch.cat(xattn_k_ranges, dim=0) cu_seqlens_q = torch.unique(xattn_q_ranges_ts) cu_seqlens_k = torch.unique(xattn_k_ranges_ts) assert ( cu_seqlens_q.shape == cu_seqlens_k.shape ), f"cu_seqlens_q.shape: {cu_seqlens_q.shape}, cu_seqlens_k.shape: {cu_seqlens_k.shape}, " return PackedCrossAttnParams( q_ranges=xattn_q_ranges_ts, kv_ranges=xattn_k_ranges_ts, cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_k, max_seqlen_q=seq_len_cur_rank, max_seqlen_kv=cross_attn_params.max_seqlen_kv, ) def cp_ulysses_process( cp_size: int, x: torch.Tensor, condition_map: torch.Tensor, rope: torch.Tensor, xattn_mask_for_cuda_graph: Union[torch.Tensor, None], cross_attn_params: PackedCrossAttnParams, ): seq_len, N, D = x.shape assert seq_len == rope.size(0), f"seq_len: {seq_len} != rope.size(0): {rope.size(0)}" assert condition_map.size(0) == seq_len, f"condition_map.size(0): {condition_map.size(0)} != seq_len: {seq_len}" # Part1: split for CP cp_split_sizes = [seq_len // cp_size] * cp_size for i in range(seq_len % cp_size): cp_split_sizes[i] += 1 # Part2: scatter to CP x = scatter_to_context_parallel_region(x, cp_split_sizes) condition_map = scatter_to_context_parallel_region(condition_map, cp_split_sizes) rope = scatter_to_context_parallel_region(rope, cp_split_sizes) # Part3: update cross_attn cross_attn_params cross_attn_params = cp_update_cross_attn_qkv_range(cross_attn_params, N, cp_split_sizes, x.device) if xattn_mask_for_cuda_graph is not None: cross_attn_params = update_packed_seq_params_for_cuda_graph(cross_attn_params, xattn_mask_for_cuda_graph) return x, condition_map, rope, cp_split_sizes, cross_attn_params def cp_shuffle_overlap_process( cp_size: int, x: torch.Tensor, condition_map: torch.Tensor, rope: torch.Tensor, xattn_mask_for_cuda_graph: Union[torch.Tensor, None], ardf_meta: dict, core_attn_params: PackedCoreAttnParams, cross_attn_params: PackedCrossAttnParams, ): seq_len, N, D = x.shape assert seq_len == rope.size(0), f"seq_len: {seq_len} != rope.size(0): {rope.size(0)}" assert condition_map.size(0) == seq_len, f"condition_map.size(0): {condition_map.size(0)} != seq_len: {seq_len}" cp_shuffle_num = ardf_meta["denoising_range_num"] # Part1: calculate cp_pad_size and cp_split_sizes cp_pad_size = 0 if divide(seq_len, cp_shuffle_num) % cp_size != 0: cp_pad_size = (cp_size - divide(seq_len, cp_shuffle_num) % cp_size) * cp_shuffle_num cp_split_sizes = [(seq_len + cp_pad_size) // cp_size] * cp_size # Part2: scatter to CP x = scatter_to_context_parallel_region(x, cp_split_sizes, cp_shuffle_num, cp_pad_size) condition_map = scatter_to_context_parallel_region(condition_map, cp_split_sizes, cp_shuffle_num, cp_pad_size) rope = scatter_to_context_parallel_region(rope, cp_split_sizes, cp_shuffle_num, cp_pad_size) # Part3: update core_attn_params gcd = math.gcd(seq_len, seq_len + cp_pad_size) _sq = seq_len // gcd 
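# Hedged numeric example (shapes invented): seq_len=30, denoising_range_num=3,
# cp_size=4 -> the per-range length 10 is not divisible by 4, so
# cp_pad_size = (4 - 10 % 4) * 3 = 6 and cp_split_sizes = [9, 9, 9, 9];
# gcd(30, 36) = 6 then gives _sq = 5 and _psq = 6, i.e. q_range and
# max_seqlen_q are rescaled by 36/30 to account for the padded tokens.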
_psq = (seq_len + cp_pad_size) // gcd q_range = ardf_meta["q_range"] * _psq // _sq max_seqlen_q = ardf_meta["max_seqlen_q"] * _psq // _sq core_attn_params = PackedCoreAttnParams( q_range=q_range, k_range=ardf_meta["k_range"], np_q_range=q_range.cpu().numpy(), np_k_range=ardf_meta["k_range"].cpu().numpy(), max_seqlen_q=max_seqlen_q, max_seqlen_k=ardf_meta["max_seqlen_k"], ) # Part4: update cross_attn cross_attn_params cross_attn_params = cp_update_cross_attn_qkv_range( cross_attn_params, N, cp_split_sizes, x.device, cp_shuffle_num, cp_pad_size ) if xattn_mask_for_cuda_graph is not None: cross_attn_params = update_packed_seq_params_for_cuda_graph(cross_attn_params, xattn_mask_for_cuda_graph) return x, condition_map, rope, cp_pad_size, cp_split_sizes, core_attn_params, cross_attn_params def cp_pre_process( cp_size: int, cp_strategy: str, x: torch.Tensor, condition_map: torch.Tensor, rope: torch.Tensor, xattn_mask_for_cuda_graph: Union[torch.Tensor, None], ardf_meta: dict, core_attn_params: PackedCoreAttnParams, cross_attn_params: PackedCrossAttnParams, ): """ This function is used to handle context parallel behavior, split input tensors into multiple parts and scatter them to different GPUs. Input: cp_strategy: str. cp_ulysses for hopper or newer, cp_shuffle_overlap for 4090 or older x: (S, N, D). torch.Tensor of inputs embedding (images or latent representations of images) condition_map: (N * S). torch.Tensor determine which condition to use for each token rope: (S, 96). torch.Tensor of rope xattn_mask_for_cuda_graph: (N * denoising_range_num, L, 1, 1). torch.Tensor of xattn mask for cuda graph, None means no cuda graph core_attn_params: PackedCoreAttnParams. Packed sequence parameters for core_atten cross_attn_params: PackedCrossAttnParams. Packed sequence parameters for cross_atten Output: x: (S', N, D). torch.Tensor of inputs embedding (images or latent representations of images) condition_map: (N * S'). torch.Tensor determine which condition to use for each token rope: (S', 96). torch.Tensor of rope cp_split_sizes: List[int]. 
Split sizes for each rank core_attn_params: PackedCoreAttnParams cross_attn_params: PackedCrossAttnParams """ if cp_size == 1: return x, condition_map, rope, None, None, core_attn_params, cross_attn_params if cp_strategy == "cp_ulysses": (x, condition_map, rope, cp_split_sizes, cross_attn_params) = cp_ulysses_process( cp_size, x, condition_map, rope, xattn_mask_for_cuda_graph, cross_attn_params ) return (x, condition_map, rope, 0, cp_split_sizes, core_attn_params, cross_attn_params) elif cp_strategy == "cp_shuffle_overlap": ( x, condition_map, rope, cp_pad_size, cp_split_sizes, core_attn_params, cross_attn_params, ) = cp_shuffle_overlap_process( cp_size, x, condition_map, rope, xattn_mask_for_cuda_graph, ardf_meta, core_attn_params, cross_attn_params ) return (x, condition_map, rope, cp_pad_size, cp_split_sizes, core_attn_params, cross_attn_params) else: raise ValueError(f"Invalid CP strategy: {cp_strategy}, expected cp_ulysses or cp_shuffle_overlap") def cp_post_process(cp_size: int, cp_strategy: str, x: torch.Tensor, meta_args: ModelMetaArgs) -> torch.Tensor: if cp_size == 1: return x if cp_strategy == "cp_shuffle_overlap": x = gather_from_context_parallel_region( x, meta_args.cp_split_sizes, meta_args.denoising_range_num, meta_args.cp_pad_size ) elif cp_strategy == "cp_ulysses": x = gather_from_context_parallel_region(x, meta_args.cp_split_sizes) else: raise ValueError(f"Invalid CP strategy: {cp_strategy}, expected cp_ulysses or cp_shuffle_overlap") return x ##################################################### # Ulysses Attention Pipeline ##################################################### def all_to_all_input_split(tensor: torch.Tensor, cp_split_sizes: List[int]) -> Tuple[torch.Tensor, torch.distributed.Work]: """ Scatter head_number and gather seq_len, for example: input: (seq_len, cp * hn, hd) output: (seq_len * cp, hn, hd) NOTE: seq_len of input maybe not equal, which depends on cp_split_sizes[mpu.get_cp_rank()] """ cp_world_size = mpu.get_cp_world_size() if cp_world_size == 1: return tensor, FakeHandle() assert cp_split_sizes is not None _, hn, _ = tensor.shape if cp_world_size % hn == 0 and cp_world_size != hn: tensor = torch.repeat_interleave(tensor, repeats=divide(cp_world_size, hn), dim=1).contiguous() assert tensor.is_contiguous() input = rearrange(tensor, "seq (cp hn) hd -> (cp seq) hn hd", cp=cp_world_size).contiguous() output = torch.empty([sum(cp_split_sizes), *input.shape[1:]], device=input.device, dtype=input.dtype) handle = torch.distributed.all_to_all_single( output, input, output_split_sizes=cp_split_sizes, group=mpu.get_cp_group(), async_op=True ) return output, handle def all_to_all_output_split(tensor: torch.Tensor, cp_split_sizes: List[int]) -> Tuple[torch.Tensor, torch.distributed.Work]: """ Scatter seq_len and gather head_number, for example: input: (seq_len * cp, hn, hd) output: (seq_len, cp * hn, hd) NOTE: seq_len of output maybe not equal, which depends on cp_split_sizes[mpu.get_cp_rank()] """ cp_world_size = mpu.get_cp_world_size() if cp_world_size == 1: return tensor, FakeHandle() assert cp_split_sizes is not None assert tensor.is_contiguous() _, hn, _ = tensor.shape output = torch.empty( [cp_split_sizes[mpu.get_cp_rank()] * cp_world_size, *tensor.shape[1:]], device=tensor.device, dtype=tensor.dtype ) handle = torch.distributed.all_to_all_single( output, tensor, input_split_sizes=cp_split_sizes, group=mpu.get_cp_group(), async_op=True ) return output, handle def fused_qkv_communication( q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, cp_split_sizes: 
List[int] ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: cp_world_size = mpu.get_cp_world_size() if cp_world_size == 1: return q, k, v assert cp_split_sizes is not None _, k_head, _ = k.shape if cp_world_size % k_head == 0 and cp_world_size != k_head: k = torch.repeat_interleave(k, repeats=divide(cp_world_size, k_head), dim=1) v = torch.repeat_interleave(v, repeats=divide(cp_world_size, k_head), dim=1) q = rearrange(q, "seq (cp hn) hd -> (cp seq) hn hd", cp=cp_world_size).contiguous() k = rearrange(k, "seq (cp hn) hd -> (cp seq) hn hd", cp=cp_world_size).contiguous() v = rearrange(v, "seq (cp hn) hd -> (cp seq) hn hd", cp=cp_world_size).contiguous() head_split_number = [q.shape[1], k.shape[1], v.shape[1]] qkv = torch.cat([q, k, v], dim=1).contiguous() qkv_output = torch.empty([sum(cp_split_sizes), *qkv.shape[1:]], device=qkv.device, dtype=qkv.dtype) torch.distributed.all_to_all_single( qkv_output, qkv, output_split_sizes=cp_split_sizes, group=mpu.get_cp_group(), async_op=False ) q, k, v = torch.split(qkv_output, head_split_number, dim=1) return q, k, v class UlyssesScheduler: def __init__(self): pass @staticmethod def get_attn_and_xattn_with_comm_overlap( get_q_func: Callable, # [seq hn hd] get_k_func: Callable, # [seq hn hd] get_v_func: Callable, # [seq hn hd] kv_cache_func: Callable, core_attn_func: Callable, cross_attn_func: Callable, overlap_degree: int, batch_size: int, cp_size: int, cp_split_sizes: List[int] = None, ): """ Get Q, K, V with communication overlap. Input: get_q: Callable, function to get q, shape [b, sq, hn, hd] get_k: Callable, function to get k, shape [sq, b, hn, hd] get_v: Callable, function to get v, shape [sq, b, hn, hd] NOTE: Why follow such compute and comm order? 1. v_compute 2. k_compute(overlap with v_comm) 3. q_compute(overlap with k_comm) 4. kv_cache_func(overlap with q_comm) Follow the principle: We need to begin comm as soon as possible to hide the comm latency. The computation flops and commnunication order is: flops order: q_compute (larger hidden_size + layernorm) > k_compute (layernorm) > v_compute comm order: q_compute (larger hidden_size) > k_compute = v_compute """ value = get_v_func() value, handle_v = all_to_all_input_split(value, cp_split_sizes) key = get_k_func() key, handle_k = all_to_all_input_split(key, cp_split_sizes) query = get_q_func() query, handle_q = all_to_all_input_split(query, cp_split_sizes) handle_v.wait() handle_k.wait() kv = torch.concat([key, value], dim=-1) key, value = kv_cache_func(kv) handle_q.wait() return UlyssesScheduler.get_attn_and_xattn_base( query, key, value, core_attn_func, cross_attn_func, overlap_degree, batch_size, cp_size, cp_split_sizes ) @staticmethod def get_attn_and_xattn_with_fused_kv_comm( get_q_func: Callable, get_kv_func: Callable, kv_cache_func: Callable, core_attn_func: Callable, cross_attn_func: Callable, overlap_degree: int, batch_size: int, cp_size: int, cp_split_sizes: List[int] = None, ): """ When seq_len is very small, CPU-bound issues are severe. By fusing kv communication, CPU operations and the number of kernel launches are reduced. 
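Hedged call sketch mirroring the body below (names as in the signature):

    kv, h_kv = all_to_all_input_split(get_kv_func(), cp_split_sizes)
    q, h_q = all_to_all_input_split(get_q_func(), cp_split_sizes)
    h_kv.wait(); key, value = kv_cache_func(kv)
    h_q.wait()  # then get_attn_and_xattn_base runs core and cross attention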
""" kv = get_kv_func() kv, handle_kv = all_to_all_input_split(kv, cp_split_sizes) query = get_q_func() query, handle_q = all_to_all_input_split(query, cp_split_sizes) handle_kv.wait() key, value = kv_cache_func(kv) handle_q.wait() return UlyssesScheduler.get_attn_and_xattn_base( query, key, value, core_attn_func, cross_attn_func, overlap_degree, batch_size, cp_size, cp_split_sizes ) def get_attn_and_xattn_with_fused_qkv_comm( get_qkv_func: Callable, kv_cache_func: Callable, core_attn_func: Callable, cross_attn_func: Callable, overlap_degree: int, batch_size: int, cp_size: int, cp_split_sizes: List[int] = None, ): """ By fusing the communication of q, k, and v together, further optimize CPU-bound issues. """ q, k, v = get_qkv_func() q, k, v = fused_qkv_communication(q, k, v, cp_split_sizes) k, v = kv_cache_func(torch.cat([k, v], dim=-1)) return UlyssesScheduler.get_attn_and_xattn_base( q, k, v, core_attn_func, cross_attn_func, overlap_degree, batch_size, cp_size, cp_split_sizes ) @staticmethod def get_attn_and_xattn_base( query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, core_attn_func: Callable, cross_attn_func: Callable, overlap_degree: int, batch_size: int, cp_size: int, cp_split_sizes: List[int] = None, ): # Split Query, Key, Value into multiple parts # k/v may have different sequence length with q due to kv cache q_seq, q_head, q_hidden = query.shape kv_seq, kv_head, kv_hidden = key.shape if overlap_degree == -1: overlap_degree = q_head // kv_head else: assert overlap_degree <= q_head if overlap_degree == 1: query = [query] elif kv_head == 1: # MQA query = query.chunk(overlap_degree, dim=1) else: # GQA assert q_head % (overlap_degree * kv_head) == 0 query = query.reshape(q_seq, kv_head, -1, q_hidden) query = query.chunk(overlap_degree, dim=2) query = [q.reshape(q_seq, -1, q_hidden) for q in query] # Compute Core Attention handle_attn = None core_attn_out = None core_attn_outs = [] for i in range(overlap_degree): core_attn_out_new = core_attn_func(query[i], key, value) if handle_attn is not None: handle_attn.wait() core_attn_outs.append(core_attn_out) core_attn_out, handle_attn = all_to_all_output_split(core_attn_out_new, cp_split_sizes) xattn_out = cross_attn_func() handle_attn.wait() core_attn_outs.append(core_attn_out) core_attn_out = torch.cat(core_attn_outs, dim=1) core_attn_out = rearrange(core_attn_out, "(cp sq b) hn hd -> (sq) b (cp hn hd)", cp=cp_size, b=batch_size) return core_attn_out, xattn_out ##################################################### # CSO(context shuffle overlap) Attention Pipeline ##################################################### def cso_communication( input: torch.Tensor, cp_world_size: int, cp_split_sizes: List[int], comm_type: str = None ) -> Tuple[torch.Tensor, torch.distributed.Work]: if cp_world_size == 1: return input, FakeHandle() assert cp_split_sizes is not None _, hn, _ = input.shape if comm_type == "kv": if cp_world_size % hn == 0 and cp_world_size != hn: input = torch.repeat_interleave(input, repeats=divide(cp_world_size, hn), dim=1) input = rearrange(input, "spb (cp hn) hd -> (cp spb) hn hd", cp=cp_world_size).contiguous() output = torch.empty(input.shape, device=input.device, dtype=input.dtype) handle = torch.distributed.all_to_all_single( output, input, input_split_sizes=cp_split_sizes, group=mpu.get_cp_group(), async_op=True ) return output, handle class CSOHelper: def __init__(self, cp_shuffle_num, cp_world_size, cp_split_sizes): self.cp_shuffle_num = cp_shuffle_num self.cp_world_size = cp_world_size self.cp_split_sizes = 
[divide(x, self.cp_shuffle_num) for x in cp_split_sizes] def split_query_for_overlap(self, query): query = rearrange( query, "(dn spb) (cp hn) hd -> (dn cp spb) hn hd", cp=self.cp_world_size, dn=self.cp_shuffle_num ).contiguous() querys = list(torch.chunk(query, self.cp_shuffle_num, dim=0)) querys[0], handle_q = cso_communication(querys[0], self.cp_world_size, self.cp_split_sizes) return querys, handle_q def overlap(self, fattn, qs, k, v): core_attn_outs = [] for i in range(self.cp_shuffle_num): if self.cp_shuffle_num == 1: q = qs[0] elif i == 0: q = qs[0] loop_var, loop_handle = cso_communication(qs[i + 1], self.cp_world_size, self.cp_split_sizes) else: loop_handle.wait() if loop_var.numel() == qs[0].numel(): q = loop_var else: assert loop_var.numel() == qs[0].numel() * 2 q, ready_o = torch.chunk(loop_var, 2, dim=-1) core_attn_outs.append(ready_o) loop_var = torch.concat([qs[i + 1], o], dim=-1) if i < self.cp_shuffle_num - 1 else o loop_var, loop_handle = cso_communication(loop_var, self.cp_world_size, self.cp_split_sizes) o = fattn(q, k, v, i) if i == self.cp_shuffle_num - 1: if i != 0: loop_handle.wait() assert loop_var.numel() == qs[0].numel() core_attn_outs.append(loop_var) last_o, handle_attn = cso_communication(o, self.cp_world_size, self.cp_split_sizes) core_attn_outs.append(last_o) return core_attn_outs, handle_attn ``` ## /inference/infra/parallelism/pipeline_parallel.py ```py path="/inference/infra/parallelism/pipeline_parallel.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import queue from dataclasses import dataclass from typing import Optional import torch from inference.infra.distributed import parallel_state as mpu @dataclass class TensorAndHandler: tensor: torch.Tensor handler: torch.distributed.Work class PPScheduler: def __init__(self): """Initialize an instance of the PPScheduler class""" self.device: torch.device = torch.device(f"cuda:{torch.cuda.current_device()}") self.recv_queue: queue.Queue = queue.Queue() def isend_next(self, tensor: torch.Tensor) -> torch.distributed.Work: """Asynchronously send a tensor to the next pipeline and return the send handle. Args: tensor (torch.Tensor): The tensor to be sent. Returns: torch.distributed.Work: The handle for the send operation. """ handle = torch.distributed.isend( tensor.contiguous(), dst=mpu.get_pipeline_model_parallel_next_rank(), group=mpu.get_pp_group() ) return handle def irecv_prev(self, buffer: torch.Tensor) -> torch.distributed.Work: """Asynchronously receive a tensor from the previous pipeline and return the receive handle. Args: buffer (torch.Tensor): The buffer tensor for receiving data. Returns: torch.distributed.Work: The handle for the receive operation. """ handle = torch.distributed.irecv(buffer, src=mpu.get_pipeline_model_parallel_prev_rank(), group=mpu.get_pp_group()) return handle def recv_prev_data(self, shape: torch.Size, dtype: torch.dtype) -> torch.Tensor: """Receive data from the previous pipeline and return the received tensor. 
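Hedged usage sketch (shape and dtype are illustrative): the receiving stage
posts a buffer matching what the previous stage sent with isend_next, e.g.
pp_scheduler().recv_prev_data(torch.Size([s, b, h]), torch.bfloat16) pairs
with an isend_next of an (s, b, h) bfloat16 activation.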
Args: shape (torch.Size): The shape of the tensor to receive. dtype (torch.dtype): The data type of the tensor to receive. Returns: torch.Tensor: The received tensor. """ recv_tensor = torch.empty(shape, dtype=dtype, device=self.device) self.irecv_prev(recv_tensor).wait() return recv_tensor def queue_irecv_prev(self, shape: torch.Size, dtype: torch.dtype) -> None: """Put the asynchronously received tensor and handle into the receive queue. Args: shape (torch.Size): The shape of the tensor to receive. dtype (torch.dtype): The data type of the tensor to receive. """ recv_tensor = torch.empty(shape, dtype=dtype, device=self.device) handle = self.irecv_prev(recv_tensor) self.recv_queue.put(TensorAndHandler(tensor=recv_tensor, handler=handle)) def queue_irecv_prev_data(self) -> torch.Tensor: """Get a tensor from the receive queue and wait for the receive operation to complete. Returns: torch.Tensor: The received tensor obtained from the queue. """ tensor_and_handler = self.recv_queue.get() tensor_and_handler.handler.wait() return tensor_and_handler.tensor _PP_SCHEDULER: Optional[PPScheduler] = None def init_pp_scheduler(): """Initialize the PPScheduler instance. Raises: AssertionError: If the PPScheduler is already initialized. """ global _PP_SCHEDULER assert _PP_SCHEDULER is None, "pipeline model parallel group is already initialized" _PP_SCHEDULER = PPScheduler() def pp_scheduler() -> PPScheduler: """Get the current PPScheduler instance. Returns: PPScheduler: The current PPScheduler instance. Raises: AssertionError: If the PPScheduler has not been initialized. """ assert _PP_SCHEDULER is not None, "pipeline model parallel group is not initialized" return _PP_SCHEDULER ``` ## /inference/infra/parallelism/tile_parallel.py ```py path="/inference/infra/parallelism/tile_parallel.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from collections import OrderedDict from typing import List import torch from tqdm import tqdm class ParallelHelper: def __init__(self): pass @staticmethod def split_tile_list( tile_numel_dict: OrderedDict[int, int], parallel_group: torch.distributed.ProcessGroup = None ) -> List[int]: """ Splits the given tile size into a list of sizes that each rank should handle. This method takes into account the number of ranks in a distributed setting. If the distributed environment is not initialized, it returns a list of integers from 0 to tile_size - 1, representing each tile index. If the distributed environment is initialized, it calculates the base tile size for each rank and distributes any remaining tiles among the ranks. Args: tile_numel_dict (OrderedDict[int, int]): Dict of index and numel of tiles. parallel_group (torch.distributed.ProcessGroup, optional): Distributed decoding group. Defaults to None. Returns: List[int]: A list of tile indices assigned to the current rank. List[int]: A list of global tile indices. 
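Example (tile sizes invented for illustration): with
tile_numel_dict = {0: 40, 1: 100, 2: 60, 3: 80, 4: 20} and two ranks, tiles
are ordered by numel as [1, 3, 2, 0, 4]; rank 0 is assigned [1, 2, 4], rank 1
is assigned [3, 0], and global_tile_idxs is [1, 2, 4, 3, 0], which lets
gather_frames restore the original tile order afterwards.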
""" if not torch.distributed.is_initialized(): return list(range(len(tile_numel_dict))), list(range(len(tile_numel_dict))) else: tile_idxs = list(OrderedDict(sorted(tile_numel_dict.items(), key=lambda x: x[1], reverse=True)).keys()) world_size = torch.distributed.get_world_size(group=parallel_group) cur_rank = torch.distributed.get_rank(group=parallel_group) global_tile_idxs = [] cur_rank_tile_idxs = [] for rank in range(world_size): rank_tile_idxs = [tile_idxs[rank + world_size * i] for i in range(len(tile_idxs) // world_size)] if rank < len(tile_idxs) % world_size: rank_tile_idxs.append(tile_idxs[len(tile_idxs) // world_size * world_size + rank]) if rank == cur_rank: cur_rank_tile_idxs = rank_tile_idxs global_tile_idxs = global_tile_idxs + rank_tile_idxs return cur_rank_tile_idxs, global_tile_idxs @staticmethod def gather_frames( frames: List[torch.Tensor], global_tile_idxs: List[int], parallel_group: torch.distributed.ProcessGroup = None ) -> List[torch.Tensor]: """ Gathers frame data from all ranks in a distributed environment. This method collects frames from all ranks and combines them into a single list. If the distributed environment is not initialized, it simply returns the input frames. Args: frames (List[torch.Tensor]): A list of frames (tensors) from the current rank. global_tile_idxs (List[int]): A list of global tile indices. parallel_group (torch.distributed.ProcessGroup, optional): Distributed decoding group. Defaults to None. Returns: List[torch.Tensor]: A list of frames (tensors) from all ranks. """ if not torch.distributed.is_initialized(): return frames else: # assert len(frames) > 0 # Communicate shapes if len(frames) == 0: cur_rank_shapes = [] else: cur_rank_shapes = [frame.shape for frame in frames] all_rank_shapes = [None] * torch.distributed.get_world_size(group=parallel_group) torch.distributed.all_gather_object(all_rank_shapes, cur_rank_shapes, group=parallel_group) all_rank_sizes = [] total_size = [] for per_rank_shapes in all_rank_shapes: per_rank_sizes = [] per_rank_total_size = 0 for shape in per_rank_shapes: per_rank_sizes.append(shape[0] * shape[1] * shape[2] * shape[3] * shape[4]) per_rank_total_size += shape[0] * shape[1] * shape[2] * shape[3] * shape[4] all_rank_sizes.append(per_rank_sizes) total_size.append(per_rank_total_size) # Gather all frames if len(frames) == 0: flattened_frames = torch.zeros([0], dtype=torch.bfloat16, device="cuda") else: flattened_frames = torch.cat([frame.flatten().contiguous() for frame in frames], dim=0) assert flattened_frames.dtype == torch.bfloat16 gather_tensors = [ torch.zeros(total_size[i], dtype=torch.bfloat16, device="cuda") for i in range(torch.distributed.get_world_size(group=parallel_group)) ] torch.distributed.all_gather(gather_tensors, flattened_frames, group=parallel_group) result_frames = [] for idx, per_rank_shapes in enumerate(all_rank_shapes): offset = 0 for j, shape in enumerate(per_rank_shapes): result_frames.append(gather_tensors[idx][offset : offset + all_rank_sizes[idx][j]].view(shape)) offset += all_rank_sizes[idx][j] result_frames_dict = OrderedDict((idx, frame) for idx, frame in zip(global_tile_idxs, result_frames)) result_frames = list(OrderedDict(sorted(result_frames_dict.items())).values()) return result_frames @staticmethod def index_undot(index: int, loop_size: List[int]) -> List[int]: """ Converts a single index into a list of indices, representing the position in a multi-dimensional space. 
This method takes an integer index and a list of loop sizes, and converts the index into a list of indices that correspond to the position in a multi-dimensional space. Args: index (int): The single index to be converted. loop_size (List[int]): A list of integers representing the size of each dimension in the multi-dimensional space. Returns: List[int]: A list of integers representing the position in the multi-dimensional space. """ undotted_index = [] for i in range(len(loop_size) - 1, -1, -1): undotted_index.append(index % loop_size[i]) index = index // loop_size[i] undotted_index.reverse() assert len(undotted_index) == len(loop_size) return undotted_index @staticmethod def index_dot(index: List[int], loop_size: List[int]) -> int: """ Converts a list of indices into a single index, representing the position in a multi-dimensional space. This method takes a list of indices and a list of loop sizes, and converts the list of indices into a single index that corresponds to the position in a multi-dimensional space. Args: index (List[int]): A list of integers representing the position in the multi-dimensional space. loop_size (List[int]): A list of integers representing the size of each dimension in the multi-dimensional space. Returns: int: A single integer representing the position in the multi-dimensional space. """ assert len(index) == len(loop_size) dot_index = 0 strides = [1] for i in range(len(loop_size) - 1, -1, -1): strides.append(strides[-1] * loop_size[i]) strides.reverse() strides = strides[1:] assert len(index) == len(strides) for i in range(len(index)): dot_index += index[i] * strides[i] return dot_index class TileProcessor: def __init__( self, encode_fn, decode_fn, tile_sample_min_height: int = 256, tile_sample_min_width: int = 256, tile_sample_min_length: int = 16, spatial_downsample_factor: int = 8, temporal_downsample_factor: int = 1, spatial_tile_overlap_factor: float = 0.25, temporal_tile_overlap_factor: float = 0, sr_ratio=1, first_frame_as_image: bool = False, parallel_group: torch.distributed.ProcessGroup = None, ): """ Initializes a TileProcessor instance. Args: encode_fn (function): The encoding function used for tile sampling. decode_fn (function): The decoding function used for tile reconstruction. tile_sample_min_height (int, optional): The minimum height of the sampled tiles. Defaults to 256. tile_sample_min_width (int, optional): The minimum width of the sampled tiles. Defaults to 256. tile_sample_min_length (int, optional): The minimum length of the sampled tiles. Defaults to 16. spatial_downsample_factor (int, optional): The actual spatial downsample factor of the given encode_fn. Defaults to 8. temporal_downsample_factor (int, optional): The actual temporal downsample factor of the latent space tiles. Defaults to 1. spatial_tile_overlap_factor (float, optional): The spatial overlap factor between adjacent tiles. Defaults to 0.25. temporal_tile_overlap_factor (float, optional): The temporal overlap factor between adjacent tiles. Defaults to 0. sr_ratio (int, optional): Extra spatial scale applied to decoded tile sizes in tiled_decode. Defaults to 1. first_frame_as_image (bool, optional): If True, one extra latent frame is reserved in tile_latent_min_length. Defaults to False. parallel_group (torch.distributed.ProcessGroup, optional): Distributed decoding group. Defaults to None.
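Note: the latent-space tile sizes (tile_latent_min_height/width/length) are derived in __init__ by dividing the sample sizes by the corresponding spatial/temporal downsample factors.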
""" self.encode_fn = encode_fn self.decode_fn = decode_fn self.spatial_downsample_factor = spatial_downsample_factor self.temporal_downsample_factor = temporal_downsample_factor self.tile_sample_min_height = tile_sample_min_height self.tile_sample_min_width = tile_sample_min_width self.tile_sample_min_length = tile_sample_min_length self.tile_latent_min_height = tile_sample_min_height // spatial_downsample_factor self.tile_latent_min_width = tile_sample_min_width // spatial_downsample_factor self.tile_latent_min_length = tile_sample_min_length // temporal_downsample_factor if first_frame_as_image: self.tile_latent_min_length += 1 self.spatial_tile_overlap_factor = spatial_tile_overlap_factor self.temporal_tile_overlap_factor = temporal_tile_overlap_factor self.sr_ratio = sr_ratio self.parallel_group = parallel_group def blend_t(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[2], b.shape[2], blend_extent) for t in range(blend_extent): b[:, :, t, :, :] = a[:, :, -blend_extent + t, :, :] * (1 - t / blend_extent) + b[:, :, t, :, :] * ( t / blend_extent ) return b def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[3], b.shape[3], blend_extent) for y in range(blend_extent): b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * ( y / blend_extent ) return b def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor: blend_extent = min(a.shape[4], b.shape[4], blend_extent) for x in range(blend_extent): b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * ( x / blend_extent ) return b def tiled_encode(self, x: torch.FloatTensor, verbose: bool = False): overlap_height = int(self.tile_sample_min_height * (1 - self.spatial_tile_overlap_factor)) overlap_width = int(self.tile_sample_min_width * (1 - self.spatial_tile_overlap_factor)) overlap_length = int(self.tile_sample_min_length * (1 - self.temporal_tile_overlap_factor)) blend_extent_h = int(self.tile_latent_min_height * self.spatial_tile_overlap_factor) blend_extent_w = int(self.tile_latent_min_width * self.spatial_tile_overlap_factor) blend_extent_t = int(self.tile_latent_min_length * self.temporal_tile_overlap_factor) height_limit = self.tile_latent_min_height - blend_extent_h width_limit = self.tile_latent_min_width - blend_extent_w frame_limit = self.tile_latent_min_length - blend_extent_t length_tile_size = (x.shape[2] + overlap_length - 1) // overlap_length height_tile_size = (x.shape[3] + overlap_height - 1) // overlap_height width_tile_size = (x.shape[4] + overlap_width - 1) // overlap_width total_tile_size = length_tile_size * height_tile_size * width_tile_size for_loop_size = [length_tile_size, height_tile_size, width_tile_size] tiles = [] tile_numel_dict = OrderedDict() for tile_index in range(total_tile_size): undot_tile_index = ParallelHelper.index_undot(tile_index, for_loop_size) f_idx, i_idx, j_idx = undot_tile_index f = f_idx * overlap_length i = i_idx * overlap_height j = j_idx * overlap_width # Extract the tile from the latent representation and decode it tile = x[ :, :, f : f + self.tile_sample_min_length, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width, ] tiles.append(tile) tile_numel_dict[tile_index] = tile.numel() tile_index_list, global_tile_index_list = ParallelHelper.split_tile_list( tile_numel_dict, parallel_group=self.parallel_group ) progress_bar = tqdm( 
total=len(tile_index_list), desc=f"[Rank {torch.distributed.get_rank(group=self.parallel_group)}] Encoding Tiles", disable=not verbose, ) frames = [] # Encode each tile based on the tile index list for tile_index in tile_index_list: tile = tiles[tile_index] encoded = self.encode_fn(tile) frames.append(encoded) progress_bar.update(1) # Gather all decoded frames from different ranks frames = ParallelHelper.gather_frames(frames, global_tile_index_list, parallel_group=self.parallel_group) assert len(frames) == total_tile_size progress_bar.close() result_frames = [] # Blend the encoded tiles to create the final output for tile_index in range(total_tile_size): undot_tile_index = ParallelHelper.index_undot(tile_index, for_loop_size) f, i, j = undot_tile_index tile = frames[tile_index] # Blend with previous tiles if applicable if f > 0: idx = ParallelHelper.index_dot([f - 1, i, j], for_loop_size) tile = self.blend_t(frames[idx], tile, blend_extent_t) if i > 0: idx = ParallelHelper.index_dot([f, i - 1, j], for_loop_size) tile = self.blend_v(frames[idx], tile, blend_extent_h) if j > 0: idx = ParallelHelper.index_dot([f, i, j - 1], for_loop_size) tile = self.blend_h(frames[idx], tile, blend_extent_w) result_frames.append(tile[:, :, :frame_limit, :height_limit, :width_limit]) assert len(result_frames) == total_tile_size concat_frames = [] for f in range(length_tile_size): result_rows = [] for i in range(height_tile_size): result_row = [] for j in range(width_tile_size): idx = ParallelHelper.index_dot([f, i, j], for_loop_size) result_row.append(result_frames[idx]) result_rows.append(torch.cat(result_row, dim=4)) concat_frames.append(torch.cat(result_rows, dim=3)) # Concatenate all result frames along the temporal dimension result = torch.cat(concat_frames, dim=2) return result def tiled_decode(self, z: torch.FloatTensor, verbose: bool = False): overlap_height = int(self.tile_latent_min_height * (1 - self.spatial_tile_overlap_factor)) overlap_width = int(self.tile_latent_min_width * (1 - self.spatial_tile_overlap_factor)) overlap_length = int(self.tile_latent_min_length * (1 - self.temporal_tile_overlap_factor)) real_tile_sample_min_height = int(self.tile_latent_min_height * self.spatial_downsample_factor * self.sr_ratio) real_tile_sample_min_width = int(self.tile_latent_min_width * self.spatial_downsample_factor * self.sr_ratio) real_tile_sample_min_length = int(self.tile_latent_min_length * self.temporal_downsample_factor) blend_extent_h = int(real_tile_sample_min_height * self.spatial_tile_overlap_factor) blend_extent_w = int(real_tile_sample_min_width * self.spatial_tile_overlap_factor) blend_extent_t = int(real_tile_sample_min_length * self.temporal_tile_overlap_factor) height_limit = real_tile_sample_min_height - blend_extent_h width_limit = real_tile_sample_min_width - blend_extent_w frame_limit = real_tile_sample_min_length - blend_extent_t length_tile_size = (z.shape[2] + overlap_length - 1) // overlap_length height_tile_size = (z.shape[3] + overlap_height - 1) // overlap_height width_tile_size = (z.shape[4] + overlap_width - 1) // overlap_width total_tile_size = length_tile_size * height_tile_size * width_tile_size for_loop_size = [length_tile_size, height_tile_size, width_tile_size] tiles = [] tile_numel_dict = OrderedDict() for tile_index in range(total_tile_size): undot_tile_index = ParallelHelper.index_undot(tile_index, for_loop_size) f_idx, i_idx, j_idx = undot_tile_index f = f_idx * overlap_length i = i_idx * overlap_height j = j_idx * overlap_width # Extract the tile from the latent 
representation and decode it tile = z[ :, :, f : f + self.tile_latent_min_length, i : i + self.tile_latent_min_height, j : j + self.tile_latent_min_width, ] tiles.append(tile) tile_numel_dict[tile_index] = tile.numel() tile_index_list, global_tile_index_list = ParallelHelper.split_tile_list( tile_numel_dict, parallel_group=self.parallel_group ) progress_bar = tqdm( total=len(tile_index_list), desc=f"[Rank {torch.distributed.get_rank(group=self.parallel_group)}] Decoding Tiles", disable=not verbose, ) frames = [] # Decode each tile based on the tile index list for tile_index in tile_index_list: tile = tiles[tile_index] decoded = self.decode_fn(tile) frames.append(decoded) progress_bar.update(1) progress_bar.close() # Gather all decoded frames from different ranks frames = ParallelHelper.gather_frames(frames, global_tile_index_list, parallel_group=self.parallel_group) assert len(frames) == total_tile_size result_frames = [] # Blend the decoded tiles to create the final output for tile_index in tile_index_list: undot_tile_index = ParallelHelper.index_undot(tile_index, for_loop_size) f, i, j = undot_tile_index tile = frames[tile_index].clone() # Blend with previous tiles if applicable if f > 0: idx = ParallelHelper.index_dot([f - 1, i, j], for_loop_size) tile = torch.compile(self.blend_t, dynamic=False)(frames[idx], tile, blend_extent_t) if i > 0: idx = ParallelHelper.index_dot([f, i - 1, j], for_loop_size) tile = torch.compile(self.blend_v, dynamic=False)(frames[idx], tile, blend_extent_h) if j > 0: idx = ParallelHelper.index_dot([f, i, j - 1], for_loop_size) tile = torch.compile(self.blend_h, dynamic=False)(frames[idx], tile, blend_extent_w) result_frames.append(tile[:, :, :frame_limit, :height_limit, :width_limit]) # Gather and concatenate the final result frames result_frames = ParallelHelper.gather_frames(result_frames, global_tile_index_list, parallel_group=self.parallel_group) assert len(result_frames) == total_tile_size concat_frames = [] for f in range(length_tile_size): result_rows = [] for i in range(height_tile_size): result_row = [] for j in range(width_tile_size): idx = ParallelHelper.index_dot([f, i, j], for_loop_size) result_row.append(result_frames[idx]) result_rows.append(torch.cat(result_row, dim=4)) concat_frames.append(torch.cat(result_rows, dim=3)) # Concatenate all result frames along the temporal dimension result = torch.cat(concat_frames, dim=2) return result ``` ## /inference/model/dit/__init__.py ```py path="/inference/model/dit/__init__.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from .dit_model import get_dit __all__ = ["get_dit"] ``` ## /inference/model/dit/dit_model.py ```py path="/inference/model/dit/dit_model.py" # Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import gc import math import os from typing import Tuple import torch import torch.distributed import torch.nn as nn from einops import rearrange from inference.common import ( InferenceParams, MagiConfig, ModelMetaArgs, PackedCoreAttnParams, PackedCrossAttnParams, env_is_true, print_per_rank, print_rank_0, ) from inference.infra.checkpoint import load_checkpoint from inference.infra.distributed import parallel_state as mpu from inference.infra.parallelism import cp_post_process, cp_pre_process, pp_scheduler from .dit_module import CaptionEmbedder, FinalLinear, LearnableRotaryEmbeddingCat, TimestepEmbedder, TransformerBlock class VideoDiTModel(torch.nn.Module): """VideoDiT model for video diffusion. Args: config (MagiConfig): Transformer config pre_process (bool, optional): Include embedding layer (used with pipeline parallelism). Defaults to True. post_process (bool, optional): Include an output layer (used with pipeline parallelism). Defaults to True. """ def __init__(self, config: MagiConfig, pre_process: bool = True, post_process: bool = True) -> None: super().__init__() self.model_config = config.model_config self.runtime_config = config.runtime_config self.engine_config = config.engine_config self.pre_process = pre_process self.post_process = post_process self.in_channels = self.model_config.in_channels self.out_channels = self.model_config.out_channels self.patch_size = self.model_config.patch_size self.t_patch_size = self.model_config.t_patch_size self.caption_max_length = self.model_config.caption_max_length self.num_heads = self.model_config.num_attention_heads self.x_embedder = nn.Conv3d( self.model_config.in_channels, self.model_config.hidden_size, kernel_size=(self.model_config.t_patch_size, self.model_config.patch_size, self.model_config.patch_size), stride=(self.model_config.t_patch_size, self.model_config.patch_size, self.model_config.patch_size), bias=False, ) self.t_embedder = TimestepEmbedder(model_config=self.model_config) self.y_embedder = CaptionEmbedder(model_config=self.model_config) self.rope = LearnableRotaryEmbeddingCat( self.model_config.hidden_size // self.model_config.num_attention_heads, in_pixels=False ) # trm block self.videodit_blocks = TransformerBlock( model_config=self.model_config, engine_config=self.engine_config, pre_process=pre_process, post_process=post_process, ) self.final_linear = FinalLinear( self.model_config.hidden_size, self.model_config.patch_size, self.model_config.t_patch_size, self.out_channels ) def generate_kv_range_for_uncondition(self, uncond_x) -> torch.Tensor: device = f"cuda:{torch.cuda.current_device()}" B, C, T, H, W = uncond_x.shape chunk_token_nums = ( (T // self.model_config.t_patch_size) * (H // self.model_config.patch_size) * (W // self.model_config.patch_size) ) k_chunk_start = torch.linspace(0, (B - 1) * chunk_token_nums, steps=B).reshape((B, 1)) k_chunk_end = torch.linspace(chunk_token_nums, B * chunk_token_nums, steps=B).reshape((B, 1)) return torch.concat([k_chunk_start, k_chunk_end], dim=1).to(torch.int32).to(device) def unpatchify(self, x, H, W): return rearrange( x, "(T H W) N (pT pH pW C) -> N C (T pT) (H pH) (W 
pW)", H=H, W=W, pT=self.t_patch_size, pH=self.patch_size, pW=self.patch_size, ).contiguous() @torch.no_grad() def get_embedding_and_meta(self, x, t, y, caption_dropout_mask, xattn_mask, kv_range, **kwargs): """ Forward embedding and meta for VideoDiT. NOTE: This function should only handle single card behavior. Input: x: (N, C, T, H, W). torch.Tensor of spatial inputs (images or latent representations of images) t: (N, denoising_range_num). torch.Tensor of diffusion timesteps y: (N * denoising_range_num, 1, L, C). torch.Tensor of class labels caption_dropout_mask: (N). torch.Tensor of whether to drop caption xattn_mask: (N * denoising_range_num, 1, L). torch.Tensor of xattn mask kv_range: (N * denoising_range_num, 2). torch.Tensor of kv range Output: x: (S, N, D). torch.Tensor of inputs embedding (images or latent representations of images) condition: (N, denoising_range_num, D). torch.Tensor of condition embedding condition_map: (S, N). torch.Tensor determine which condition to use for each token rope: (S, 96). torch.Tensor of rope y_xattn_flat: (total_token, D). torch.Tensor of y_xattn_flat cuda_graph_inputs: (y_xattn_flat, xattn_mask) or None. None means no cuda graph NOTE: y_xattn_flat and xattn_mask with static shape H: int. Height of the input W: int. Width of the input ardf_meta: dict. Meta information for ardf cross_attn_params: PackedCrossAttnParams. Packed sequence parameters for cross_atten """ ################################### # Part1: Embed x # ################################### x = self.x_embedder(x) # [N, C, T, H, W] batch_size, _, T, H, W = x.shape # Prepare necessary variables range_num = kwargs["range_num"] denoising_range_num = kwargs["denoising_range_num"] slice_point = kwargs.get("slice_point", 0) frame_in_range = T // denoising_range_num prev_clean_T = frame_in_range * slice_point T_total = T + prev_clean_T ################################### # Part2: rope # ################################### # caculate rescale_factor for multi-resolution & multi aspect-ratio training # the base_size [16*16] is A predefined size based on data:(256x256) vae: (8,8,4) patch size: (1,1,2) # This definition do not have any relationship with the actual input/model/setting. # ref_feat_shape is used to calculate innner rescale factor, so it can be float. rescale_factor = math.sqrt((H * W) / (16 * 16)) rope = self.rope.get_embed(shape=[T_total, H, W], ref_feat_shape=[T_total, H / rescale_factor, W / rescale_factor]) # the shape of rope is (T*H*W, -1) aka (seq_length, head_dim), as T is the first dimension, we can directly cut it. 
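# For example (illustrative numbers): with H = W = 32, rescale_factor = sqrt((32 * 32) / (16 * 16)) = 2,
# so ref_feat_shape becomes [T_total, 16, 16] and the spatial frequencies match the 16x16 base grid.
# The slice below keeps only the last T * H * W rows, i.e. the positions of the T frames actually
# present in x; the prev_clean_T prefix positions are dropped.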
rope = rope[-(T * H * W) :] ################################### # Part3: Embed t # ################################### assert t.shape[0] == batch_size, f"Invalid t shape, got {t.shape[0]} != {batch_size}" # nolint assert t.shape[1] == denoising_range_num, f"Invalid t shape, got {t.shape[1]} != {denoising_range_num}" # nolint t_flat = t.flatten() # (N * denoising_range_num,) t = self.t_embedder(t_flat) # (N, D) if self.engine_config.distill: distill_dt_scalar = 2 if kwargs["num_steps"] == 12: base_chunk_step = 4 distill_dt_factor = base_chunk_step / kwargs["distill_interval"] * distill_dt_scalar else: distill_dt_factor = kwargs["num_steps"] / 4 * distill_dt_scalar distill_dt = torch.ones_like(t_flat) * distill_dt_factor distill_dt_embed = self.t_embedder(distill_dt) t = t + distill_dt_embed t = t.reshape(batch_size, denoising_range_num, -1) # (N, range_num, D) ###################################################### # Part4: Embed y, prepare condition and y_xattn_flat # ###################################################### # (N * denoising_range_num, 1, L, D) y_xattn, y_adaln = self.y_embedder(y, self.training, caption_dropout_mask) assert xattn_mask is not None xattn_mask = xattn_mask.squeeze(1).squeeze(1) # condition: (N, range_num, D) y_adaln = y_adaln.squeeze(1) # (N, D) condition = t + y_adaln.unsqueeze(1) assert condition.shape[0] == batch_size assert condition.shape[1] == denoising_range_num seqlen_per_chunk = (T * H * W) // denoising_range_num condition_map = torch.arange(batch_size * denoising_range_num, device=x.device) condition_map = torch.repeat_interleave(condition_map, seqlen_per_chunk) condition_map = condition_map.reshape(batch_size, -1).transpose(0, 1).contiguous() # y_xattn_flat: (total_token, D) y_xattn_flat = torch.masked_select(y_xattn.squeeze(1), xattn_mask.unsqueeze(-1).bool()).reshape(-1, y_xattn.shape[-1]) xattn_mask_for_cuda_graph = None ###################################################### # Part5: Prepare cross_attn_params for cross_atten # ###################################################### # (N * denoising_range_num, L) xattn_mask = xattn_mask.reshape(xattn_mask.shape[0], -1) y_index = torch.sum(xattn_mask, dim=-1) clip_token_nums = H * W * frame_in_range cu_seqlens_q = torch.Tensor([0] + ([clip_token_nums] * denoising_range_num * batch_size)).to(torch.int64).to(x.device) cu_seqlens_k = torch.cat([y_index.new_tensor([0]), y_index]).to(torch.int64).to(x.device) cu_seqlens_q = cu_seqlens_q.cumsum(-1).to(torch.int32) cu_seqlens_k = cu_seqlens_k.cumsum(-1).to(torch.int32) assert ( cu_seqlens_q.shape == cu_seqlens_k.shape ), f"cu_seqlens_q.shape: {cu_seqlens_q.shape}, cu_seqlens_k.shape: {cu_seqlens_k.shape}" xattn_q_ranges = torch.cat([cu_seqlens_q[:-1].unsqueeze(1), cu_seqlens_q[1:].unsqueeze(1)], dim=1) xattn_k_ranges = torch.cat([cu_seqlens_k[:-1].unsqueeze(1), cu_seqlens_k[1:].unsqueeze(1)], dim=1) assert ( xattn_q_ranges.shape == xattn_k_ranges.shape ), f"xattn_q_ranges.shape: {xattn_q_ranges.shape}, xattn_k_ranges.shape: {xattn_k_ranges.shape}" cross_attn_params = PackedCrossAttnParams( q_ranges=xattn_q_ranges, kv_ranges=xattn_k_ranges, cu_seqlens_q=cu_seqlens_q, cu_seqlens_kv=cu_seqlens_k, max_seqlen_q=clip_token_nums, max_seqlen_kv=self.caption_max_length, ) ################################################## # Part6: Prepare core_atten related q/kv range # ################################################## q_range = torch.cat([cu_seqlens_q[:-1].unsqueeze(1), cu_seqlens_q[1:].unsqueeze(1)], dim=1) flat_kv = torch.unique(kv_range, sorted=True) 
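# max_seqlen_k below is the span between the smallest and largest boundary in kv_range,
# an upper bound on how many key/value tokens any single query chunk can attend to.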
max_seqlen_k = (flat_kv[-1] - flat_kv[0]).cpu().item() ardf_meta = dict( clip_token_nums=clip_token_nums, slice_point=slice_point, range_num=range_num, denoising_range_num=denoising_range_num, q_range=q_range, k_range=kv_range, max_seqlen_q=clip_token_nums, max_seqlen_k=max_seqlen_k, ) return (x, condition, condition_map, rope, y_xattn_flat, xattn_mask_for_cuda_graph, H, W, ardf_meta, cross_attn_params) @torch.no_grad() def forward_pre_process( self, x, t, y, caption_dropout_mask=None, xattn_mask=None, kv_range=None, **kwargs ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, ModelMetaArgs]: assert kv_range is not None, "Please ensure kv_range is provided" x = x * self.model_config.x_rescale_factor if self.model_config.half_channel_vae: assert x.shape[1] == 16 x = torch.cat([x, x], dim=1) x = x.float() t = t.float() y = y.float() # embedder context will ensure that the processing is in high precision even if the embedder params is in bfloat16 mode with torch.autocast(device_type="cuda", dtype=torch.float32): ( x, condition, condition_map, rope, y_xattn_flat, xattn_mask_for_cuda_graph, H, W, ardf_meta, cross_attn_params, ) = self.get_embedding_and_meta(x, t, y, caption_dropout_mask, xattn_mask, kv_range, **kwargs) # Downcast x and rearrange x x = x.to(self.model_config.params_dtype) x = rearrange(x, "N C T H W -> (T H W) N C").contiguous() # (thw, N, D) # condition and y_xattn_flat will be downcast to bfloat16 in transformer block. condition = condition.to(self.model_config.params_dtype) y_xattn_flat = y_xattn_flat.to(self.model_config.params_dtype) core_attn_params = PackedCoreAttnParams( q_range=ardf_meta["q_range"], k_range=ardf_meta["k_range"], np_q_range=ardf_meta["q_range"].cpu().numpy(), np_k_range=ardf_meta["k_range"].cpu().numpy(), max_seqlen_q=ardf_meta["max_seqlen_q"], max_seqlen_k=ardf_meta["max_seqlen_k"], ) (x, condition_map, rope, cp_pad_size, cp_split_sizes, core_attn_params, cross_attn_params) = cp_pre_process( self.engine_config.cp_size, self.engine_config.cp_strategy, x, condition_map, rope, xattn_mask_for_cuda_graph, ardf_meta, core_attn_params, cross_attn_params, ) meta_args = ModelMetaArgs( H=H, W=W, cp_pad_size=cp_pad_size, cp_split_sizes=cp_split_sizes, slice_point=ardf_meta["slice_point"], denoising_range_num=ardf_meta["denoising_range_num"], range_num=ardf_meta["range_num"], extract_prefix_video_feature=kwargs.get("extract_prefix_video_feature", False), fwd_extra_1st_chunk=kwargs["fwd_extra_1st_chunk"], distill_nearly_clean_chunk=kwargs.get("distill_nearly_clean_chunk", False), clip_token_nums=ardf_meta["clip_token_nums"], enable_cuda_graph=xattn_mask_for_cuda_graph is not None, core_attn_params=core_attn_params, cross_attn_params=cross_attn_params, ) return (x, condition, condition_map, y_xattn_flat, rope, meta_args) @torch.no_grad() def forward_post_process(self, x, meta_args: ModelMetaArgs) -> torch.Tensor: x = x.float() # embedder context will ensure that the processing is in high precision even if the embedder params is in bfloat16 mode with torch.autocast(device_type="cuda", dtype=torch.float32): x = self.final_linear(x) # (thw/cp, N, patch_size ** 2 * out_channels) # leave context parallel region x = cp_post_process(self.engine_config.cp_size, self.engine_config.cp_strategy, x, meta_args) # N C T H W x = self.unpatchify(x, meta_args.H, meta_args.W) if self.model_config.half_channel_vae: assert x.shape[1] == 32 x = x[:, :16] x = x / self.model_config.x_rescale_factor return x @torch.no_grad() def forward( self, x, t, y, 
caption_dropout_mask=None, xattn_mask=None, kv_range=None, inference_params: InferenceParams = None, **kwargs, ) -> torch.Tensor: (x, condition, condition_map, y_xattn_flat, rope, meta_args) = self.forward_pre_process( x, t, y, caption_dropout_mask, xattn_mask, kv_range, **kwargs ) if not self.pre_process: x = pp_scheduler().recv_prev_data(x.shape, x.dtype) self.videodit_blocks.set_input_tensor(x) else: # clone a new tensor to ensure x is not a view of other tensor x = x.clone() x = self.videodit_blocks.forward( hidden_states=x, condition=condition, condition_map=condition_map, y_xattn_flat=y_xattn_flat, rotary_pos_emb=rope, inference_params=inference_params, meta_args=meta_args, ) if not self.post_process: pp_scheduler().isend_next(x) return self.forward_post_process(x, meta_args) def forward_3cfg( self, x, timestep, y, mask, kv_range, inference_params, **kwargs ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, int]: """ Forward pass of PixArt, but also batches the unconditional forward pass for classifier-free guidance. """ # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb assert x.shape[0] == 2 assert mask.shape[0] % 2 == 0 # mask should be a multiple of 2 x = torch.cat([x[0:1], x[0:1]], dim=0) caption_dropout_mask = torch.tensor([False, True], dtype=torch.bool, device=x.device) inference_params.update_kv_cache = False out_cond_pre_and_text = self.forward( x[0:1], timestep[0:1], y[0 : y.shape[0] // 2], caption_dropout_mask=caption_dropout_mask[0:1], xattn_mask=mask[0 : y.shape[0] // 2], kv_range=kv_range, inference_params=inference_params, **kwargs, ) inference_params.update_kv_cache = True out_cond_pre = self.forward( x[1:2], timestep[1:2], y[y.shape[0] // 2 : y.shape[0]], caption_dropout_mask=caption_dropout_mask[1:2], xattn_mask=mask[y.shape[0] // 2 : y.shape[0]], kv_range=kv_range, inference_params=inference_params, **kwargs, ) def chunk_to_batch(input, denoising_range_num): input = input.squeeze(0) input = input.reshape(-1, denoising_range_num, kwargs["chunk_width"], *input.shape[2:]) return input.transpose(0, 1) # (denoising_range_num, chn, chunk_width, h, w) def batch_to_chunk(input, denoising_range_num): input = input.transpose(0, 1) input = input.reshape(1, -1, denoising_range_num * kwargs["chunk_width"], *input.shape[3:]) return input class UnconditionGuard: def __init__(self, kwargs): self.kwargs = kwargs self.prev_state = { "range_num": kwargs["range_num"], "denoising_range_num": kwargs["denoising_range_num"], "slice_point": kwargs["slice_point"], "fwd_extra_1st_chunk": kwargs["fwd_extra_1st_chunk"], } def __enter__(self): if self.kwargs.get("fwd_extra_1st_chunk", False): self.kwargs["denoising_range_num"] -= 1 self.kwargs["slice_point"] += 1 self.kwargs["fwd_extra_1st_chunk"] = False def __exit__(self, exc_type, exc_val, exc_tb): self.kwargs["range_num"] = self.prev_state["range_num"] self.kwargs["denoising_range_num"] = self.prev_state["denoising_range_num"] self.kwargs["slice_point"] = self.prev_state["slice_point"] self.kwargs["fwd_extra_1st_chunk"] = self.prev_state["fwd_extra_1st_chunk"] with UnconditionGuard(kwargs): denoising_range_num = kwargs["denoising_range_num"] denoise_width = kwargs["chunk_width"] * denoising_range_num uncond_x = chunk_to_batch(x[0:1, :, -denoise_width:], denoising_range_num) timestep = timestep[0:1, -denoising_range_num:].transpose(0, 1) uncond_y = y[y.shape[0] // 2 : y.shape[0]][-denoising_range_num:] caption_dropout_mask = torch.tensor([True], dtype=torch.bool, device=x.device) uncond_mask = mask[y.shape[0] // 2 
: y.shape[0]][-denoising_range_num:] uncond_kv_range = self.generate_kv_range_for_uncondition(uncond_x) kwargs["range_num"] = 1 kwargs["denoising_range_num"] = 1 kwargs["slice_point"] = 0 out_uncond = self.forward( uncond_x, timestep, uncond_y, caption_dropout_mask=caption_dropout_mask, xattn_mask=uncond_mask, kv_range=uncond_kv_range, inference_params=None, **kwargs, ) out_uncond = batch_to_chunk(out_uncond, denoising_range_num) return out_cond_pre_and_text, out_cond_pre, out_uncond, denoise_width def get_cfg_scale(self, t, cfg_t_range, prev_chunk_scale_s, text_scale_s): indices = torch.searchsorted(cfg_t_range - 1e-7, t) - 1 assert indices.min() >= 0 and indices.max() < len(prev_chunk_scale_s) return prev_chunk_scale_s[indices], text_scale_s[indices] def forward_dispatcher(self, x, timestep, y, mask, kv_range, inference_params, **kwargs): if self.runtime_config.cfg_number == 3: (out_cond_pre_and_text, out_cond_pre, out_uncond, denoise_width) = self.forward_3cfg( x, timestep, y, mask, kv_range, inference_params, **kwargs ) prev_chunk_scale_s = torch.tensor(self.runtime_config.prev_chunk_scales).cuda() text_scale_s = torch.tensor(self.runtime_config.text_scales).cuda() cfg_t_range = torch.tensor(self.runtime_config.cfg_t_range).cuda() applied_cfg_range_num, chunk_width = (kwargs["denoising_range_num"], kwargs["chunk_width"]) if kwargs["fwd_extra_1st_chunk"]: applied_cfg_range_num -= 1 cfg_timestep = timestep[0, -applied_cfg_range_num:] assert len(prev_chunk_scale_s) == len(cfg_t_range), "prev_chunks_scale and t_range should have the same length" assert len(text_scale_s) == len(cfg_t_range), "text_scale and t_range should have the same length" cfg_output_list = [] for chunk_idx in range(applied_cfg_range_num): prev_chunk_scale, text_scale = self.get_cfg_scale( cfg_timestep[chunk_idx], cfg_t_range, prev_chunk_scale_s, text_scale_s ) l = chunk_idx * chunk_width r = (chunk_idx + 1) * chunk_width cfg_output = ( (1 - prev_chunk_scale) * out_uncond[:, :, l:r] + (prev_chunk_scale - text_scale) * out_cond_pre[:, :, -denoise_width:][:, :, l:r] + text_scale * out_cond_pre_and_text[:, :, -denoise_width:][:, :, l:r] ) cfg_output_list.append(cfg_output) cfg_output = torch.cat(cfg_output_list, dim=2) x = torch.cat([x[0:1, :, :-denoise_width], cfg_output], dim=2) x = torch.cat([x, x], dim=0) return x elif self.runtime_config.cfg_number == 1: assert x.shape[0] == 2 x = torch.cat([x[0:1], x[0:1]], dim=0) kwargs["caption_dropout_mask"] = torch.tensor([False], dtype=torch.bool, device=x.device) inference_params.update_kv_cache = True if kwargs.get("distill_nearly_clean_chunk", False): prev_chunks_scale = float(os.getenv("prev_chunks_scale", 0.7)) slice_start = 1 if kwargs["fwd_extra_1st_chunk"] else 0 cond_pre_and_text_channel = x.shape[2] new_x_chunk = x[0:1, :, slice_start * kwargs["chunk_width"] : (slice_start + 1) * kwargs["chunk_width"]] new_kvrange = self.generate_kv_range_for_uncondition(new_x_chunk) kwargs["denoising_range_num"] += 1 cat_x_chunk = torch.cat([x[0:1], new_x_chunk], dim=2) new_kvrange = new_kvrange + kv_range.max() cat_kvrange = torch.cat([kv_range, new_kvrange], dim=0) cat_t = torch.cat([timestep[0:1], timestep[0:1, slice_start : slice_start + 1]], dim=1) cat_y = torch.cat([y[0 : y.shape[0] // 2], y[slice_start : slice_start + 1]], dim=0) cat_xattn_mask = torch.cat([mask[0 : y.shape[0] // 2], mask[slice_start : slice_start + 1]], dim=0) cat_out = self.forward( cat_x_chunk, cat_t, cat_y, xattn_mask=cat_xattn_mask, kv_range=cat_kvrange, inference_params=inference_params, **kwargs, ) 
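# The chunk appended above is denoised again without prefix context (its kv_range is offset past
# the existing ranges, so it attends only to its own tokens). Below, the two predictions for that
# chunk are blended as prev_chunks_scale * out_with_prefix + (1 - prev_chunks_scale) * out_without_prefix
# and written back into cat_out.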
near_clean_out_cond_pre_and_text = cat_out[ :, :, slice_start * kwargs["chunk_width"] : (slice_start + 1) * kwargs["chunk_width"] ] near_clean_out_cond_text = cat_out[:, :, cond_pre_and_text_channel:] near_out_cond_pre_and_text = ( near_clean_out_cond_pre_and_text * prev_chunks_scale + near_clean_out_cond_text * (1 - prev_chunks_scale) ) cat_out[ :, :, slice_start * kwargs["chunk_width"] : (slice_start + 1) * kwargs["chunk_width"] ] = near_out_cond_pre_and_text out_cond_pre_and_text = cat_out[:, :, :cond_pre_and_text_channel] else: out_cond_pre_and_text = self.forward( x[0:1], timestep[0:1], y[0 : y.shape[0] // 2], xattn_mask=mask[0 : y.shape[0] // 2], kv_range=kv_range, inference_params=inference_params, **kwargs, ) denoise_width = kwargs["chunk_width"] * kwargs["denoising_range_num"] if kwargs["fwd_extra_1st_chunk"]: denoise_width -= kwargs["chunk_width"] x = torch.cat([x[0:1, :, :-denoise_width], out_cond_pre_and_text[:, :, -denoise_width:]], dim=2) x = torch.cat([x[0:1], x[0:1]], dim=0) return x else: raise NotImplementedError def _build_dit_model(config: MagiConfig): """Builds the model""" device = "cuda" if env_is_true("SKIP_LOAD_MODEL") else "meta" with torch.device(device): model = VideoDiTModel( config=config, pre_process=mpu.is_pipeline_first_stage(), post_process=mpu.is_pipeline_last_stage() ) print_rank_0(model) # Print number of parameters. param_count = sum([p.nelement() for p in model.parameters()]) model_size_gb = sum([p.nelement() * p.element_size() for p in model.parameters()]) / (1024**3) print_per_rank( f"(cp, pp) rank ({mpu.get_cp_rank()}, {mpu.get_pp_rank()}): param count {param_count}, model size {model_size_gb:.2f} GB".format( mpu.get_cp_rank(), mpu.get_pp_rank(), param_count, model_size_gb ) ) return model def _high_precision_promoter(module: VideoDiTModel): module.x_embedder.float() module.y_embedder.float() module.t_embedder.float() module.final_linear.float() module.rope.float() for name, sub_module in module.named_modules(): # skip qk_layernorm_xattn if "_xattn" in name: continue # high precision qk_layernorm by default if "q_layernorm" in name or "k_layernorm" in name: sub_module.float() if "self_attn_post_norm" in name or "mlp_post_norm" in name: sub_module.float() if "final_layernorm" in name: sub_module.float() return module def get_dit(config: MagiConfig): """Build and load VideoDiT model""" model = _build_dit_model(config) print_rank_0("Build DiTModel successfully") mem_allocated_gb = torch.cuda.memory_allocated() / 1024**3 mem_reserved_gb = torch.cuda.memory_reserved() / 1024**3 print_rank_0( f"After build_dit_model, memory allocated: {mem_allocated_gb:.2f} GB, memory reserved: {mem_reserved_gb:.2f} GB" ) # To avoid Error in debug mode, set default iteration to 0 if not env_is_true("SKIP_LOAD_MODEL"): model = load_checkpoint(model) mem_allocated_gb = torch.cuda.memory_allocated() / 1024**3 mem_reserved_gb = torch.cuda.memory_reserved() / 1024**3 print_rank_0( f"After load_checkpoint, memory allocated: {mem_allocated_gb:.2f} GB, memory reserved: {mem_reserved_gb:.2f} GB" ) model = _high_precision_promoter(model) mem_allocated_gb = torch.cuda.memory_allocated() / 1024**3 mem_reserved_gb = torch.cuda.memory_reserved() / 1024**3 print_rank_0( f"After high_precision_promoter, memory allocated: {mem_allocated_gb:.2f} GB, memory reserved: {mem_reserved_gb:.2f} GB" ) model.eval() gc.collect() torch.cuda.empty_cache() print_rank_0("Load checkpoint successfully") return model ``` ## /inference/model/dit/dit_module.py ```py path="/inference/model/dit/dit_module.py" 
# Copyright (c) 2025 SandAI. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import math import numbers from functools import partial from typing import Callable, List, Optional, Tuple import flashinfer import torch import torch.distributed import torch.nn as nn import triton import triton.language as tl from einops import rearrange from flash_attn import flash_attn_varlen_func from flash_attn.flash_attn_interface import flash_attn_func from flash_attn.layers.rotary import apply_rotary_emb as flash_apply_rotary_emb from flashinfer.gemm import bmm_fp8 from magi_attention.functional import flex_flash_attn_func as flex_attention # from dffa.functional import flex_flash_attn_func as flex_attention from torch import Tensor from torch.nn import Parameter from inference.common import EngineConfig, InferenceParams, ModelConfig, ModelMetaArgs, PackedCrossAttnParams, divide from inference.infra.distributed import parallel_state from inference.infra.parallelism import CSOHelper, UlyssesScheduler, cso_communication ########################################################## # TimestepEmbedder ########################################################## class TimestepEmbedder(nn.Module): """ Embeds scalar timesteps into vector representations. """ def __init__(self, model_config: ModelConfig, frequency_embedding_size=256): super().__init__() self.data_type = model_config.params_dtype hidden_size = model_config.hidden_size self.mlp = nn.Sequential( nn.Linear(frequency_embedding_size, int(hidden_size * model_config.cond_hidden_ratio), bias=True), nn.SiLU(), nn.Linear( int(hidden_size * model_config.cond_hidden_ratio), int(hidden_size * model_config.cond_hidden_ratio), bias=True ), ) self.frequency_embedding_size = frequency_embedding_size # rescale the timestep for the general transport model self.timestep_rescale_factor = 1000 @staticmethod def timestep_embedding(t, dim, max_period=10000, timestep_rescale_factor=1): """ Create sinusoidal timestep embeddings. :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional. :param dim: the dimension of the output. :param max_period: controls the minimum frequency of the embeddings. :return: an (N, D) Tensor of positional embeddings. 
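Illustrative example: with the static method's default arguments, timestep_embedding(torch.tensor([0.0]), dim=4) returns [[1., 1., 0., 0.]], i.e. the cosine terms for both frequencies followed by the sine terms (forward() additionally applies timestep_rescale_factor=1000 to t).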
""" # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py half = dim // 2 freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to( device=t.device ) args = t[:, None].float() * freqs[None] * timestep_rescale_factor embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) if dim % 2: embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) return embedding def forward(self, t): t = t.to(torch.float32) t_freq = self.timestep_embedding( t, self.frequency_embedding_size, timestep_rescale_factor=self.timestep_rescale_factor ) t_emb = self.mlp(t_freq.to(self.data_type)) return t_emb ########################################################## # CaptionEmbedder ########################################################## class CaptionEmbedder(nn.Module): """ Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance. """ def __init__(self, model_config: ModelConfig): super().__init__() in_channels = model_config.caption_channels hidden_size = model_config.hidden_size caption_max_length = model_config.caption_max_length self.y_proj_xattn = nn.Sequential( nn.Linear(in_channels, int(hidden_size * model_config.xattn_cond_hidden_ratio), bias=True), nn.SiLU() ) self.y_proj_adaln = nn.Sequential(nn.Linear(in_channels, int(hidden_size * model_config.cond_hidden_ratio), bias=True)) self.null_caption_embedding = Parameter(torch.empty(caption_max_length, in_channels)) def caption_drop(self, caption, caption_dropout_mask): """ Drops labels to enable classifier-free guidance. caption.shape = (N, 1, cap_len, C) """ dropped_caption = torch.where( caption_dropout_mask[:, None, None, None], # (N, 1, 1, 1) self.null_caption_embedding[None, None, :], # (1, 1, cap_len, C) caption, # (N, 1, cap_len, C) ) return dropped_caption def caption_drop_single_token(self, caption_dropout_mask): dropped_caption = torch.where( caption_dropout_mask[:, None, None], # (N, 1, 1) self.null_caption_embedding[None, -1, :], # (1, 1, C) self.null_caption_embedding[None, -2, :], # (1, 1, C) ) return dropped_caption # (N, 1, C) def forward(self, caption, train, caption_dropout_mask=None): if train and caption_dropout_mask is not None: caption = self.caption_drop(caption, caption_dropout_mask) caption_xattn = self.y_proj_xattn(caption) if caption_dropout_mask is not None: caption = self.caption_drop_single_token(caption_dropout_mask) caption_adaln = self.y_proj_adaln(caption) return caption_xattn, caption_adaln ########################################################## # FinalLinear ########################################################## class FinalLinear(nn.Module): """ The final linear layer of DiT. 
""" def __init__(self, hidden_size, patch_size, t_patch_size, out_channels): super().__init__() self.linear = nn.Linear(hidden_size, patch_size * patch_size * t_patch_size * out_channels, bias=False) def forward(self, x): x = self.linear(x) return x ########################################################## # AdaModulateLayer ########################################################## class AdaModulateLayer(torch.nn.Module): def __init__(self, model_config: ModelConfig): super().__init__() self.model_config = model_config self.gate_num_chunks = 2 self.act = nn.SiLU() self.proj = nn.Sequential( nn.Linear( int(self.model_config.hidden_size * self.model_config.cond_hidden_ratio), int(self.model_config.hidden_size * self.model_config.cond_gating_ratio * self.gate_num_chunks), bias=True, dtype=self.model_config.params_dtype, ) ) def forward(self, c): c = self.act(c) return self.proj(c) ########################################################## # bias_modulate_add ########################################################## @triton.jit def range_mod_kernel_fwd( X, # pointer to the input MAP, # map x index to gating index GATINGS, # pointer to the gatings Y, # pointer to the output M, # number of rows in X, unused N, # number of columns in X stride_xm, # how much to increase the pointer when moving by 1 row in X stride_xn, # how much to increase the pointer when moving by 1 column in X stride_gm, # how much to increase the pointer when moving by 1 row in GATINGS stride_gn, # how much to increase the pointer when moving by 1 column in GATINGS stride_ym, # how much to increase the pointer when moving by 1 row in Y stride_yn, # how much to increase the pointer when moving by 1 column in Y BLOCK_SIZE: tl.constexpr, # number of columns in a block ): # Map the program id to the row of X and Y it should compute. row = tl.program_id(0) cur_X = X + row * stride_xm x_cols = tl.arange(0, BLOCK_SIZE) * stride_xn x_mask = x_cols < N * stride_xn x = tl.load(cur_X + x_cols, mask=x_mask, other=0.0) cur_MAP = MAP + row gating_index = tl.load(cur_MAP) cur_GATING = GATINGS + gating_index * stride_gm gating_cols = tl.arange(0, BLOCK_SIZE) * stride_gn gating_mask = gating_cols < N * stride_gn gating = tl.load(cur_GATING + gating_cols, mask=gating_mask, other=0.0) cur_Y = Y + row * stride_ym y_cols = tl.arange(0, BLOCK_SIZE) * stride_yn y_mask = y_cols < N * stride_yn tl.store(cur_Y + y_cols, x * gating, mask=y_mask) def range_mod_triton(x, c_mapping, gatings): """ Inputs: x: (s, b, h). Tensor of inputs embedding (images or latent representations of images) c_mapping: (s, b). Tensor of condition map gatings: (b, denoising_range_num, h). 
Tensor of condition embedding """ assert x.is_cuda, "x is not on cuda" assert c_mapping.is_cuda, "c_mapping is not on cuda" assert gatings.is_cuda, "gatings is not on cuda" # TODO: use 3D tensor for x, c_mapping, and gatings s, b, h = x.shape x = x.transpose(0, 1).flatten(0, 1) c_mapping = c_mapping.transpose(0, 1).flatten(0, 1) gatings = gatings.flatten(0, 1) assert x.dim() == 2, f"x must be a 2D tensor but got {x.dim()}D" assert c_mapping.dim() == 1, f"c_mapping must be a 1D tensor but got {c_mapping.dim()}D" assert gatings.dim() == 2, f"gatings must be a 2D tensor but got {gatings.dim()}D" M, N = x.shape assert c_mapping.size(0) == M, "c_mapping must have the same number of rows as x" # Less than 64KB per feature: enqueue fused kernel MAX_FUSED_SIZE = 65536 // x.element_size() BLOCK_SIZE = min(MAX_FUSED_SIZE, triton.next_power_of_2(N)) if N > BLOCK_SIZE: raise RuntimeError("range_mod_triton doesn't support feature dim >= 64KB.") MAP = c_mapping y = torch.empty_like(x) range_mod_kernel_fwd[(M,)]( x, MAP, gatings, y, M, N, x.stride(0), x.stride(1), gatings.stride(0), gatings.stride(1), y.stride(0), y.stride(1), BLOCK_SIZE=BLOCK_SIZE, ) y = y.reshape(b, s, h).transpose(0, 1) return y def bias_modulate_add( x: torch.Tensor, residual: torch.Tensor, condition_map: torch.Tensor, gate: torch.Tensor, post_norm: torch.nn.Module ): assert gate.shape[-1] == x.shape[-1] original_dtype = x.dtype x = x.float() residual = residual.float() gate = gate.float() x = range_mod_triton(x, condition_map, gate) x = post_norm(x) x = x + residual x = x.to(original_dtype) return x ########################################################## # FusedLayerNorm ########################################################## def make_viewless_tensor(inp, requires_grad): # return tensor as-is, if not a 'view' if inp._base is None: return inp out = torch.empty((1,), dtype=inp.dtype, device=inp.device, requires_grad=requires_grad) out.data = inp.data return out class FusedLayerNorm(torch.nn.Module): """ Layer Norm, fused into a single CUDA kernel. Borrow from: https://github.com/NVIDIA/Megatron-LM/blob/6501752396e9cc360ce894cda4b2217a58c1c09d/megatron/core/fusions/fused_layer_norm.py#L30 Args: hidden_size (int): Transformer hidden dimension. eps (float): Epsilon added to denominator, for numerical stability. zero_centered_gamma (bool): Adjust LayerNorm weights such that they are centered around zero. This improves numerical stability. model_config (ModelConfig): Transformer config. Include to match custom layer norm interfaces. normalization (str): Normalization type, used for Transformer Engine. Must equal 'LayerNorm' here. 
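Note: when zero_centered_gamma is enabled (model_config.apply_layernorm_1p), the stored weight holds gamma - 1 and forward() adds 1 back before calling torch.nn.functional.layer_norm.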
""" def __init__(self, model_config: ModelConfig, hidden_size: int): super().__init__() self.zero_centered_gamma = model_config.apply_layernorm_1p if isinstance(hidden_size, numbers.Integral): hidden_size = (hidden_size,) self.hidden_size = torch.Size(hidden_size) self.eps = model_config.layernorm_epsilon self.weight = Parameter(torch.empty(*hidden_size, dtype=model_config.params_dtype)) self.bias = Parameter(torch.empty(*hidden_size, dtype=model_config.params_dtype)) def forward(self, input: Tensor) -> Tensor: weight = self.weight + 1 if self.zero_centered_gamma else self.weight return torch.nn.functional.layer_norm(input, self.hidden_size, weight, self.bias, self.eps) def softcap(x: torch.Tensor, cap: int): return (cap * torch.tanh(x.float() / cap)).to(x.dtype) def div_clamp_to(x: torch.Tensor, scale: torch.Tensor): fp8_min = torch.finfo(torch.float8_e4m3fn).min fp8_max = torch.finfo(torch.float8_e4m3fn).max prefix_shape = x.shape[:-1] last_shape = x.shape[-1] x = x.flatten().reshape(-1, last_shape) # Split x into 256 MB parts to avoid big memory peak part_size = 256 * 1024 * 1024 // last_shape part_num = (x.shape[0] + part_size - 1) // part_size return ( torch.cat( [ torch.clamp(x[i * part_size : (i + 1) * part_size].float() / scale.float(), fp8_min, fp8_max).bfloat16() for i in range(part_num) ], dim=0, ) .to(torch.float8_e4m3fn) .reshape(*prefix_shape, last_shape) .contiguous() ) ########################################################## # CustomLayerNormLinear ########################################################## class CustomLayerNormLinear(torch.nn.Module): def __init__( self, input_size: int, output_size_q: int, output_size_kv: int, layer_number: int, model_config: ModelConfig, engine_config: EngineConfig, ): super().__init__() self.layer_norm = torch.nn.LayerNorm(input_size, eps=model_config.layernorm_epsilon, dtype=model_config.params_dtype) self.layer_number = layer_number layers = {"q": output_size_q, "qx": output_size_q, "k": output_size_kv, "v": output_size_kv} for name, output_size in layers.items(): if not engine_config.fp8_quant or self.layer_number == 0 or self.layer_number == model_config.num_layers - 1: setattr(self, name, torch.nn.Linear(input_size, output_size, bias=False, dtype=model_config.params_dtype)) else: setattr(self, name, PerTensorQuantizedFp8Linear(input_size, output_size)) def forward_ln(self, hidden_states): return self.layer_norm(hidden_states) def forward_q(self, hidden_states): return self.q(hidden_states) def forward_qx(self, hidden_states): return self.qx(hidden_states) def forward_k(self, hidden_states): return self.k(hidden_states) def forward_v(self, hidden_states): return self.v(hidden_states) ########################################################## # PerTensorQuantizedFp8Linear ########################################################## class PerTensorQuantizedFp8Linear(torch.nn.Module): # The bias and device parameter is not used; it is included for compatibility with Linear's parameters. 
def __init__(self, in_features: int, out_features: int, bias=False, dtype=torch.bfloat16, device=None) -> None: super().__init__() self.in_features = in_features self.out_features = out_features self.finfo = torch.finfo(torch.float8_e4m3fn) self.output_dtype = dtype self.weight = Parameter(torch.empty((1, out_features, in_features), dtype=torch.float8_e4m3fn)) self.weight_scale = Parameter(torch.empty(1, dtype=torch.float32)) self.input_scale = Parameter(torch.empty(in_features, dtype=torch.float32)) def forward(self, input: torch.Tensor): input = div_clamp_to(input, self.input_scale) prefix_shape = input.shape[:-1] # column major weight return bmm_fp8( input.reshape(1, -1, self.in_features), self.weight.transpose(-2, -1), self.input_scale, self.weight_scale, dtype=self.output_dtype, ).reshape(prefix_shape + (self.out_features,)) ########################################################## # PerChannelQuantizedFp8Linear ########################################################## class PerChannelQuantizedFp8Linear(torch.nn.Module): # The bias and device parameter is not used; it is included for compatibility with Linear's parameters. def __init__(self, in_features: int, out_features: int, bias=False, dtype=torch.bfloat16, device=None) -> None: super().__init__() self.in_features = in_features self.out_features = out_features self.output_dtype = dtype self.finfo = torch.finfo(torch.float8_e4m3fn) self.weight = Parameter(torch.empty((1, out_features, in_features), dtype=torch.float8_e4m3fn)) self.weight_scale = Parameter(torch.empty(1, dtype=torch.float32)) self.input_scale = Parameter(torch.empty(1, dtype=torch.float32)) self.smooth_scale = Parameter(torch.empty(1, in_features, dtype=torch.float32)) def forward(self, x): x = div_clamp_to(x, self.smooth_scale.to(torch.float32)) prefix_shape = x.shape[:-1] return bmm_fp8( x.reshape(1, -1, self.in_features), self.weight.transpose(-2, -1), self.input_scale, self.weight_scale, dtype=self.output_dtype, ).reshape(prefix_shape + (self.out_features,)) ########################################################## # CustomMLP ########################################################## class CustomMLP(torch.nn.Module): """ CustomMLP will take the input with h hidden state, project it to 4*h hidden dimension, perform nonlinear transformation, and project the state back into h hidden dimension. Returns an output and a bias to be added to the output. 
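(Here both linear_fc1 and linear_fc2 are created with bias=False, so forward() returns only the transformed hidden states.)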
We use the following notation: h: hidden size p: number of tensor model parallel partitions b: batch size s: sequence length """ def __init__(self, model_config: ModelConfig, engine_config: EngineConfig, layer_number: int, input_size: int = None): super().__init__() self.model_config: ModelConfig = model_config self.engine_config: EngineConfig = engine_config self.layer_number = layer_number self.input_size = input_size if input_size != None else self.model_config.hidden_size self.layer_norm = torch.nn.LayerNorm( self.input_size, eps=self.model_config.layernorm_epsilon, dtype=self.model_config.params_dtype ) submodules_linear_fc1 = torch.nn.Linear if self.engine_config.fp8_quant and self.layer_number != 0 and self.layer_number != model_config.num_layers - 1: submodules_linear_fc1 = PerTensorQuantizedFp8Linear if self.model_config.gated_linear_unit: self.linear_fc1 = submodules_linear_fc1( self.input_size, 2 * self.model_config.ffn_hidden_size, bias=False, dtype=self.model_config.params_dtype ) else: self.linear_fc1 = submodules_linear_fc1( self.input_size, self.model_config.ffn_hidden_size, bias=False, dtype=self.model_config.params_dtype ) submodules_linear_fc2 = torch.nn.Linear if engine_config.fp8_quant and self.layer_number != 0 and self.layer_number != model_config.num_layers - 1: submodules_linear_fc2 = PerChannelQuantizedFp8Linear self.linear_fc2 = submodules_linear_fc2( self.model_config.ffn_hidden_size, self.model_config.hidden_size, bias=False, dtype=self.model_config.params_dtype ) def forward(self, hidden_states): hidden_states = self.layer_norm(hidden_states) hidden_states = self.linear_fc1(hidden_states) if self.model_config.gated_linear_unit: hidden_states = flashinfer.activation.silu_and_mul(hidden_states) else: hidden_states = torch.nn.functional.gelu(hidden_states) hidden_states = self.linear_fc2(hidden_states) return hidden_states ########################################################## # LearnableRotaryEmbeddingCat ########################################################## def ndgrid(*tensors) -> Tuple[torch.Tensor, ...]: """generate N-D grid in dimension order. The ndgrid function is like meshgrid except that the order of the first two input arguments are switched. That is, the statement [X1,X2,X3] = ndgrid(x1,x2,x3) produces the same result as [X2,X1,X3] = meshgrid(x2,x1,x3) This naming is based on MATLAB, the purpose is to avoid confusion due to torch's change to make torch.meshgrid behaviour move from matching ndgrid ('ij') indexing to numpy meshgrid defaults of ('xy'). 
""" try: return torch.meshgrid(*tensors, indexing="ij") except TypeError: # old PyTorch < 1.10 will follow this path as it does not have indexing arg, # the old behaviour of meshgrid was 'ij' return torch.meshgrid(*tensors) def pixel_freq_bands( num_bands: int, max_freq: float = 224.0, linear_bands: bool = True, device: Optional[torch.device] = None ): if linear_bands: bands = torch.linspace(1.0, max_freq / 2, num_bands, dtype=torch.float32, device=device) else: bands = 2 ** torch.linspace(0, math.log(max_freq, 2) - 1, num_bands, dtype=torch.float32, device=device) return bands * torch.pi def freq_bands( num_bands: int, temperature: float = 10000.0, step: int = 2, device: Optional[torch.device] = None ) -> torch.Tensor: exp = torch.arange(0, num_bands, step, dtype=torch.int64, device=device).to(torch.float32) / num_bands bands = 1.0 / (temperature**exp) return bands def build_fourier_pos_embed( feat_shape: List[int], bands: Optional[torch.Tensor] = None, num_bands: int = 64, max_res: int = 224, temperature: float = 10000.0, linear_bands: bool = False, include_grid: bool = False, in_pixels: bool = True, ref_feat_shape: Optional[List[int]] = None, dtype: torch.dtype = torch.float32, device: Optional[torch.device] = None, ) -> List[torch.Tensor]: """ Args: feat_shape: Feature shape for embedding. bands: Pre-calculated frequency bands. num_bands: Number of frequency bands (determines output dim). max_res: Maximum resolution for pixel based freq. temperature: Temperature for non-pixel freq. linear_bands: Linear band spacing for pixel based freq. include_grid: Include the spatial grid in output. in_pixels: Output in pixel freq. ref_feat_shape: Reference feature shape for resize / fine-tune. dtype: Output dtype. device: Output device. Returns: """ if bands is None: if in_pixels: bands = pixel_freq_bands(num_bands, float(max_res), linear_bands=linear_bands, device=device) else: bands = freq_bands(num_bands, temperature=temperature, step=1, device=device) else: if device is None: device = bands.device if dtype is None: dtype = bands.dtype if in_pixels: t = [torch.linspace(-1.0, 1.0, steps=s, device=device, dtype=torch.float32) for s in feat_shape] else: t = [torch.arange(s, device=device, dtype=torch.int64).to(torch.float32) for s in feat_shape] # align spatial center (H/2,W/2) to (0,0) t[1] = t[1] - (feat_shape[1] - 1) / 2 t[2] = t[2] - (feat_shape[2] - 1) / 2 if ref_feat_shape is not None: # eva's scheme for resizing rope embeddings (ref shape = pretrain) # aligning to the endpoint e.g [0,1,2] -> [0, 0.4, 0.8, 1.2, 1.6, 2] t_rescaled = [] for x, f, r in zip(t, feat_shape, ref_feat_shape): # deal with image input if f == 1: assert r == 1, "ref_feat_shape must be 1 when feat_shape is 1" t_rescaled.append(x) else: t_rescaled.append(x / (f - 1) * (r - 1)) t = t_rescaled grid = torch.stack(ndgrid(t), dim=-1) grid = grid.unsqueeze(-1) pos = grid * bands pos_sin, pos_cos = pos.sin().to(dtype=dtype), pos.cos().to(dtype) out = [grid, pos_sin, pos_cos] if include_grid else [pos_sin, pos_cos] return out def build_rotary_pos_embed( feat_shape: List[int], bands: Optional[torch.Tensor] = None, dim: int = 64, max_res: int = 224, temperature: float = 10000.0, linear_bands: bool = False, in_pixels: bool = True, ref_feat_shape: Optional[List[int]] = None, dtype: torch.dtype = torch.float32, device: Optional[torch.device] = None, ): """ Args: feat_shape: Spatial shape of the target tensor for embedding. bands: Optional pre-generated frequency bands dim: Output dimension of embedding tensor. 
        max_res: Maximum resolution for pixel mode.
        temperature: Temperature (inv freq) for non-pixel mode.
        linear_bands: Linearly (instead of log) spaced bands for pixel mode.
        in_pixels: Pixel vs language (inv freq) mode.
        dtype: Output dtype.
        device: Output device.

    Returns:
        Tuple of (sin_emb, cos_emb) tensors.
    """
    sin_emb, cos_emb = build_fourier_pos_embed(
        feat_shape,
        bands=bands,
        num_bands=dim // 8,
        max_res=max_res,
        temperature=temperature,
        linear_bands=linear_bands,
        in_pixels=in_pixels,
        ref_feat_shape=ref_feat_shape,
        device=device,
        dtype=dtype,
    )
    num_spatial_dim = 1
    # this would be much nicer as a .numel() call to torch.Size(), but torchscript sucks
    for x in feat_shape:
        num_spatial_dim *= x
    sin_emb = sin_emb.reshape(num_spatial_dim, -1)
    cos_emb = cos_emb.reshape(num_spatial_dim, -1)
    return sin_emb, cos_emb


class LearnableRotaryEmbeddingCat(nn.Module):
    """Rotary position embedding w/ concatenated sin & cos

    The following impl/resources were referenced for this impl:
    * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py
    * https://blog.eleuther.ai/rotary-embeddings/
    """

    def __init__(
        self,
        dim,
        max_res=224,
        temperature=10000,
        in_pixels=True,
        linear_bands: bool = False,
        feat_shape: Optional[List[int]] = None,
        ref_feat_shape: Optional[List[int]] = None,
    ):
        super().__init__()
        self.dim = dim
        self.max_res = max_res
        self.temperature = temperature
        self.in_pixels = in_pixels
        self.linear_bands = linear_bands
        self.feat_shape = feat_shape
        self.ref_feat_shape = ref_feat_shape
        self.bands = nn.Parameter(self.get_default_bands())

    def get_default_bands(self):
        if self.in_pixels:
            bands = pixel_freq_bands(
                self.dim // 8, float(self.max_res), linear_bands=self.linear_bands, device=torch.cuda.current_device()
            )
        else:
            bands = freq_bands(self.dim // 8, temperature=self.temperature, step=1, device=torch.cuda.current_device())
        return bands

    def get_embed(self, shape: Optional[List[int]], ref_feat_shape: Optional[List[int]] = None):
        # rebuild bands and embeddings every call, use if target shape changes
        embeds = build_rotary_pos_embed(
            feat_shape=shape,
            bands=self.bands,  # use learned bands
            dim=self.dim,
            max_res=self.max_res,
            linear_bands=self.linear_bands,
            in_pixels=self.in_pixels,
            ref_feat_shape=ref_feat_shape if ref_feat_shape else self.ref_feat_shape,
            temperature=self.temperature,
            device=torch.cuda.current_device(),
        )
        return torch.cat(embeds, -1)


##########################################################
# Attention
##########################################################


class Attention(torch.nn.Module):
    """
    Attention layer abstract class.
    """

    def __init__(self, model_config: ModelConfig, engine_config: EngineConfig, layer_number: int):
        super().__init__()
        self.model_config: ModelConfig = model_config
        self.engine_config: EngineConfig = engine_config
        self.layer_number = layer_number

        self.hidden_size_per_attention_head = self.model_config.kv_channels
        # num_query_groups and num_attention_heads are different for GQA
        self.query_projection_size = self.model_config.kv_channels * self.model_config.num_attention_heads
        self.kv_projection_size = self.model_config.kv_channels * self.model_config.num_query_groups

        # Per attention head and per partition values.
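        # Example with hypothetical numbers: num_query_groups = 8 and a TP*CP world size of 32 hits the
        # first branch (32 > 8 and 32 % 8 == 0), so each rank keeps a single KV group; a world size of 4
        # takes the second branch and keeps 8 / 4 = 2 groups per rank.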
world_size = parallel_state.get_tp_world_size(with_context_parallel=True) if world_size > self.model_config.num_query_groups and world_size % self.model_config.num_query_groups == 0: self.num_query_groups_per_partition = 1 else: self.num_query_groups_per_partition = divide(self.model_config.num_query_groups, world_size) def _allocate_key_and_value_memory(self, sequence_length, batch_size, dtype): """Allocate memory to store kv cache during inference.""" if self.engine_config.kv_offload: return torch.empty( sequence_length * batch_size, self.num_query_groups_per_partition, self.hidden_size_per_attention_head * 2, dtype=dtype, device=torch.cpu.current_device(), pin_memory=True, ) else: return torch.empty( sequence_length * batch_size, self.num_query_groups_per_partition, self.hidden_size_per_attention_head * 2, dtype=dtype, device=torch.cuda.current_device(), ) ########################################################## # FullyParallelAttention ########################################################## def split_tensor_along_last_dim( tensor: torch.Tensor, num_partitions: int, contiguous_split_chunks: bool = False ) -> List[torch.Tensor]: """Split a tensor along its last dimension. Args: tensor: input tensor. num_partitions: number of partitions to split the tensor contiguous_split_chunks: If True, make each chunk contiguous in memory. Returns: A list of Tensors """ # Get the size and dimension. last_dim = tensor.dim() - 1 last_dim_size = divide(tensor.size()[last_dim], num_partitions) # Split. tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) # Note: torch.split does not create contiguous tensors by default. if contiguous_split_chunks: return tuple(chunk.contiguous() for chunk in tensor_list) return tensor_list class FullyParallelAttention(Attention): def __init__(self, model_config: ModelConfig, engine_config: EngineConfig, layer_number: int): super().__init__(model_config=model_config, engine_config=engine_config, layer_number=layer_number) # output 2x query, one for self-attn, one for cross-attn with condition self.linear_qkv = CustomLayerNormLinear( input_size=self.model_config.hidden_size, output_size_q=self.query_projection_size, output_size_kv=self.kv_projection_size, layer_number=self.layer_number, model_config=self.model_config, engine_config=self.engine_config, ) # kv from condition, e.g., caption self.linear_kv_xattn = torch.nn.Linear( int(self.model_config.hidden_size * self.model_config.xattn_cond_hidden_ratio), # 6144 2 * self.kv_projection_size, # 2048 dtype=self.model_config.params_dtype, bias=False, ) # Output. 
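        # As in CustomLayerNormLinear above, FP8 is only applied to the "middle" layers: the first
        # (layer_number == 0) and last (num_layers - 1) layers keep the bf16 linear projection,
        # a common precaution for the layers most sensitive to quantization error.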
self.adapt_linear_quant = ( self.engine_config.fp8_quant and self.layer_number != 0 and self.layer_number != model_config.num_layers - 1 ) submodules_linear_proj = PerChannelQuantizedFp8Linear if self.adapt_linear_quant else torch.nn.Linear self.linear_proj = submodules_linear_proj( 2 * self.query_projection_size, self.model_config.hidden_size, dtype=self.model_config.params_dtype, bias=False ) self.q_layernorm = FusedLayerNorm(model_config=self.model_config, hidden_size=self.hidden_size_per_attention_head) self.q_layernorm_xattn = FusedLayerNorm( model_config=self.model_config, hidden_size=self.hidden_size_per_attention_head ) self.k_layernorm = FusedLayerNorm(model_config=self.model_config, hidden_size=self.hidden_size_per_attention_head) self.k_layernorm_xattn = FusedLayerNorm( model_config=self.model_config, hidden_size=self.hidden_size_per_attention_head ) def _full_adjust_key_and_value( self, inference_params: InferenceParams, key_and_value: torch.Tensor, meta_args: ModelMetaArgs ): """ Saves the generated key and value tensors to the end of the buffers in inference_params. Returns the full size keys and values from the provided inference_params Returns a tuple: (key, value) """ # ================================================= # Pre-allocate memory for key-values for inference. # ================================================= inf_max_seq_length = inference_params.max_sequence_length inf_max_batch_size = inference_params.max_batch_size if self.layer_number not in inference_params.key_value_memory_dict: inference_key_and_value_memory = self._allocate_key_and_value_memory( inf_max_seq_length, inf_max_batch_size, key_and_value.dtype ) inference_params.key_value_memory_dict[self.layer_number] = inference_key_and_value_memory else: # Get the pre-allocated buffers for this layer inference_key_and_value_memory = inference_params.key_value_memory_dict[self.layer_number] sequence_start = meta_args.slice_point * meta_args.clip_token_nums * inf_max_batch_size get_key_and_value = inference_key_and_value_memory[:sequence_start, ...].cuda() # Copy key and values. if inference_params.update_kv_cache: key_and_value_total = key_and_value clip_size = ( key_and_value_total.size(0) - meta_args.clip_token_nums * inf_max_batch_size if meta_args.distill_nearly_clean_chunk else key_and_value_total.size(0) ) sequence_end = sequence_start + clip_size assert sequence_end <= inference_key_and_value_memory.size(0) # update kv cache inference_key_and_value_memory[sequence_start:sequence_end, ...] = key_and_value_total[:clip_size] return torch.cat([get_key_and_value, key_and_value], dim=0) def adjust_key_and_value_for_inference( self, key_and_value: torch.Tensor, inference_params: InferenceParams, meta_args: ModelMetaArgs ): if inference_params is None: return torch.chunk(key_and_value, 2, dim=-1) # Only update kvcache when necessary, include 3 conditions: # 1. extract prefix video clean feature # 2. the first chunk of current kv is clean, we need to save their feature # 3. 
previous chunk is clean and we need to save/load their feature if meta_args.extract_prefix_video_feature or meta_args.fwd_extra_1st_chunk or meta_args.slice_point > 0: key_and_value = self._full_adjust_key_and_value(inference_params, key_and_value, meta_args) key, value = torch.chunk(key_and_value, 2, dim=-1) return key.contiguous(), value.contiguous() # ===================== # Get Query for core attn # [sq, b, (hn hd)] -> [(sq b), hn, hd] # ===================== def get_q(self, mixed_qqkv: torch.Tensor, cos_emb: torch.Tensor, sin_emb: torch.Tensor): query = self.linear_qkv.forward_q(mixed_qqkv) query = query.reshape(query.size(0), query.size(1), -1, self.hidden_size_per_attention_head) assert self.q_layernorm is not None original_dtype = query.dtype query = query.float() query = self.q_layernorm(query) query = query.transpose(0, 1).contiguous() query = flash_apply_rotary_emb(query, cos_emb, sin_emb) query = query.to(original_dtype) return rearrange(query, "b sq hn hd -> (sq b) hn hd").contiguous() # ===================== # Get Key for core attn # [sq, b, (hn hd)] -> [(sq b), hn, hd] # ===================== def get_k(self, mixed_qqkv: torch.Tensor, cos_emb: torch.Tensor, sin_emb: torch.Tensor): key = self.linear_qkv.forward_k(mixed_qqkv) key = key.reshape(key.size(0), key.size(1), -1, self.hidden_size_per_attention_head) assert self.k_layernorm is not None original_dtype = key.dtype key = key.float() key = self.k_layernorm(key) key = key.transpose(0, 1).contiguous() key = flash_apply_rotary_emb(key, cos_emb, sin_emb) key = key.to(original_dtype) return rearrange(key, "b sq hn hd -> (sq b) hn hd").contiguous() # ===================== # Get Value for core attn # [sq, b, (hn hd)] -> [(sq b), hn, hd] # ===================== def get_v(self, mixed_qqkv: torch.Tensor): value = self.linear_qkv.forward_v(mixed_qqkv) return rearrange(value, "sq b (hn hd) -> (sq b) hn hd", hd=self.hidden_size_per_attention_head).contiguous() def get_kv(self, mixed_qqkv: torch.Tensor, cos_emb: torch.Tensor, sin_emb: torch.Tensor): # Get KV together for better performance when encoutering cpu-bound, mainly used by cuda graph key = self.get_k(mixed_qqkv, cos_emb, sin_emb) value = self.get_v(mixed_qqkv) # [(sq b), hn, hd] -> [(sq b), hn, 2 * hd] return torch.cat([key, value], dim=-1) def get_qkv(self, mixed_qqkv: torch.Tensor, cos_emb: torch.Tensor, sin_emb: torch.Tensor): # Get QKV together for better performance when encoutering cpu-bound, mainly used by cuda graph q = self.get_q(mixed_qqkv, cos_emb, sin_emb) k = self.get_k(mixed_qqkv, cos_emb, sin_emb) v = self.get_v(mixed_qqkv) return q, k, v def get_xqkv(self, mixed_qqkv: torch.Tensor, key_value_states: torch.Tensor): query_xattn = self.linear_qkv.forward_qx(mixed_qqkv) query_xattn = rearrange(query_xattn, "sq b (hn hd) -> (b sq) hn hd", hd=self.hidden_size_per_attention_head) query_xattn = self.q_layernorm_xattn(query_xattn) # [y_total_token, h] --> [y_total_token, 2*hp] mixed_kv_xattn = torch.concat( [torch.matmul(key_value_states, w.t()) for w in torch.chunk(self.linear_kv_xattn.weight, 8, axis=0)], axis=1 ) # [y_total_token, 2*hn*hd] --> [y_total_token, hn, 2*hd] mixed_kv_xattn = mixed_kv_xattn.view(key_value_states.shape[0], -1, 2 * self.hidden_size_per_attention_head) # [y_total_token, hn, 2*hd] --> 2 [y_total_token, hn, hd] (key_xattn, value_xattn) = split_tensor_along_last_dim(mixed_kv_xattn, 2) key_xattn = self.k_layernorm_xattn(key_xattn) return query_xattn, key_xattn, value_xattn def core_attention(self, query: torch.Tensor, key: torch.Tensor, value: 
torch.Tensor, bs: int, meta_args: ModelMetaArgs): # (sq b) hn hd -> b sq hn hd query = query.reshape(-1, bs, query.shape[1], query.shape[2]).transpose(0, 1).contiguous() # (sq b) hn hd -> b sq hn hd key = key.reshape(-1, bs, key.shape[1], key.shape[2]).transpose(0, 1).contiguous() # (sq b) hn hd -> b sq hn hd value = value.reshape(-1, bs, value.shape[1], value.shape[2]).transpose(0, 1).contiguous() if torch.cuda.get_device_capability()[0] >= 9: core_attn_out, _ = flex_attention( query.flatten(0, 1), key.flatten(0, 1), value.flatten(0, 1), meta_args.core_attn_params.q_range, meta_args.core_attn_params.k_range, max_seqlen_q=meta_args.core_attn_params.max_seqlen_q, max_seqlen_k=meta_args.core_attn_params.max_seqlen_k, softmax_scale=None, deterministic=torch.are_deterministic_algorithms_enabled(), disable_fwd_atomic_reduction=True, ) # (b sq) hn hd -> (sq b) hn hd core_attn_out = rearrange(core_attn_out, "(b sq) h d -> (sq b) h d", b=bs) else: # NOTE(lml): We convert multi denoising_range_num input into multi batch_size input at third time forward under 3_cfg mode, thus could not support normal multi batch_size input. We use an assert statement to ensure that it is still in this situation, thereby guaranteeing the correct use of q_range and k_range later on. assert not (bs > 1 and meta_args.denoising_range_num > 1) q_range = meta_args.core_attn_params.np_q_range k_range = meta_args.core_attn_params.np_k_range core_attn_outs = [] for i in range(meta_args.denoising_range_num): if bs == 1: q = query[:, q_range[i, 0] : q_range[i, 1]] k = key[:, k_range[i, 0] : k_range[i, 1]] v = value[:, k_range[i, 0] : k_range[i, 1]] else: assert i == 0 q = query[:, q_range[0, 0] : q_range[0, 1]] k = key[:, k_range[0, 0] : k_range[0, 1]] v = value[:, k_range[0, 0] : k_range[0, 1]] o = flash_attn_func(q=q, k=k, v=v, deterministic=torch.are_deterministic_algorithms_enabled()) o = rearrange(o, "b sq h d -> (sq b) h d", b=bs) core_attn_outs.append(o) core_attn_out = torch.cat(core_attn_outs, dim=0) return core_attn_out def full_attention(self, bs: int, meta_args: ModelMetaArgs, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, i: int): # NOTE(lml): full_attention is used under cp_shuffle_overlap strategy. We further limit it to the case of bs=1, so that we do not need to pay attention to the arrangement of sq and bs dimensions. 
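        # The dispatch below mirrors core_attention: on SM90+ (Hopper) the i-th range is handled by a
        # single flex_attention call with explicit q_ranges / k_ranges; on older GPUs k / v are sliced
        # to that range and handed to flash_attn_func.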
assert bs == 1 if torch.cuda.get_device_capability()[0] >= 9: q_range = meta_args.core_attn_params.q_range[i : i + 1] - meta_args.core_attn_params.q_range[i, 0] k_range = meta_args.core_attn_params.k_range[i : i + 1] o, _ = flex_attention( q, k, v, q_ranges=q_range, k_ranges=k_range, max_seqlen_q=meta_args.core_attn_params.max_seqlen_q, max_seqlen_k=meta_args.core_attn_params.max_seqlen_k, softmax_scale=None, deterministic=torch.are_deterministic_algorithms_enabled(), disable_fwd_atomic_reduction=True, ) else: k_range = meta_args.core_attn_params.np_k_range[i : i + 1] k = k[k_range[0, 0] : k_range[0, 1]] v = v[k_range[0, 0] : k_range[0, 1]] o = flash_attn_func( q=q.unsqueeze(0), k=k.unsqueeze(0), v=v.unsqueeze(0), deterministic=torch.are_deterministic_algorithms_enabled(), ).flatten(0, 1) return o def cross_attention( self, mixed_qqkv: torch.Tensor, key_value_states: torch.Tensor, cross_attn_params: PackedCrossAttnParams, get_xqkv_func: Callable, ): # ================= # cross-attn for aggragating caption / condition # ================= query_xattn, key_xattn, value_xattn = get_xqkv_func(mixed_qqkv, key_value_states) if torch.cuda.get_device_capability()[0] >= 9: xattn_out, _ = flex_attention( query_xattn, key_xattn, value_xattn, cross_attn_params.q_ranges, cross_attn_params.kv_ranges, max_seqlen_q=cross_attn_params.max_seqlen_q, max_seqlen_k=cross_attn_params.max_seqlen_kv, softmax_scale=None, deterministic=False, disable_fwd_atomic_reduction=True, ) else: xattn_out = flash_attn_varlen_func( query_xattn, # [b*sq, hn, hd] key_xattn, # [y_total_token, hn, hd] value_xattn, # [y_total_token, hn, hd] cu_seqlens_q=cross_attn_params.cu_seqlens_q, cu_seqlens_k=cross_attn_params.cu_seqlens_kv, max_seqlen_q=cross_attn_params.max_seqlen_q, max_seqlen_k=cross_attn_params.max_seqlen_kv, deterministic=torch.are_deterministic_algorithms_enabled(), ) batch_size = mixed_qqkv.shape[1] xattn_out = rearrange(xattn_out, "(b sq) hn hd -> sq b (hn hd)", b=batch_size).contiguous() return xattn_out def forward( self, hidden_states: torch.Tensor, key_value_states: torch.Tensor, inference_params: InferenceParams, rotary_pos_emb: torch.Tensor, meta_args: ModelMetaArgs, ): assert rotary_pos_emb is not None, "FullyParallelAttention needs rotary_pos_emb" sin_emb, cos_emb = rotary_pos_emb.tensor_split(2, -1) batch_size = hidden_states.shape[1] # All comminications operate on dimensions shaped as (cp * sq * b) batch_cp_split_sizes = None if meta_args.cp_split_sizes is None else [x * batch_size for x in meta_args.cp_split_sizes] # Attention heads [sq, b, h] --> [sq, b, q + qx + k + v] mixed_qqkv = self.linear_qkv.forward_ln(hidden_states) # ===================== # Function wrapper # ===================== get_kv_func = self.get_kv get_q_func = self.get_q get_qkv_func = self.get_qkv get_xqkv_func = self.get_xqkv # ===================== # Parallel Strategy # ===================== if self.engine_config.cp_strategy == "none": assert self.engine_config.cp_size == 1 key_and_value = get_kv_func(mixed_qqkv, cos_emb, sin_emb) query = get_q_func(mixed_qqkv, cos_emb, sin_emb) key, value = self.adjust_key_and_value_for_inference(key_and_value, inference_params, meta_args) core_attn_out = self.core_attention(query, key, value, batch_size, meta_args) core_attn_out = rearrange(core_attn_out, "(sq b) hn hd -> sq b (hn hd)", b=batch_size) xattn_out = self.cross_attention(mixed_qqkv, key_value_states, meta_args.cross_attn_params, get_xqkv_func) elif self.engine_config.cp_strategy == "cp_ulysses": get_kv_func = partial(get_kv_func, 
mixed_qqkv, cos_emb, sin_emb) get_q_func = partial(get_q_func, mixed_qqkv, cos_emb, sin_emb) get_qkv_func = partial(get_qkv_func, mixed_qqkv, cos_emb, sin_emb) kv_cache_func = partial( self.adjust_key_and_value_for_inference, inference_params=inference_params, meta_args=meta_args ) if meta_args.enable_cuda_graph and meta_args.denoising_range_num <= 3: # Temporal solution for first chunk opt core_attn_out, xattn_out = UlyssesScheduler.get_attn_and_xattn_with_fused_qkv_comm( get_qkv_func, kv_cache_func, partial(self.core_attention, bs=batch_size, meta_args=meta_args), partial(self.cross_attention, mixed_qqkv, key_value_states, meta_args.cross_attn_params, get_xqkv_func), self.engine_config.ulysses_overlap_degree, batch_size, self.engine_config.cp_size, batch_cp_split_sizes, ) else: core_attn_out, xattn_out = UlyssesScheduler.get_attn_and_xattn_with_fused_kv_comm( get_q_func, get_kv_func, kv_cache_func, partial(self.core_attention, bs=batch_size, meta_args=meta_args), partial(self.cross_attention, mixed_qqkv, key_value_states, meta_args.cross_attn_params, get_xqkv_func), self.engine_config.ulysses_overlap_degree, batch_size, self.engine_config.cp_size, batch_cp_split_sizes, ) elif self.engine_config.cp_strategy == "cp_shuffle_overlap": key_and_value = self.get_kv(mixed_qqkv, cos_emb, sin_emb) key_and_value, handle_kv = cso_communication(key_and_value, self.engine_config.cp_size, batch_cp_split_sizes, "kv") query = get_q_func(mixed_qqkv, cos_emb, sin_emb) cso_helper = CSOHelper(meta_args.denoising_range_num, self.engine_config.cp_size, batch_cp_split_sizes) query, handle_q = cso_helper.split_query_for_overlap(query) handle_kv.wait() # NOTE(lml): rearrange and unpad key_and_value for later attention compute under cp_shuffle_overlap strategy, and we should split sqb into sq and b when support multi batch_size input. key_and_value = ( rearrange( key_and_value, "(cp dn sqb) hn nhd -> dn (cp sqb) hn nhd", dn=meta_args.denoising_range_num, cp=self.engine_config.cp_size, )[:, : meta_args.clip_token_nums] .flatten(0, 1) .contiguous() ) key, value = self.adjust_key_and_value_for_inference(key_and_value, inference_params, meta_args) handle_q.wait() core_attn_out, handle_attn = cso_helper.overlap( partial(self.full_attention, hidden_states.shape[1], meta_args), query, key, value ) xattn_out = self.cross_attention(mixed_qqkv, key_value_states, meta_args.cross_attn_params, get_xqkv_func) handle_attn.wait() core_attn_out = rearrange( torch.concat(core_attn_out, dim=0), "(dn cp sq b) hn hd -> (dn sq) b (cp hn hd)", cp=self.engine_config.cp_size, b=hidden_states.shape[1], dn=meta_args.denoising_range_num, ) else: raise ValueError(f"Unsupported cp_strategy: {self.engine_config.cp_strategy}") return core_attn_out, xattn_out ########################################################## # TransformerLayer ########################################################## class TransformerLayer(torch.nn.Module): """A single transformer layer. Transformer layer takes input with size [s, b, h] and returns an output of the same size. 
""" def __init__(self, model_config: ModelConfig, engine_config: EngineConfig, layer_number: int = 1): super().__init__() self.model_config = model_config self.engine_config = engine_config self.layer_number = layer_number + self._get_layer_offset() ## [Module 1: ada_modulate_layer self.ada_modulate_layer = AdaModulateLayer(model_config=self.model_config) ## [Module 2: SelfAttention] self.self_attention = FullyParallelAttention( model_config=self.model_config, engine_config=self.engine_config, layer_number=self.layer_number ) ## [Module 3: SelfAttention PostNorm] self.self_attn_post_norm = FusedLayerNorm(model_config=self.model_config, hidden_size=self.model_config.hidden_size) ## [Module 4: MLP block] self.mlp = CustomMLP(model_config=self.model_config, engine_config=self.engine_config, layer_number=self.layer_number) ## [Module 5: MLP PostNorm] self.mlp_post_norm = FusedLayerNorm(model_config=self.model_config, hidden_size=self.model_config.hidden_size) def _get_layer_offset(self): pipeline_rank = parallel_state.get_pp_rank() num_layers_per_pipeline_rank = self.model_config.num_layers // parallel_state.get_pp_world_size() # Each stage gets a contiguous set of layers. if parallel_state.get_pp_world_size() > 1: offset = pipeline_rank * num_layers_per_pipeline_rank else: offset = 0 return offset def forward( self, hidden_states: torch.Tensor, condition: torch.Tensor, condition_map: torch.Tensor, y_xattn_flat: torch.Tensor, rotary_pos_emb: torch.Tensor, inference_params: InferenceParams, meta_args: ModelMetaArgs, ): # hidden_states: [s/cp/sp, b, h] residual = hidden_states # Self attention. core_attn_out, cross_attn_out = self.self_attention( hidden_states, key_value_states=y_xattn_flat, inference_params=inference_params, rotary_pos_emb=rotary_pos_emb, meta_args=meta_args, ) hidden_states = self.attn_post_process(core_attn_out, cross_attn_out, residual, condition, condition_map) return hidden_states def attn_post_process( self, core_attn_out: torch.Tensor, cross_attn_out: torch.Tensor, residual: torch.Tensor, condition: torch.Tensor, condition_map: torch.Tensor, ): hidden_states = self.attn_linear_proj(core_attn_out, cross_attn_out) hidden_states = self.gating_and_mlp(hidden_states, residual, condition, condition_map) return hidden_states def attn_linear_proj(self, core_attn_out: torch.Tensor, cross_attn_out: torch.Tensor): # ============================================ # attention post-process , output. [sq, b, h] # ============================================ attn_out = torch.concat([core_attn_out, cross_attn_out], dim=2) # NOTE: hn=8 is hardcoded to align with TP8 traning and TP1 inference attn_out = rearrange(attn_out, "sq b (n hn hd) -> sq b (hn n hd)", n=2, hn=8) if self.self_attention.adapt_linear_quant: attn_out = self.self_attention.linear_proj(attn_out) else: # Use high-precision for non-quantized linear projection with torch.autocast(device_type="cuda", dtype=torch.float32): attn_out = self.self_attention.linear_proj(attn_out) return attn_out def gating_and_mlp( self, hidden_states: torch.Tensor, residual: torch.Tensor, condition: torch.Tensor, condition_map: torch.Tensor ): gate_output = self.ada_modulate_layer(condition) softcap_gate_cap = 1.0 gate_output = softcap(gate_output, softcap_gate_cap) gate_msa, gate_mlp = gate_output.chunk(2, dim=-1) # Residual connection for self-attention. 
        hidden_states = bias_modulate_add(hidden_states, residual, condition_map, gate_msa, self.self_attn_post_norm).to(
            self.model_config.params_dtype
        )

        residual = hidden_states
        hidden_states = self.mlp(hidden_states)

        # Residual connection for MLP.
        hidden_states = bias_modulate_add(hidden_states, residual, condition_map, gate_mlp, self.mlp_post_norm).to(
            self.model_config.params_dtype
        )

        return hidden_states


##########################################################
# TransformerBlock
##########################################################


class TransformerBlock(torch.nn.Module):
    """Transformer class."""

    def __init__(
        self, model_config: ModelConfig, engine_config: EngineConfig, pre_process: bool = True, post_process: bool = True
    ):
        super().__init__()
        self.model_config = model_config
        self.engine_config = engine_config
        self.pre_process = pre_process
        self.post_process = post_process

        # required for pipeline parallel schedules
        self.input_tensor = None

        layer_number = self.model_config.num_layers // parallel_state.get_pp_world_size()
        # offset is implicit in TransformerLayer
        self.layers = torch.nn.ModuleList(
            [
                TransformerLayer(model_config=self.model_config, engine_config=self.engine_config, layer_number=i)
                for i in range(layer_number)
            ]
        )

        if self.post_process:
            # Final layer norm before output.
            self.final_layernorm = FusedLayerNorm(model_config=self.model_config, hidden_size=self.model_config.hidden_size)

    def set_input_tensor(self, input_tensor: Tensor):
        """Set input tensor to be used instead of forward()'s input.

        When doing pipeline parallelism the input from the previous stage
        comes from communication, not from the input, so the model's
        forward_step_func won't have it. This function is thus used by
        internal code to bypass the input provided by the forward_step_func"""
        self.input_tensor = input_tensor

    @torch.no_grad()
    def forward(
        self,
        hidden_states: Tensor,
        condition: Tensor,
        condition_map: Tensor,
        y_xattn_flat: Tensor,
        rotary_pos_emb: Tensor,
        inference_params: InferenceParams,
        meta_args: ModelMetaArgs,
    ) -> torch.Tensor:
        if not self.pre_process:
            assert self.input_tensor is not None, "please call set_input_tensor for pp"
            hidden_states = self.input_tensor

        for layer in self.layers:
            hidden_states = layer(
                hidden_states=hidden_states,
                condition=condition,
                condition_map=condition_map,
                y_xattn_flat=y_xattn_flat,
                rotary_pos_emb=rotary_pos_emb,
                inference_params=inference_params,
                meta_args=meta_args,
            )

        # Final layer norm.
        if self.post_process:
            hidden_states = self.final_layernorm(hidden_states.float())

        return hidden_states
```
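A few stand-alone sketches follow; they are written against stock PyTorch for illustration and are not files from this repository. First, `FusedLayerNorm` above supports `apply_layernorm_1p` (zero-centered gamma): the stored weight holds `gamma - 1` and `weight + 1` scales the normalized activations, so a zero-initialized weight is an identity scale:

```python
import torch
import torch.nn.functional as F

hidden = 32
x = torch.randn(4, hidden)
eps = 1e-6

# Zero-centered convention: the parameter stores gamma - 1, so a weight of all zeros
# means an effective scale of exactly 1.
stored_weight = torch.zeros(hidden)
bias = torch.zeros(hidden)
out_zero_centered = F.layer_norm(x, (hidden,), stored_weight + 1, bias, eps)

# Ordinary LayerNorm with weight = 1 produces the same output.
out_plain = F.layer_norm(x, (hidden,), torch.ones(hidden), bias, eps)
print(torch.allclose(out_zero_centered, out_plain))  # True
```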
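`softcap` and `div_clamp_to` implement the numeric core of the FP8 path: a smooth tanh cap and a divide-then-clamp into the `float8_e4m3fn` range. A minimal equivalent (without the chunked memory handling), with a made-up calibration scale:

```python
import torch

def softcap(x: torch.Tensor, cap: float) -> torch.Tensor:
    # Smoothly squash x into (-cap, cap), matching softcap() above.
    return (cap * torch.tanh(x.float() / cap)).to(x.dtype)

def quantize_per_tensor_fp8(x: torch.Tensor, scale: torch.Tensor) -> torch.Tensor:
    # Divide by a per-tensor scale and clamp into the float8_e4m3fn range,
    # the same idea as div_clamp_to() minus its chunked processing.
    finfo = torch.finfo(torch.float8_e4m3fn)
    return torch.clamp(x.float() / scale.float(), finfo.min, finfo.max).to(torch.float8_e4m3fn)

x = torch.randn(4, 8, dtype=torch.bfloat16) * 10
scale = torch.tensor(0.1)                       # made-up calibration scale
x_fp8 = quantize_per_tensor_fp8(x, scale)
x_back = x_fp8.to(torch.float32) * scale        # dequantize to inspect the error
print(softcap(torch.tensor([3.0, -5.0]), 1.0))  # values squashed into (-1, 1)
print((x.float() - x_back).abs().max())         # worst-case quantization error
```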
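The rotary-embedding helpers build per-axis frequency bands over a (T, H, W) latent grid, center the two spatial axes, and flatten the result into per-token sin/cos tables. A miniature of that recipe, assuming language-mode (inverse-frequency) bands and arbitrary sizes:

```python
import torch

def mini_rope_tables(feat_shape, dim_per_axis=16, temperature=10000.0):
    # One band set shared by all axes, like freq_bands(step=1) above.
    exp = torch.arange(dim_per_axis, dtype=torch.float32) / dim_per_axis
    bands = 1.0 / temperature ** exp                                # [dim_per_axis]
    t = [torch.arange(s, dtype=torch.float32) for s in feat_shape]
    # Center the spatial axes (H, W) around zero, as build_fourier_pos_embed does.
    t[1] = t[1] - (feat_shape[1] - 1) / 2
    t[2] = t[2] - (feat_shape[2] - 1) / 2
    grid = torch.stack(torch.meshgrid(*t, indexing="ij"), dim=-1)   # [T, H, W, 3]
    pos = grid.unsqueeze(-1) * bands                                # [T, H, W, 3, dim_per_axis]
    n_tokens = feat_shape[0] * feat_shape[1] * feat_shape[2]
    return pos.sin().reshape(n_tokens, -1), pos.cos().reshape(n_tokens, -1)

sin, cos = mini_rope_tables((4, 6, 6))  # e.g. 4 latent frames of 6x6 tokens
print(sin.shape, cos.shape)             # torch.Size([144, 48]) twice
```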
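`_full_adjust_key_and_value` addresses its pre-allocated KV buffer in units of `clip_token_nums * batch_size` tokens per chunk: `slice_point` chunks are already cached, and the current chunk's keys/values are written right after them when `update_kv_cache` is set. A toy walk-through of the index arithmetic with made-up sizes:

```python
# Hypothetical sizes, chosen only to make the arithmetic concrete.
clip_token_nums = 1560      # tokens per video chunk (made up)
batch_size = 1
slice_point = 2             # two chunks are already clean / cached

sequence_start = slice_point * clip_token_nums * batch_size  # 3120: end of the cached prefix
new_tokens = clip_token_nums * batch_size                    # KV produced for the current chunk
sequence_end = sequence_start + new_tokens                   # 4680: slots the new KV occupies

print(sequence_start, sequence_end)
# Attention for the current chunk then runs against
# cache[:sequence_start] concatenated with the freshly computed KV.
```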