```
├── .github/
├── workflows/
├── publish.yaml (200 tokens)
├── .gitignore (900 tokens)
├── .pre-commit-config.yaml (100 tokens)
├── CMakeLists.txt (400 tokens)
├── LICENSE (omitted)
├── README.md (2.6k tokens)
├── TRAINING.md (700 tokens)
├── __init__.py
├── examples/
├── README.md (500 tokens)
├── __init__.py
├── basic_example.py (500 tokens)
├── basic_streaming_example.py (1200 tokens)
├── encode_reference.py (300 tokens)
├── finetune.py (900 tokens)
├── finetune_config.yaml (100 tokens)
├── interactive_example.ipynb (500 tokens)
├── onnx_example.py (400 tokens)
├── neutts/
├── __init__.py
├── neutts.py (3.6k tokens)
├── phonemizers.py (1100 tokens)
├── neuttsair/
├── __init__.py
├── neutts.py (100 tokens)
├── output.wav
├── pyproject.toml (200 tokens)
├── requirements-dev.txt
├── samples/
├── dave.pt
├── dave.txt
├── dave.wav
├── greta.pt
├── greta.txt
├── greta.wav
├── jo.pt
├── jo.txt (100 tokens)
├── jo.wav
├── juliette.pt
├── juliette.txt
├── juliette.wav
├── mateo.pt
├── mateo.txt
├── mateo.wav
├── tests/
├── __init__.py
├── test_neutts.py (700 tokens)
```
## /.github/workflows/publish.yaml
```yaml path="/.github/workflows/publish.yaml"
# Manually-triggered workflow that builds the package with Poetry and uploads it to PyPI.
name: Publish to PyPi
on:
  workflow_dispatch: # Allows manual triggering from the GitHub UI
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Install Poetry
        run: |
          curl -sSL https://install.python-poetry.org | python3 -
          # `export PATH=...` would only affect this step's shell; appending to
          # GITHUB_PATH makes the poetry binary visible to the following steps.
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
      - name: Install dependencies
        run: |
          poetry install
      - name: Set up PyPi repo and token
        # Pass the secret through the environment instead of interpolating it
        # into the script text, so it is never subject to shell word-splitting.
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
        run: |
          poetry config repositories.pypi https://upload.pypi.org/legacy/
          poetry config http-basic.pypi __token__ "$PYPI_TOKEN"
      - name: Publish package
        run: |
          poetry publish --build --repository pypi
```
## /.gitignore
```gitignore path="/.gitignore"
# MacOS
**.DS_Store
# Emacs
*~
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[codz]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py.cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
#poetry.toml
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
#pdm.lock
#pdm.toml
.pdm-python
.pdm-build/
# pixi
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
#pixi.lock
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
# in the .venv directory. It is recommended not to include this directory in version control.
.pixi
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.envrc
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/
# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/
# Ruff stuff:
.ruff_cache/
# PyPI configuration file
.pypirc
# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore
# Marimo
marimo/_static/
marimo/_lsp/
__marimo__/
```
## /.pre-commit-config.yaml
```yaml path="/.pre-commit-config.yaml"
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0 # Use the latest tag from the repo
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- id: requirements-txt-fixer
- repo: https://github.com/psf/black
rev: 24.4.2 # Use the appropriate version of black
hooks:
- id: black
language_version: python3
args: [--line-length=100]
- repo: https://github.com/PyCQA/flake8
rev: 7.0.0
hooks:
- id: flake8
args: ["--ignore=E203,W503", --max-line-length=100]
```
## /CMakeLists.txt
# Builds espeak-ng for neutts using cmake.
#
# This is called automatically by scikit-build-core from pyproject.toml.
cmake_minimum_required(VERSION 3.26)
project(neutts LANGUAGES C CXX)
include(ExternalProject)
# Install location for espeak-ng
set(ESPEAKNG_BUILD_DIR ${CMAKE_BINARY_DIR}/espeak_ng)
set(ESPEAKNG_INSTALL_DIR ${CMAKE_BINARY_DIR}/espeak_ng-install)
ExternalProject_Add(espeak_ng_external
GIT_REPOSITORY https://github.com/espeak-ng/espeak-ng.git
GIT_TAG 69bfd1efde0236654ccc117a28d04aef27673a30
PREFIX ${ESPEAKNG_BUILD_DIR}
CMAKE_ARGS
-DCMAKE_INSTALL_PREFIX=${ESPEAKNG_INSTALL_DIR}
-DBUILD_SHARED_LIBS:BOOL=ON
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DUSE_ASYNC:BOOL=OFF
-DUSE_MBROLA:BOOL=OFF
-DUSE_LIBSONIC:BOOL=OFF
-DUSE_LIBPCAUDIO:BOOL=OFF
-DUSE_KLATT:BOOL=OFF
-DUSE_SPEECHPLAYER:BOOL=OFF
-DEXTRA_cmn:BOOL=ON
-DEXTRA_ru:BOOL=ON
# Need to explicitly add ucd include directory for CI
"-DCMAKE_C_FLAGS=-D_FILE_OFFSET_BITS=64 -I${ESPEAKNG_BUILD_DIR}/src/espeak_ng_external/src/ucd-tools/src/include"
"-DCMAKE_CXX_FLAGS=-D_FILE_OFFSET_BITS=64 -I${ESPEAKNG_BUILD_DIR}/src/espeak_ng_external/src/ucd-tools/src/include"
UPDATE_DISCONNECTED TRUE
)
# Bundle the shared library into the neutts package so phonemizer can load it.
# On Unix: the .so/.dylib lands in lib/. On Windows: the .dll lands in bin/.
if(WIN32)
install(DIRECTORY ${ESPEAKNG_INSTALL_DIR}/bin/
DESTINATION neutts
FILES_MATCHING
PATTERN "espeak-ng*.dll"
)
else()
install(DIRECTORY ${ESPEAKNG_INSTALL_DIR}/lib/
DESTINATION neutts
FILES_MATCHING
PATTERN "libespeak-ng*"
)
endif()
# Bundle espeak-ng-data so phonemizer can find the voice/language data
install(DIRECTORY ${ESPEAKNG_INSTALL_DIR}/share/espeak-ng-data/
DESTINATION neutts/espeak-ng-data
)
## /README.md
# NeuTTS
HuggingFace 🤗:
- NeuTTS-Air (English): [Model](https://huggingface.co/neuphonic/neutts-air), [Q8 GGUF](https://huggingface.co/neuphonic/neutts-air-q8-gguf), [Q4 GGUF](https://huggingface.co/neuphonic/neutts-air-q4-gguf), [Space](https://huggingface.co/spaces/neuphonic/neutts-air)
- NeuTTS-Nano Multilingual Collection:
- NeuTTS-Nano (English): [Model](https://huggingface.co/neuphonic/neutts-nano), [Q8 GGUF](https://huggingface.co/neuphonic/neutts-nano-q8-gguf), [Q4 GGUF](https://huggingface.co/neuphonic/neutts-nano-q4-gguf)
- NeuTTS-Nano-French: [Model](https://huggingface.co/neuphonic/neutts-nano-french), [Q8 GGUF](https://huggingface.co/neuphonic/neutts-nano-french-q8-gguf), [Q4 GGUF](https://huggingface.co/neuphonic/neutts-nano-french-q4-gguf)
- NeuTTS-Nano-German: [Model](https://huggingface.co/neuphonic/neutts-nano-german), [Q8 GGUF](https://huggingface.co/neuphonic/neutts-nano-german-q8-gguf), [Q4 GGUF](https://huggingface.co/neuphonic/neutts-nano-german-q4-gguf)
- NeuTTS-Nano-Spanish: [Model](https://huggingface.co/neuphonic/neutts-nano-spanish), [Q8 GGUF](https://huggingface.co/neuphonic/neutts-nano-spanish-q8-gguf), [Q4 GGUF](https://huggingface.co/neuphonic/neutts-nano-spanish-q4-gguf)
- [Multilingual Space](https://huggingface.co/spaces/neuphonic/neutts-nano-multilingual-collection)
[NeuTTS-Nano Demo Video](https://github.com/user-attachments/assets/629ec5b2-4818-4fa6-987a-99fcbadc56bc)
_Created by [Neuphonic](http://neuphonic.com/) - building faster, smaller, on-device voice AI_
State-of-the-art Voice AI has been locked behind web APIs for too long. NeuTTS is a collection of open source, on-device, TTS speech language models with instant voice cloning. Built off of LLM backbones, NeuTTS brings natural-sounding speech, real-time performance, built-in security and speaker cloning to your local device - unlocking a new category of embedded voice agents, assistants, toys, and compliance-safe apps.
## Key Features
- 🗣Best-in-class realism for their size - produce natural, ultra-realistic voices that sound human, at the sweet spot between speed, size, and quality for real-world applications
- 📱Optimised for on-device deployment - quantisations provided in GGUF format, ready to run on phones, laptops, or even Raspberry Pis
- 👫Instant voice cloning - create your own speaker with as little as 3 seconds of audio
- 🚄Simple LM + codec architecture - making development and deployment simple
> [!CAUTION]
> Websites like neutts.com are popping up and they're not affiliated with Neuphonic, our GitHub or this repo.
>
> We are on neuphonic.com only. Please be careful out there! 🙏
## Model Details
NeuTTS models are built from small LLM backbones - lightweight yet capable language models optimised for text understanding and generation - as well as a powerful combination of technologies designed for efficiency and quality:
- **Supported Languages**: English, Spanish, German, French (model-dependent)
- **Audio Codec**: [NeuCodec](https://huggingface.co/neuphonic/neucodec) - our 50hz neural audio codec that achieves exceptional audio quality at low bitrates using a single codebook
- **Context Window**: 2048 tokens, enough for processing ~30 seconds of audio (including prompt duration)
- **Format**: Quantisations available in GGUF format for efficient on-device inference
- **Responsibility**: Watermarked outputs
- **Inference Speed**: Real-time generation on mid-range devices
- **Power Consumption**: Optimised for mobile and embedded devices
| | NeuTTS-Air | NeuTTS-Nano Models |
|---|---:|---:|
| **# Params (Active)** | ~360m | ~120m |
| **# Params (Emb + Active)** | ~552m | ~229m |
| **Cloning** | Yes | Yes |
| **License** | Apache 2.0 | NeuTTS Open License 1.0 |
## Throughput Benchmarking
These benchmarks are for the Q4_0 quantisations [neutts-air-Q4_0](https://huggingface.co/neuphonic/neutts-air-q4-gguf) and [neutts-nano-Q4_0](https://huggingface.co/neuphonic/neutts-nano-q4-gguf). Note that all models in the NeuTTS-Nano Multilingual Collection have an identical architecture, so these results should apply for any Q4_0 model in the collection.
CPU benchmarking used [llama-bench](https://github.com/ggml-org/llama.cpp/tree/master/tools/llama-bench) (from llama.cpp) to measure prefill and decode throughput at multiple context sizes. For the GPU benchmark (RTX 4090), we leverage vLLM to maximise throughput, using the [vLLM benchmark](https://docs.vllm.ai/en/stable/cli/bench/throughput/).
We include benchmarks on four devices: Galaxy A25 5G, AMD Ryzen 9 HX 370, iMac M4 16GB, NVIDIA GeForce RTX 4090.
| | NeuTTS-Air | NeuTTS-Nano |
|---|---:|---:|
| **Galaxy A25 5G (CPU only)** | 20 tokens/s | 45 tokens/s|
| **AMD Ryzen 9 HX 370 (CPU only)** | 119 tokens/s | 221 tokens/s |
| **iMac M4 16 GB (CPU only)** | 111 tokens/s | 195 tokens/s |
| **RTX 4090** | 16194 tokens/s | 19268 tokens/s |
> [!NOTE]
> llama-bench used 14 threads for prefill and 16 threads for decode (as configured in the benchmark run) on the AMD Ryzen 9 HX 370 and iMac M4 16GB, and 6 threads for each on the Galaxy A25 5G. The reported tokens/s figures were measured with 500 prefill tokens and 250 generated output tokens.
> [!NOTE]
> Please note that these benchmarks only include the Speech Language Model and do not include the Codec which is needed for a full audio generation pipeline.
## Get Started with NeuTTS
> [!NOTE]
> We have added a [streaming example](examples/basic_streaming_example.py) using the `llama-cpp-python` library as well as a [finetuning script](examples/finetune.py). For finetuning, please refer to the [finetune guide](TRAINING.md) for more details.
1. **Install NeuTTS**
```bash
pip install neutts
```
Or for a local editable install, clone this repository and run in the base folder:
```bash
pip install -e .
```
Alternatively, to install all dependencies, including `onnxruntime` and `llama-cpp-python` (equivalent to steps 2 and 3 below):
```bash
pip install neutts[all]
```
or for an editable install:
```bash
pip install -e .[all]
```
2. **(Optional) Install `llama-cpp-python` to use `.gguf` models.**
To use any of the GGUF backbones (e.g., in basic_streaming_example.py) you need to install the llama-cpp-python package.
For the best performance, you must compile this package from source with hardware acceleration enabled for your specific operating system and target device (CPU or GPU).
#### macOS (Apple Silicon)
For M-series Macs, it is highly recommended to use Apple's native Accelerate framework for optimized CPU performance:
```bash
CMAKE_ARGS="-DGGML_METAL=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=Apple" pip install "neutts[llama]" --force-reinstall --no-cache-dir
```
#### Linux (OpenBLAS)
For Linux, you can accelerate CPU performance using OpenBLAS.
*Prerequisite: Ensure you have OpenBLAS installed on your system (e.g., `sudo apt-get install libopenblas-dev` on Ubuntu). For other distros, refer to the [OpenBLAS Installation Guide](https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md).*
```bash
CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS" pip install "neutts[llama]" --force-reinstall --no-cache-dir
```
#### Windows (OpenBLAS)
*Prerequisite: Ensure you have OpenBLAS installed on your system. Please refer to the [OpenBLAS Installation Guide](https://github.com/OpenMathLib/OpenBLAS/blob/develop/docs/install.md).*
For Windows users utilizing PowerShell, set the environment variable and run the install command like this:
```pwsh
$env:CMAKE_ARGS="-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS"; pip install "neutts[llama]" --force-reinstall --no-cache-dir
```
#### Looking for GPU Support?
If you have a dedicated GPU (Nvidia/CUDA, AMD/ROCm, M-Series Mac/Metal) and want to utilize it instead of the CPU, the CMAKE flags will be different. Please refer to the official [llama-cpp-python documentation](https://github.com/abetlen/llama-cpp-python/blob/main/README.md) for the exact flags required for your specific hardware.
3. **(Optional) Install `onnxruntime` to use the `.onnx` decoder.**
```bash
pip install "neutts[onnx]"
```
## Examples
To get started with the example scripts, clone this repository and navigate into the project directory:
```bash
git clone https://github.com/neuphonic/neutts.git
cd neutts
```
Several examples are available, including a Jupyter notebook in the `examples` folder.
### Basic Example
Run the basic example script to synthesize speech:
```bash
python -m examples.basic_example \
--input_text "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all." \
--ref_audio samples/jo.wav \
--ref_text samples/jo.txt
```
To specify a particular model repo for the backbone or codec, add the `--backbone` argument. Available backbones are listed in [NeuTTS-Air](https://huggingface.co/collections/neuphonic/neutts-air) and [NeuTTS-Nano Multilingual Collection](https://huggingface.co/collections/neuphonic/neutts-nano-multilingual-collection) huggingface collections.
> [!CAUTION]
> If you are using a non-English backbone, it is highly recommended to use a same-language reference for best performance. See the 'example reference files' section below to select an appropriate example reference.
### One-Code Block Usage
```python
from neutts import NeuTTS
import soundfile as sf
tts = NeuTTS(
backbone_repo="neuphonic/neutts-nano", # or 'neuphonic/neutts-nano-q4-gguf' with llama-cpp-python installed
backbone_device="cpu",
codec_repo="neuphonic/neucodec",
codec_device="cpu"
)
input_text = "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all."
ref_text = "samples/jo.txt"
ref_audio_path = "samples/jo.wav"
ref_text = open(ref_text, "r").read().strip()
ref_codes = tts.encode_reference(ref_audio_path)
wav = tts.infer(input_text, ref_codes, ref_text)
sf.write("test.wav", wav, 24000)
```
### Streaming
Speech can also be synthesised in _streaming mode_, where audio is generated in chunks and plays as generated. Note that this requires pyaudio to be installed. To do this, run:
```bash
python -m examples.basic_streaming_example \
--input_text "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all." \
--ref_codes samples/jo.pt \
--ref_text samples/jo.txt
```
Again, a particular model repo can be specified with the `--backbone` argument - note that for streaming the model must be in GGUF format.
## Preparing References for Cloning
NeuTTS requires two inputs:
1. A reference audio sample (`.wav` file)
2. A text string
The model then synthesises the text as speech in the style of the reference audio. This is what enables NeuTTS models' instant voice cloning capability.
### Example Reference Files
You can find some ready-to-use references in the `samples` folder:
- English:
- `dave.wav`
- `jo.wav`
- Spanish:
- `mateo.wav`
- German:
- `greta.wav`
- French:
- `juliette.wav`
### Guidelines for Best Results
For optimal performance, reference audio samples should be:
1. **Mono channel**
2. **16-44 kHz sample rate**
3. **3–15 seconds in length**
4. **Saved as a `.wav` file**
5. **Clean** — minimal to no background noise
6. **Natural, continuous speech** — like a monologue or conversation, with few pauses, so the model can capture tone effectively
## Guidelines for minimizing Latency
For optimal performance on-device:
1. Use the GGUF model backbones
2. Pre-encode references (see `examples/encode_reference.py` or `examples/basic_example.py`)
3. Use the [onnx codec decoder](https://huggingface.co/neuphonic/neucodec-onnx-decoder)
Take a look at this example in the [examples README](examples/README.md#minimal-latency-example) to get started.
## Responsibility
Every audio file generated by NeuTTS includes by default a [Perth (Perceptual Threshold) Watermark](https://github.com/resemble-ai/perth).
Note: If you install neutts using `uv sync` within the repo, the program will still run, but watermarking will be disabled (you will see a warning that perth is missing). This is because `uv sync` currently fails to pull the required Perth dependencies; please see [this issue](https://github.com/resemble-ai/Perth/). To ensure watermarking is active, please install the package via PyPI instead (`pip install neutts`).
## Disclaimer
Don't use this model to do bad things… please.
## Developer Requirements
To run the pre commit hooks to contribute to this project run:
```bash
pip install pre-commit
```
Then:
```bash
pre-commit install
```
## Running Tests
First, install the dev requirements:
```
pip install -r requirements-dev.txt
```
To run the tests:
```
pytest tests/
```
To test loading of all the official backbone and codecs, use:
```
RUN_SLOW=true pytest tests/
```
## /TRAINING.md
# Model finetuning
NeuTTS-Air follows [Llasa](https://github.com/zhenye234/LLaSA_training) in its training and inference setup. In order to finetune a model, you can use the `transformers` library from Hugging Face. We have an [example script](/examples/finetune.py) for finetuning using the [Emilia-YODAS dataset](https://huggingface.co/datasets/neuphonic/emilia-yodas-english-neucodec) that is encoded with [NeuCodec](https://huggingface.co/neuphonic/neucodec).
> [!NOTE]
> We have an on-going discussion about finetuning [here](https://github.com/neuphonic/neutts-air/issues/7) where some users have reported success with finetuning using the example script.
# Finetuning on your own dataset
You can prepare your own dataset by following these steps:
1. Encode your audio files using the [NeuCodec](https://huggingface.co/neuphonic/neucodec) model into a format similar to the [Emilia-YODAS dataset](https://huggingface.co/datasets/neuphonic/emilia-yodas-english-neucodec).
2. Setup your configuration file similar to the [example config](/examples/finetune_config.yaml).
3. Check and modify the phonemizer and the tokenizer in the script such that they suit your dataset/task. See [the phonemizer documentation](https://bootphon.github.io/phonemizer/api_reference.html#phonemizer.backend.espeak.espeak.EspeakBackend) for phonemizer arguments.
4. Run the finetuning script with your dataset and configuration file. To do this, navigate to the base directory of your cloned repo in the terminal and run:
```bash
python examples/finetune.py examples/finetune_config.yaml
```
replacing the argument with the path to your own config file if needed.
# Finetuning config
An example finetuning config lives in `examples/finetune_config.yaml`.
- In the past we've found a learning rate of `1e-5` to `4e-5` to have worked well for finetuning depending on the size of the dataset.
- We generally find that you do not need many steps for finetuning. For example, for a dataset of 10 hours, 1000 to 2000 steps is often sufficient.
- A warmup ratio as well as different learning rate schedulers can be experimented with to see what works best for your dataset.
# Training from scratch or using additional labels
The NeuTTS Air model is based on the [Qwen2.5 0.5B model](https://huggingface.co/Qwen/Qwen2.5-0.5B). To use this instead of the trained NeuTTS Air model, change the `restore_from` parameter in your config file to `"Qwen/Qwen2.5-0.5B"`.
Using Qwen means you would need to add the speech token tags to the model vocabulary. With either Qwen or NeuTTS you can also add additional custom tags. Both of these steps can be done as such in the script after loading the model:
```python
codec_special_tokens = [
# speech token tags to add if using Qwen
"<|TEXT_REPLACE|>",
"<|TEXT_PROMPT_START|>",
"<|TEXT_PROMPT_END|>",
"<|SPEECH_REPLACE|>",
"<|SPEECH_GENERATION_START|>",
"<|SPEECH_GENERATION_END|>",
# optional additional tags that you can add to enable features if you have the labels in your dataset
"<|EN|>",
"<|ZH|>",
"<|LAUGHING|>",
"<|WHISPERING|>",
]
codec_tokens = [f"<|speech_{idx}|>" for idx in range(config.codebook_size)]
new_tokens = codec_special_tokens + codec_tokens
n_added_tokens = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer), mean_resizing=False)
model.vocab_size = len(tokenizer)
```
You can then modify the input to the model to include these additional labels. For example, if you have speaker IDs or emotion labels, you can concatenate them with the phoneme tokens before passing them to the model.
## /__init__.py
```py path="/__init__.py"
from neutts.neutts import NeuTTS, BACKBONE_LANGUAGE_MAP # noqa
```
## /examples/README.md
# Examples
### GGUF Backbones
To run the model with `llama-cpp-python` in GGUF format, select a GGUF backbone when initializing the example script.
```bash
python -m examples.basic_example \
--input_text "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all." \
--ref_audio ./samples/jo.wav \
--ref_text ./samples/jo.txt \
--backbone neuphonic/neutts-nano-q4-gguf
```
### Pre-encode a reference
Reference encoding can be done ahead of time to reduce latency whilst inferencing the model; to pre-encode a reference you only need to provide a reference audio, as in the following script:
```bash
python -m examples.encode_reference \
--ref_audio ./samples/jo.wav \
--output_path ./samples/jo.pt
```
Note that `basic_streaming_example.py` requires a pre-encoded reference. `basic_example.py` will encode your reference if a pre-encoding does not exist, and will save and use it in future runs with the same reference.
### Minimal Latency Example
To take advantage of encoding references ahead of time, we have compiled the codec decoder into an [onnx graph](https://huggingface.co/neuphonic/neucodec-onnx-decoder) that enables inferencing NeuTTS without loading the encoder.
This can be useful for running the model in resource-constrained environments where the encoder may add a large amount of extra latency/memory usage.
To test the decoder, make sure you have installed `onnxruntime` and run the following:
```bash
python -m examples.onnx_example \
--input_text "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all." \
--ref_codes samples/jo.pt \
--ref_text samples/jo.txt \
--backbone neuphonic/neutts-nano-q4-gguf
```
### Streaming Support
To stream the model output in chunks, try out the `basic_streaming_example.py` example. For streaming, only the GGUF backbones are currently supported. Ensure you have `llama-cpp-python`, `onnxruntime` and `pyaudio` installed to run this example.
```bash
python -m examples.basic_streaming_example \
--input_text "My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all." \
--ref_codes samples/jo.pt \
--ref_text samples/jo.txt \
--backbone neuphonic/neutts-nano-q4-gguf
```
## /examples/__init__.py
```py path="/examples/__init__.py"
```
## /examples/basic_example.py
```py path="/examples/basic_example.py"
import os
import soundfile as sf
from neutts import NeuTTS
import torch
def main(input_text, ref_audio_path, ref_text, backbone, output_path="output.wav"):
    """Synthesise ``input_text`` in the voice of a reference clip and write it to a file.

    Args:
        input_text: Text to convert to speech.
        ref_audio_path: Path to the reference audio file. The encoded reference is
            cached next to it, under the same name with a ``.pt`` extension.
        ref_text: Transcript of the reference audio, or a path to a file holding it.
        backbone: Repo name passed to ``NeuTTS`` as ``backbone_repo``.
        output_path: Destination of the generated audio.

    Returns:
        None. If either reference input is missing, a message is printed and nothing
        is generated.
    """
    if not ref_audio_path or not ref_text:
        print("No reference audio or text provided.")
        return None

    # Initialize NeuTTS with the desired model and codec
    tts = NeuTTS(
        backbone_repo=backbone,
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec",
        codec_device="cpu",
    )

    # ref_text is either the transcript itself or a path to a file containing it.
    if os.path.exists(ref_text):
        # Explicit UTF-8 so non-English transcripts decode the same on every platform.
        with open(ref_text, "r", encoding="utf-8") as f:
            ref_text = f.read().strip()

    # Swap only the final extension. A plain str.replace(".wav", ".pt") leaves
    # non-".wav" paths unchanged (so the audio file itself would be treated as the
    # cache) and also rewrites ".wav" occurring inside directory names.
    ref_codes_path = os.path.splitext(ref_audio_path)[0] + ".pt"

    if not os.path.exists(ref_codes_path):
        print("Encoding reference audio")
        ref_codes = tts.encode_reference(ref_audio_path)
        torch.save(ref_codes, ref_codes_path)
    else:
        print("Loading pre-encoded reference audio")
        # NOTE(review): torch.load may unpickle arbitrary objects depending on the
        # torch version / weights_only setting; only load cache files you trust.
        ref_codes = torch.load(ref_codes_path)

    print(f"Generating audio for input text: {input_text}")
    wav = tts.infer(input_text, ref_codes, ref_text)

    print(f"Saving output to {output_path}")
    # 24000 is passed as the sample rate of the written file.
    sf.write(output_path, wav, 24000)
if __name__ == "__main__":
    # Command-line entry point: parse the options and hand them to main().
    import argparse

    cli = argparse.ArgumentParser(description="NeuTTS Example")

    # The text to synthesise is the only mandatory option.
    cli.add_argument(
        "--input_text", type=str, required=True, help="Input text to be converted to speech"
    )

    # Remaining options are plain strings with defaults, registered in display order.
    optional_flags = (
        ("--ref_audio", "./samples/jo.wav", "Path to reference audio file"),
        ("--ref_text", "./samples/jo.txt", "Reference text corresponding to the reference audio"),
        ("--output_path", "output.wav", "Path to save the output audio"),
        ("--backbone", "neuphonic/neutts-nano", "Huggingface repo containing the backbone checkpoint"),
    )
    for flag, fallback, description in optional_flags:
        cli.add_argument(flag, type=str, default=fallback, help=description)

    options = cli.parse_args()
    main(
        input_text=options.input_text,
        ref_audio_path=options.ref_audio,
        ref_text=options.ref_text,
        backbone=options.backbone,
        output_path=options.output_path,
    )
```
## /examples/basic_streaming_example.py
```py path="/examples/basic_streaming_example.py"
import os
import torch
import numpy as np
from neutts import NeuTTS
import pyaudio
import time
import queue
import threading
def _read_if_path(value: str) -> str:
return open(value, "r", encoding="utf-8").read().strip() if os.path.exists(value) else value
def audio_player_thread(audio_queue, stream, prefill_chunks=0):
    """Pull items from ``audio_queue`` and write them to ``stream`` until ``None`` arrives.

    Up to ``prefill_chunks`` items are first collected from the queue before anything
    is written; after that, items are written as they are received. Every item taken
    from the queue, including the ``None`` sentinel, is acknowledged with
    ``task_done()``.

    Args:
        audio_queue: Queue yielding sliceable audio payloads, terminated by ``None``.
        stream: Output object whose ``write(data, exception_on_underflow=False)`` is
            called with consecutive slices of each payload.
        prefill_chunks: Number of items to collect before writing starts.
    """
    # Increase prefill_chunks if RTF is slow to allow for smooth playback
    PLAYBACK_CHUNK_BYTES = 2048

    def _write_sliced(payload):
        # Hand the payload to the stream in pieces of at most PLAYBACK_CHUNK_BYTES.
        for start in range(0, len(payload), PLAYBACK_CHUNK_BYTES):
            piece = payload[start : start + PLAYBACK_CHUNK_BYTES]
            stream.write(piece, exception_on_underflow=False)

    # Phase 1: collect the prefill items, stopping early if the sentinel shows up.
    prefilled = []
    for _ in range(prefill_chunks):
        item = audio_queue.get()
        prefilled.append(item)
        if item is None:
            break

    # Phase 2: write out what was collected; a collected sentinel ends the thread.
    for item in prefilled:
        if item is not None:
            _write_sliced(item)
        audio_queue.task_done()
        if item is None:
            return

    # Phase 3: write items as they arrive until the sentinel is received.
    while (item := audio_queue.get()) is not None:
        _write_sliced(item)
        audio_queue.task_done()
    audio_queue.task_done()  # acknowledge the sentinel itself
def main(input_text, ref_codes_path, ref_text, backbone):
    """Stream synthesized speech to the default audio output and print timing stats.

    Args:
        input_text: Text to synthesize, or a path to a UTF-8 text file holding it.
        ref_codes_path: Path to pre-encoded reference codes (loaded with ``torch.load``).
        ref_text: Reference transcript, or a path to a UTF-8 text file holding it.
        backbone: Hugging Face repo id of a GGUF backbone checkpoint.

    Raises:
        AssertionError: If ``backbone`` is not one of the supported GGUF checkpoints.
        FileNotFoundError: If ``ref_codes_path`` is empty or does not exist.
    """
    assert backbone in [
        "neuphonic/neutts-air-q4-gguf",
        "neuphonic/neutts-air-q8-gguf",
        "neuphonic/neutts-nano-q4-gguf",
        "neuphonic/neutts-nano-q8-gguf",
        "neuphonic/neutts-nano-french-q4-gguf",
        "neuphonic/neutts-nano-french-q8-gguf",
        "neuphonic/neutts-nano-spanish-q4-gguf",
        "neuphonic/neutts-nano-spanish-q8-gguf",
        "neuphonic/neutts-nano-german-q4-gguf",
        "neuphonic/neutts-nano-german-q8-gguf",
    ], "Must be a GGUF ckpt as streaming is only currently supported by llama-cpp."

    # Fail fast, before loading any model: without reference codes the
    # streaming call would only fail later with an opaque TypeError.
    if not ref_codes_path or not os.path.exists(ref_codes_path):
        raise FileNotFoundError(f"Reference codes file not found: {ref_codes_path}")

    # Initialize NeuTTS with the desired model and codec
    tts = NeuTTS(
        backbone_repo=backbone,
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec-onnx-decoder",
        codec_device="cpu",
    )

    input_text = _read_if_path(input_text)
    ref_text = _read_if_path(ref_text)
    ref_codes = torch.load(ref_codes_path)

    print(f"Generating audio for input text: {input_text}")
    p = pyaudio.PyAudio()
    stream = p.open(
        format=pyaudio.paInt16, channels=1, rate=tts.sample_rate, output=True
    )
    audio_queue = queue.Queue()
    player = threading.Thread(target=audio_player_thread, args=(audio_queue, stream))
    player.start()

    total_audio_samples = 0
    total_gen_time = 0.0
    chunk_count = 0
    start_time = time.perf_counter()
    last_yield_time = start_time

    print("Streaming...")
    print("-" * 80)
    try:
        for chunk in tts.infer_stream(input_text, ref_codes, ref_text):
            chunk_count += 1
            now = time.perf_counter()
            gen_duration = now - last_yield_time
            total_gen_time += gen_duration
            last_yield_time = now

            # Clip before scaling so out-of-range samples saturate instead of
            # wrapping around when cast to int16.
            audio = (np.clip(chunk, -1.0, 1.0) * 32767).astype(np.int16)
            audio_queue.put(audio.tobytes())
            total_audio_samples += audio.shape[0]

            # Per-chunk timing log for latency info
            chunk_ms_actual = audio.shape[0] / tts.sample_rate * 1000
            gen_ms = f"{gen_duration * 1000:6.1f}ms"
            # An empty chunk has no duration; avoid dividing by zero.
            rt_percent = (
                gen_duration / (chunk_ms_actual / 1000) * 100
                if chunk_ms_actual
                else float("inf")
            )
            # The first chunk's generation time is the time-to-first-audio.
            ttfa_tag = " (TTFA)" if chunk_count == 1 else ""
            print(
                f"Chunk {chunk_count:2d}: Generation Time={gen_ms}{ttfa_tag} │ Chunk Size={chunk_ms_actual:5.1f}ms │ {rt_percent:5.1f}% RT"
            )

        total_time = time.perf_counter() - start_time

        # Add a tail pad to avoid cutting off any final generation.
        tail_pad = np.zeros(int(0.25 * tts.sample_rate), dtype=np.int16)
        audio_queue.put(tail_pad.tobytes())
    finally:
        # Always send the sentinel: otherwise an exception during generation
        # leaves the non-daemon player thread blocked on queue.get() forever.
        audio_queue.put(None)
        player.join()
        stream.stop_stream()
        stream.close()
        p.terminate()

    total_audio_seconds = total_audio_samples / tts.sample_rate if total_audio_samples else 0.0

    # Print stats
    print("-" * 80)
    print(
        f"Streaming complete. Generated {total_audio_seconds:.2f}s of audio in {total_time:.2f}s."
    )
    if chunk_count:
        print(
            f" → Average generation time per chunk: {(total_gen_time / chunk_count) * 1000:.1f}ms"
        )
    if total_audio_seconds:
        rtf = total_time / total_audio_seconds
        print(f" → Real-Time Factor (RTF): {rtf:.2f}")
if __name__ == "__main__":
    import argparse

    # (flag, add_argument options) listed in the order they appear in --help.
    cli_options = [
        (
            "--input_text",
            {"required": True, "help": "Input text to be converted to speech"},
        ),
        (
            "--ref_codes",
            {"default": "./samples/jo.pt", "help": "Path to pre-encoded reference audio"},
        ),
        (
            "--ref_text",
            {
                "default": "./samples/jo.txt",
                "help": "Reference text corresponding to the reference audio",
            },
        ),
        (
            "--output_path",
            {"default": "output.wav", "help": "Path to save the output audio"},
        ),
        (
            "--backbone",
            {
                "default": "neuphonic/neutts-nano-q8-gguf",
                "help": "Huggingface repo containing the backbone checkpoint. Must be GGUF.",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description="NeuTTS Example")
    for flag, options in cli_options:
        parser.add_argument(flag, type=str, **options)
    args = parser.parse_args()

    main(
        input_text=args.input_text,
        ref_codes_path=args.ref_codes,
        ref_text=args.ref_text,
        backbone=args.backbone,
    )
```
## /examples/encode_reference.py
```py path="/examples/encode_reference.py"
# This file contains an example of how to use the NeuCodec model to encode reference audio into codes
import torch
from librosa import load
from neucodec import NeuCodec
def main(ref_audio_path, output_path="output.pt"):
    """Encode a reference audio file with NeuCodec and save the codes with ``torch.save``.

    Args:
        ref_audio_path: Path to the reference audio; it is loaded as 16 kHz mono.
        output_path: Destination file; must end with ``.pt``.

    Returns:
        None. If ``output_path`` does not end with ``.pt`` a message is printed
        and nothing is encoded or written.
    """
    print("Encoding reference audio")

    # Make sure output path ends with .pt
    if not output_path.endswith(".pt"):
        print("Output path should end with .pt to save the codes.")
        return

    # Initialize codec
    codec = NeuCodec.from_pretrained("neuphonic/neucodec")
    codec.eval().to("cpu")

    # Load and encode reference audio
    wav, _ = load(ref_audio_path, sr=16000, mono=True)  # load as 16kHz
    wav_tensor = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0)  # [1, 1, T]
    # Encoding is inference only; skip autograd bookkeeping, matching
    # NeuTTS.encode_reference.
    with torch.no_grad():
        ref_codes = codec.encode_code(audio_or_path=wav_tensor).squeeze(0).squeeze(0)

    # Save the codes
    torch.save(ref_codes, output_path)
if __name__ == "__main__":
    # Parse the command line, then hand the two paths to main().
    import argparse

    cli = argparse.ArgumentParser(description="NeuTTSAir Reference Encoding Example")
    cli.add_argument(
        "--ref_audio",
        default="./samples/jo.wav",
        type=str,
        help="Path to reference audio",
    )
    cli.add_argument(
        "--output_path",
        default="encoded_reference.pt",
        type=str,
        help="Path to save the output codes",
    )
    options = cli.parse_args()

    main(ref_audio_path=options.ref_audio, output_path=options.output_path)
```
## /examples/finetune.py
```py path="/examples/finetune.py"
import warnings
import re
import os
import torch
import phonemizer
from fire import Fire
from omegaconf import OmegaConf
from functools import partial
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
default_data_collator,
)
from loguru import logger as LOGGER
from datasets import load_dataset
warnings.filterwarnings("ignore")
# Dotted acronyms such as "U.S." or "e.g." (two or more letter-plus-period pairs).
ACRONYM = re.compile(r"(?:[a-zA-Z]\.){2,}")
# Two or more consecutive capital letters, e.g. "NASA".
ACRONYM_NO_PERIOD = re.compile(r"(?:[A-Z]){2,}")


def data_filter(sample):
    """Return True if ``sample["text"]`` passes the training-text filter.

    A sample is rejected when its text is empty, contains a digit, contains
    an acronym (dotted or all-caps), does not end with one of ``.,?!``, or
    contains a pound or dollar sign.

    Args:
        sample: Mapping with a ``"text"`` entry.

    Returns:
        bool: True to keep the sample, False to drop it.
    """
    text = sample["text"]
    if not text:
        return False
    if re.search(r"\d", text):
        return False
    if ACRONYM.search(text) or ACRONYM_NO_PERIOD.search(text):
        return False
    if text[-1] not in ".,?!":
        return False
    # Currency symbols are rejected alongside digits.
    if "£" in text or "$" in text:
        return False
    return True
def preprocess_sample(sample, tokenizer, max_len, g2p):
    """Turn one dataset sample into fixed-length training tensors.

    Args:
        sample: Mapping with ``"codes"``, ``"text"`` and ``"__key__"`` entries.
        tokenizer: Object providing ``convert_tokens_to_ids``, ``encode`` and
            ``pad_token_id``.
        max_len: Exact length of the returned sequences (padded or truncated).
        g2p: Object whose ``phonemize(list_of_str)`` returns a list of strings.

    Returns:
        dict with ``input_ids``, ``labels`` and ``attention_mask`` tensors, or
        None when phonemization produced nothing for this sample.
    """
    ignore_label = -100  # this is from LLaMA
    gen_start_id = tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_START|>")

    codec_ids = sample["codes"]
    transcript = sample["text"]

    phonemized = g2p.phonemize([transcript])
    if not phonemized or not phonemized[0]:
        LOGGER.warning(
            f"⚠️ Empty phonemization output for sample: {sample['__key__']} text={transcript}"
        )
        return None

    # Collapse any run of whitespace to a single space.
    phones = " ".join(phonemized[0].split())
    codes_str = "".join(f"<|speech_{i}|>" for i in codec_ids)

    chat = (
        f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{phones}<|TEXT_PROMPT_END|>"
        f"\nassistant:<|SPEECH_GENERATION_START|>{codes_str}<|SPEECH_GENERATION_END|>"
    )

    # Truncate first, then pad; padding is a no-op when already max_len long.
    token_ids = tokenizer.encode(chat)[:max_len]
    token_ids = token_ids + [tokenizer.pad_token_id] * (max_len - len(token_ids))

    input_ids = torch.tensor(token_ids, dtype=torch.long)

    # Everything before the first speech-generation-start token is ignored by the loss.
    labels = torch.full_like(input_ids, ignore_label)
    start_hits = (input_ids == gen_start_id).nonzero(as_tuple=True)[0]
    if start_hits.numel() > 0:
        first_hit = start_hits[0]
        labels[first_hit:] = input_ids[first_hit:]

    attention_mask = input_ids.ne(tokenizer.pad_token_id).long()

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": attention_mask,
    }
def main(config_fpath: str):
    """Fine-tune a causal-LM backbone on pre-encoded speech codes.

    Loads the YAML config, restores the tokenizer and model from
    ``restore_from``, filters and preprocesses the first 2000 training samples
    of the ``neuphonic/emilia-yodas-english-neucodec`` dataset, trains with the
    Hugging Face ``Trainer`` and saves the final model to
    ``<save_root>/<run_name>``.

    Args:
        config_fpath: Path to a YAML file loadable by OmegaConf. Keys read:
            ``save_root``, ``run_name``, ``restore_from``, ``max_seq_len``,
            ``lr``, ``max_steps``, ``per_device_train_batch_size``,
            ``warmup_ratio``, ``save_steps``, ``logging_steps`` and, when
            present, ``lr_scheduler_type`` and ``seed``.
    """
    # load config
    print(f"Loading config from {config_fpath}")
    config = OmegaConf.load(config_fpath)
    checkpoints_dir = os.path.join(config.save_root, config.run_name)
    LOGGER.info(f"Logging to: {checkpoints_dir}")

    restore_from = config.restore_from
    print(f"Loading checkpoint from {restore_from}")
    tokenizer = AutoTokenizer.from_pretrained(restore_from)
    model = AutoModelForCausalLM.from_pretrained(restore_from, torch_dtype="auto")

    g2p = phonemizer.backend.EspeakBackend(
        language="en-us",
        preserve_punctuation=True,
        with_stress=True,
        words_mismatch="ignore",
        language_switch="remove-flags",
    )
    partial_preprocess = partial(
        preprocess_sample,
        tokenizer=tokenizer,
        max_len=config.max_seq_len,
        g2p=g2p,
    )

    emilia_dataset = load_dataset(
        "neuphonic/emilia-yodas-english-neucodec",
        split="train[:2000]",
    )
    emilia_dataset = emilia_dataset.filter(data_filter).map(
        partial_preprocess, remove_columns=["text", "codes"]
    )

    training_args = TrainingArguments(
        output_dir=checkpoints_dir,
        do_train=True,
        learning_rate=config.lr,
        # Honor the scheduler and seed declared in the config; the fallbacks
        # are the TrainingArguments defaults, so configs without these keys
        # behave as before.
        lr_scheduler_type=config.get("lr_scheduler_type", "linear"),
        seed=config.get("seed", 42),
        max_steps=config.max_steps,
        bf16=True,
        per_device_train_batch_size=config.per_device_train_batch_size,
        warmup_ratio=config.warmup_ratio,
        save_steps=config.save_steps,
        logging_steps=config.logging_steps,
        save_strategy="steps",
        ignore_data_skip=True,
        dataloader_drop_last=True,
        # preprocess_sample already returns exactly the columns the model needs.
        remove_unused_columns=False,
        torch_compile=True,
        dataloader_num_workers=64,
    )

    trainer = Trainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=emilia_dataset,
        data_collator=default_data_collator,
    )
    trainer.train()
    trainer.save_model(checkpoints_dir)


if __name__ == "__main__":
    # Expose main() as a CLI: `python finetune.py <config_fpath>`.
    Fire(main)
```
## /examples/finetune_config.yaml
```yaml path="/examples/finetune_config.yaml"
# run info
restore_from: "neuphonic/neutts-air"
save_root: "/data"
run_name: "neutts-finetune"
# model info
codebook_size: 65536 # xcodec
max_seq_len: 2048
lr: 0.00004
lr_scheduler_type: "cosine"
warmup_ratio: 0.00
# train info
per_device_train_batch_size: 2
max_steps: 10000
logging_steps: 100
save_steps: 20000
seed: 1337
```
## /examples/interactive_example.ipynb
```ipynb path="/examples/interactive_example.ipynb"
{
"cells": [
{
"cell_type": "markdown",
"id": "a0fa9718",
"metadata": {},
"source": [
"Import required libraries"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "939f4fdc",
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"# Append the parent directory to sys.path to allow imports from the neutts package\n",
"sys.path.append('..')\n",
"from neutts import NeuTTS\n",
"from IPython.display import Audio"
]
},
{
"cell_type": "markdown",
"id": "e4e61937",
"metadata": {},
"source": [
"Downloads files and loads the model into memory"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cfabf9dd",
"metadata": {},
"outputs": [],
"source": [
"%%capture\n",
"tts = NeuTTS(\n",
" backbone_repo=\"neuphonic/neutts-nano-q8-gguf\",\n",
" backbone_device=\"cpu\",\n",
" codec_repo=\"neuphonic/neucodec\",\n",
" codec_device=\"cpu\"\n",
")"
]
},
{
"cell_type": "markdown",
"id": "75c87818",
"metadata": {},
"source": [
"Pick your speaker and type up your input text - and generate!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "914e5e00",
"metadata": {},
"outputs": [],
"source": [
"speaker = \"jo\" # default speakers are 'dave' and 'jo'\n",
"input_text = \"My name is Andy. I'm 25 and I just moved to London. The underground is pretty confusing, but it gets me around in no time at all.\"\n",
"\n",
"ref_text = f\"../samples/{speaker}.txt\"\n",
"ref_audio_path = f\"../samples/{speaker}.wav\"\n",
"\n",
"ref_text = open(ref_text, \"r\").read().strip()\n",
"ref_codes = tts.encode_reference(ref_audio_path)\n",
"wav = tts.infer(input_text, ref_codes, ref_text)"
]
},
{
"cell_type": "markdown",
"id": "e8da4a9d",
"metadata": {},
"source": [
"Listen to your generation!"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "21774af1",
"metadata": {},
"outputs": [],
"source": [
"Audio(wav, rate=24000)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
```
## /examples/onnx_example.py
```py path="/examples/onnx_example.py"
import os
import soundfile as sf
import torch
from neutts import NeuTTS
def main(input_text, ref_codes_path, ref_text, backbone, output_path="output.wav"):
    """Synthesize ``input_text`` with the ONNX codec decoder and write a WAV file.

    Args:
        input_text: Text to convert to speech.
        ref_codes_path: Path to pre-encoded reference codes (loaded with ``torch.load``).
        ref_text: Reference transcript, or a path to a UTF-8 text file holding it.
        backbone: Hugging Face repo id of the backbone checkpoint.
        output_path: Where to write the generated audio.

    Returns:
        None. If ``ref_codes_path`` or ``ref_text`` is empty a message is
        printed and nothing is generated.

    Raises:
        FileNotFoundError: If ``ref_codes_path`` does not exist.
    """
    if not ref_codes_path or not ref_text:
        print("No reference audio or text provided.")
        return None

    # Fail fast with a clear error, before loading any model; otherwise a
    # missing file would leave `ref_codes` unbound further down.
    if not os.path.exists(ref_codes_path):
        raise FileNotFoundError(f"Reference codes file not found: {ref_codes_path}")

    # Initialize NeuTTS with the desired model and codec
    tts = NeuTTS(
        backbone_repo=backbone,
        backbone_device="cpu",
        codec_repo="neuphonic/neucodec-onnx-decoder",
        codec_device="cpu",
    )

    # If ref_text names an existing path, read the transcript from it;
    # otherwise use the string as the transcript itself.
    if os.path.exists(ref_text):
        # Explicit UTF-8: the bundled sample transcripts contain non-ASCII text.
        with open(ref_text, "r", encoding="utf-8") as f:
            ref_text = f.read().strip()

    ref_codes = torch.load(ref_codes_path)

    print(f"Generating audio for input text: {input_text}")
    wav = tts.infer(input_text, ref_codes, ref_text)

    print(f"Saving output to {output_path}")
    sf.write(output_path, wav, tts.sample_rate)
if __name__ == "__main__":
    # Build the CLI from a table, parse it, and forward everything to main().
    import argparse

    cli_options = [
        (
            "--input_text",
            {"required": True, "help": "Input text to be converted to speech"},
        ),
        (
            "--ref_codes",
            {"default": "./samples/jo.pt", "help": "Path to pre-encoded reference audio"},
        ),
        (
            "--ref_text",
            {
                "default": "./samples/jo.txt",
                "help": "Reference text corresponding to the reference audio",
            },
        ),
        (
            "--output_path",
            {"default": "output.wav", "help": "Path to save the output audio"},
        ),
        (
            "--backbone",
            {
                "default": "neuphonic/neutts-nano",
                "help": "Huggingface repo containing the backbone checkpoint",
            },
        ),
    ]

    parser = argparse.ArgumentParser(description="NeuTTS Example")
    for flag, options in cli_options:
        parser.add_argument(flag, type=str, **options)
    args = parser.parse_args()

    main(
        input_text=args.input_text,
        ref_codes_path=args.ref_codes,
        ref_text=args.ref_text,
        backbone=args.backbone,
        output_path=args.output_path,
    )
```
## /neutts/__init__.py
```py path="/neutts/__init__.py"
from .neutts import NeuTTS, BACKBONE_LANGUAGE_MAP # noqa
# Both names imported above are part of the public API of the package.
__all__ = ["NeuTTS", "BACKBONE_LANGUAGE_MAP"]
```
## /neutts/neutts.py
```py path="/neutts/neutts.py"
import os
import random
from typing import Generator
from pathlib import Path
import librosa
import numpy as np
import torch
import re
import warnings
from neucodec import NeuCodec, DistillNeuCodec
from transformers import AutoTokenizer, AutoModelForCausalLM
from .phonemizers import BasePhonemizer, CUSTOM_PHONEMIZERS
# Maps each known Neuphonic backbone repo to a phonemizer language code.
# Every base repo is listed together with its "-q4-gguf" and "-q8-gguf" variants.
BACKBONE_LANGUAGE_MAP = {
    f"{base_repo}{variant}": language_code
    for base_repo, language_code in (
        # en models
        ("neuphonic/neutts-air", "en-us"),
        ("neuphonic/neutts-nano", "en-us"),
        # de models
        ("neuphonic/neutts-nano-german", "de"),
        # fr models
        ("neuphonic/neutts-nano-french", "fr-fr"),
        # es models
        ("neuphonic/neutts-nano-spanish", "es"),
    )
    for variant in ("", "-q4-gguf", "-q8-gguf")
}
def _linear_overlap_add(
    frames: list[np.ndarray], stride: int, power: float = 1.0
) -> np.ndarray:
    """Blend overlapping frames into one signal using triangular weights.

    Frame ``i`` is placed at offset ``i * stride`` along the last axis. Each
    frame is weighted by a triangular window (raised to ``power``) and the
    summed signal is divided by the summed weights.

    Original implementation:
    https://github.com/facebookresearch/encodec/blob/main/encodec/utils.py

    Args:
        frames: Non-empty list of arrays; the last axis is the one blended.
        stride: Offset between the starts of consecutive frames.
        power: Exponent applied to the triangular window.

    Returns:
        Array whose last axis spans from 0 to the furthest frame end.
    """
    assert len(frames)
    dtype = frames[0].dtype
    lead_shape = frames[0].shape[:-1]

    # The output must reach the end of whichever frame extends furthest.
    frame_ends = [stride * idx + frame.shape[-1] for idx, frame in enumerate(frames)]
    total_size = max([0] + frame_ends)

    weight_total = np.zeros(total_size, dtype=dtype)
    blended = np.zeros((*lead_shape, total_size), dtype=dtype)

    for idx, frame in enumerate(frames):
        begin = idx * stride
        length = frame.shape[-1]
        # Triangular window without the zero endpoints, so every sample
        # covered by a frame gets a strictly positive weight.
        ramp = np.linspace(0, 1, length + 2, dtype=dtype)[1:-1]
        window = (0.5 - np.abs(ramp - 0.5)) ** power
        blended[..., begin : begin + length] += window * frame
        weight_total[begin : begin + length] += window

    assert weight_total.min() > 0
    return blended / weight_total
class NeuTTS:
def __init__(
    self,
    backbone_repo="neuphonic/neutts-nano",
    backbone_device="cpu",
    codec_repo="neuphonic/neucodec",
    codec_device="cpu",
    language=None,
):
    """Set constants, then load the phonemizer, backbone, codec and watermarker.

    Args:
        backbone_repo: Backbone repo id or local path; passed to ``_load_backbone``.
        backbone_device: Device string passed to ``_load_backbone``.
        codec_repo: Codec repo id or local path; passed to ``_load_codec``.
        codec_device: Device string passed to ``_load_codec``.
        language: Phonemizer language code; when falsy it is resolved from
            ``backbone_repo`` by ``_load_phonemizer``.
    """
    # Consts
    self.sample_rate = 24_000
    self.max_context = 2048
    self.hop_length = 480
    self.streaming_overlap_frames = 1
    self.streaming_frames_per_chunk = 25
    self.streaming_lookforward = 5
    self.streaming_lookback = 50
    self.streaming_stride_samples = self.streaming_frames_per_chunk * self.hop_length

    # ggml & onnx flags
    self._is_quantized_model = False
    self._is_onnx_codec = False

    # HF tokenizer
    self.tokenizer = None

    # Load phonemizer + models
    print("Loading phonemizer...")
    self._load_phonemizer(language, backbone_repo)
    self._load_backbone(backbone_repo, backbone_device)
    self._load_codec(codec_repo, codec_device)

    # Load watermarker (optional)
    try:
        import perth

        self.watermarker = perth.PerthImplicitWatermarker()
    except (ImportError, AttributeError, TypeError) as e:
        # The `perth` module is distributed as `resemble-perth`, which is the
        # name this project declares as its dependency.
        warnings.warn(
            f"Perth watermarking unavailable: {e}. "
            "Audio will not be watermarked. "
            "Install with: pip install resemble-perth",
            stacklevel=2,
        )
        self.watermarker = None
def _load_phonemizer(self, language, backbone_repo):
    """Pick the phonemizer for ``language`` and store it on ``self.phonemizer``.

    Args:
        language: Language code; when falsy it is looked up from
            ``BACKBONE_LANGUAGE_MAP`` using ``backbone_repo``.
        backbone_repo: Backbone repo id used for that lookup.

    Raises:
        ValueError: If ``language`` is falsy and ``backbone_repo`` is not in
            ``BACKBONE_LANGUAGE_MAP``.
    """
    if not language:
        language = BACKBONE_LANGUAGE_MAP.get(backbone_repo)
        if language is None:
            raise ValueError(
                "If you aren't using a Neuphonic model, make sure to specify any "
                "eSpeak language code as the `language` parameter."
            )

    # Use a language-specific phonemizer when one is registered, else the generic one.
    try:
        self.phonemizer = CUSTOM_PHONEMIZERS[language]
    except KeyError:
        self.phonemizer = BasePhonemizer(language_code=language)
def _load_backbone(self, backbone_repo, backbone_device):
    """Load the backbone language model onto ``self.backbone``.

    Names ending in ``gguf`` are loaded through ``llama_cpp`` (from a local
    file when ``backbone_repo`` is an existing file, otherwise from the hub)
    and mark the model as quantized. Anything else is loaded with
    transformers, together with its tokenizer.

    Args:
        backbone_repo: Repo id or local path of the backbone.
        backbone_device: ``"gpu"`` enables GPU offload and flash attention for
            GGUF models; for transformers models it is passed to ``torch.device``.

    Raises:
        ImportError: If a GGUF backbone is requested but ``llama_cpp`` is missing.
    """
    print(f"Loading backbone from: {backbone_repo} on {backbone_device} ...")

    if backbone_repo.endswith("gguf"):
        try:
            from llama_cpp import Llama
        except ImportError as e:
            raise ImportError(
                "Failed to import `llama_cpp`. "
                "Please install it with:\n"
                "    pip install llama-cpp-python"
            ) from e

        # randint is inclusive on both ends; stay within the 32-bit seed range.
        seed = random.randint(0, 2**32 - 1)
        print(f"Using seed {seed}")

        use_gpu = backbone_device == "gpu"
        # Options shared by the local-file and hub loading paths.
        llama_kwargs = {
            "verbose": False,
            "n_gpu_layers": -1 if use_gpu else 0,
            "n_ctx": self.max_context,
            "mlock": True,
            "flash_attn": use_gpu,
            "seed": seed,
        }

        if os.path.isfile(backbone_repo):
            self.backbone = Llama(model_path=backbone_repo, **llama_kwargs)
        else:
            self.backbone = Llama.from_pretrained(
                repo_id=backbone_repo,
                filename="*.gguf",
                **llama_kwargs,
            )

        self._is_quantized_model = True
    else:
        self.tokenizer = AutoTokenizer.from_pretrained(backbone_repo)
        self.backbone = AutoModelForCausalLM.from_pretrained(backbone_repo).to(
            torch.device(backbone_device)
        )
def _load_codec(self, codec_repo, codec_device):
print(f"Loading codec from: {codec_repo} on {codec_device} ...")
if codec_repo.endswith(".onnx") and os.path.isfile(codec_repo):
try:
from neucodec import NeuCodecOnnxDecoder
except ImportError as e:
raise ImportError(
"Failed to import NeuCodecOnnxDecoder. "
"Make sure `neucodec` and `onnxruntime` are installed."
) from e
self.codec = NeuCodecOnnxDecoder(codec_repo)
self._is_onnx_codec = True
match codec_repo:
case "neuphonic/neucodec":
self.codec = NeuCodec.from_pretrained(codec_repo)
self.codec.eval().to(codec_device)
case "neuphonic/distill-neucodec":
self.codec = DistillNeuCodec.from_pretrained(codec_repo)
self.codec.eval().to(codec_device)
case "neuphonic/neucodec-onnx-decoder" | "neuphonic/neucodec-onnx-decoder-int8":
if codec_device != "cpu":
raise ValueError("Onnx decoder only currently runs on CPU.")
try:
from neucodec import NeuCodecOnnxDecoder
except ImportError as e:
raise ImportError(
"Failed to import the onnx decoder."
" Ensure you have onnxruntime installed as well as neucodec >= 0.0.4."
) from e
self.codec = NeuCodecOnnxDecoder.from_pretrained(codec_repo)
self._is_onnx_codec = True
case _:
raise ValueError(
"Invalid codec repo! Must be one of:"
" 'neuphonic/neucodec', 'neuphonic/distill-neucodec',"
" 'neuphonic/neucodec-onnx-decoder'."
)
def infer(self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str) -> np.ndarray:
"""
Perform inference to generate speech from text using the TTS model and reference audio.
Args:
text (str): Input text to be converted to speech.
ref_codes (np.ndarray | torch.tensor): Encoded reference.
ref_text (str): Reference text for reference audio. Defaults to None.
Returns:
np.ndarray: Generated speech waveform.
"""
# Generate tokens
if self._is_quantized_model:
output_str = self._infer_ggml(ref_codes, ref_text, text)
else:
prompt_ids = self._apply_chat_template(ref_codes, ref_text, text)
output_str = self._infer_torch(prompt_ids)
# Decode
wav = self._decode(output_str)
watermarked_wav = (
wav
if self.watermarker is None
else self.watermarker.apply_watermark(wav, sample_rate=24_000)
)
return watermarked_wav
def infer_stream(
self, text: str, ref_codes: np.ndarray | torch.Tensor, ref_text: str
) -> Generator[np.ndarray, None, None]:
"""
Perform streaming inference to generate speech from
text using the TTS model and reference audio.
Args:
text (str): Input text to be converted to speech.
ref_codes (np.ndarray | torch.tensor): Encoded reference.
ref_text (str): Reference text for reference audio. Defaults to None.
Yields:
np.ndarray: Generated speech waveform.
"""
if self._is_quantized_model:
return self._infer_stream_ggml(ref_codes, ref_text, text)
else:
raise NotImplementedError("Streaming is not implemented for the torch backend!")
def encode_reference(self, ref_audio_path: str | Path):
    """Encode a reference audio file into codec codes.

    Args:
        ref_audio_path: Path to the reference audio; it is loaded as 16 kHz mono.

    Returns:
        The codec's codes with the two leading singleton dimensions removed.
    """
    samples, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
    # Add two leading singleton dimensions -> [1, 1, T].
    batched = torch.from_numpy(samples).float()[None, None, :]
    with torch.no_grad():
        encoded = self.codec.encode_code(audio_or_path=batched)
    return encoded.squeeze(0).squeeze(0)
def _decode(self, codes: str):
    """Decode a string of ``<|speech_N|>`` tokens into a waveform.

    Args:
        codes: Text containing zero or more ``<|speech_N|>`` tokens; any other
            text is ignored.

    Returns:
        The ``[0, 0, :]`` slice of the codec's reconstruction.

    Raises:
        ValueError: If ``codes`` contains no speech tokens.
    """
    # Extract speech token IDs using regex
    speech_ids = [int(hit.group(1)) for hit in re.finditer(r"<\|speech_(\d+)\|>", codes)]
    if not speech_ids:
        raise ValueError("No valid speech tokens found in the output.")

    if self._is_onnx_codec:
        # Onnx decode: int32 numpy codes shaped [1, 1, T]
        code_array = np.array(speech_ids, dtype=np.int32)[np.newaxis, np.newaxis, :]
        recon = self.codec.decode_code(code_array)
    else:
        # Torch decode: long tensor shaped [1, 1, T] on the codec's device
        with torch.no_grad():
            code_tensor = torch.tensor(speech_ids, dtype=torch.long)[None, None, :].to(
                self.codec.device
            )
            recon = self.codec.decode_code(code_tensor).cpu().numpy()

    return recon[0, 0, :]
def _to_phones(self, text: str) -> str:
    """Phonemize ``text`` and collapse all whitespace runs to single spaces."""
    first_result = self.phonemizer.phonemize([text])[0]
    return " ".join(first_result.split())
def _apply_chat_template(
    self, ref_codes: list[int], ref_text: str, input_text: str
) -> list[int]:
    """Build the prompt token ids for the torch backend.

    The tokenized chat template has its ``<|TEXT_REPLACE|>`` placeholder
    replaced by the phonemized reference+input text (wrapped in the text
    prompt start/end tokens). Everything from ``<|SPEECH_REPLACE|>`` onward is
    replaced by the speech-generation-start token followed by the reference
    codes, so that generation continues the reference speech.

    Args:
        ref_codes: Codec codes of the reference audio.
        ref_text: Transcript of the reference audio.
        input_text: Text to synthesize.

    Returns:
        list[int]: Prompt token ids.
    """
    tokenizer = self.tokenizer
    to_id = tokenizer.convert_tokens_to_ids

    phonemes = f"{self._to_phones(ref_text)} {self._to_phones(input_text)}"

    speech_replace = to_id("<|SPEECH_REPLACE|>")
    speech_gen_start = to_id("<|SPEECH_GENERATION_START|>")
    text_replace = to_id("<|TEXT_REPLACE|>")
    text_prompt_start = to_id("<|TEXT_PROMPT_START|>")
    text_prompt_end = to_id("<|TEXT_PROMPT_END|>")

    text_ids = tokenizer.encode(phonemes, add_special_tokens=False)
    template_ids = tokenizer.encode(
        "user: Convert the text to speech:<|TEXT_REPLACE|>\nassistant:<|SPEECH_REPLACE|>"
    )

    # Swap the text placeholder for the wrapped phoneme ids.
    cut = template_ids.index(text_replace)
    template_ids = [
        *template_ids[:cut],
        text_prompt_start,
        *text_ids,
        text_prompt_end,
        *template_ids[cut + 1 :],
    ]

    # Drop the speech placeholder (and anything after it); append the reference codes.
    cut = template_ids.index(speech_replace)
    codes_str = "".join(f"<|speech_{i}|>" for i in ref_codes)
    code_ids = tokenizer.encode(codes_str, add_special_tokens=False)
    return [*template_ids[:cut], speech_gen_start, *code_ids]
def _infer_torch(self, prompt_ids: list[int]) -> str:
    """Sample a continuation of ``prompt_ids`` with the transformers backbone.

    Args:
        prompt_ids: Prompt token ids.

    Returns:
        str: The newly generated tokens (prompt excluded), decoded to text.
    """
    device = self.backbone.device
    prompt_tensor = torch.tensor(prompt_ids).unsqueeze(0).to(device)
    stop_id = self.tokenizer.convert_tokens_to_ids("<|SPEECH_GENERATION_END|>")

    sampling_options = {
        "max_length": self.max_context,
        "eos_token_id": stop_id,
        "do_sample": True,
        "temperature": 1.0,
        "top_k": 50,
        "use_cache": True,
        "min_new_tokens": 50,
    }
    with torch.no_grad():
        generated = self.backbone.generate(prompt_tensor, **sampling_options)

    # Keep only what was generated after the prompt.
    prompt_len = prompt_tensor.shape[-1]
    new_token_ids = generated[0, prompt_len:].cpu().numpy().tolist()
    return self.tokenizer.decode(new_token_ids, add_special_tokens=False)
def _infer_ggml(self, ref_codes: list[int], ref_text: str, input_text: str) -> str:
    """Generate speech tokens with the llama-cpp backbone in a single call.

    Args:
        ref_codes: Codec codes of the reference audio.
        ref_text: Transcript of the reference audio.
        input_text: Text to synthesize.

    Returns:
        str: The text of the first completion choice.
    """
    ref_phones = self._to_phones(ref_text)
    input_phones = self._to_phones(input_text)
    ref_speech_tokens = "".join(f"<|speech_{idx}|>" for idx in ref_codes)

    # The prompt ends with the reference codes so that generation continues them.
    prompt = (
        "user: Convert the text to speech:"
        f"<|TEXT_PROMPT_START|>{ref_phones} {input_phones}<|TEXT_PROMPT_END|>"
        f"\nassistant:<|SPEECH_GENERATION_START|>{ref_speech_tokens}"
    )

    completion = self.backbone(
        prompt,
        max_tokens=self.max_context,
        temperature=1.0,
        top_k=50,
        stop=["<|SPEECH_GENERATION_END|>"],
    )
    return completion["choices"][0]["text"]
# Stream speech from the llama-cpp backbone: build the same prompt as
# _infer_ggml, consume generated text piece by piece, and yield newly
# finished audio every `streaming_frames_per_chunk` tokens.
#
# Each chunk is decoded together with surrounding context tokens, cropped to
# the new region, and cross-faded with the previous chunks by
# `_linear_overlap_add`.
#
# Args: ref_codes (reference codec codes), ref_text (reference transcript),
#       input_text (text to synthesize).
# Yields: np.ndarray audio chunks, watermarked when a watermarker is loaded.
def _infer_stream_ggml(
self, ref_codes: torch.Tensor, ref_text: str, input_text: str
) -> Generator[np.ndarray, None, None]:
ref_text = self._to_phones(ref_text)
input_text = self._to_phones(input_text)
codes_str = "".join([f"<|speech_{idx}|>" for idx in ref_codes])
prompt = (
f"user: Convert the text to speech:<|TEXT_PROMPT_START|>{ref_text} {input_text}"
f"<|TEXT_PROMPT_END|>\nassistant:<|SPEECH_GENERATION_START|>{codes_str}"
)
# Cropped audio chunks produced so far (input to the overlap-add).
audio_cache: list[np.ndarray] = []
# Seeded with the reference tokens so early chunks can be decoded with
# the reference audio as look-back context.
token_cache: list[str] = [f"<|speech_{idx}|>" for idx in ref_codes]
# Number of output samples already yielded.
n_decoded_samples: int = 0
# Index into token_cache of the first token not yet emitted as audio;
# the reference tokens count as already handled.
n_decoded_tokens: int = len(ref_codes)
for item in self.backbone(
prompt,
max_tokens=self.max_context,
temperature=1.0,
top_k=50,
stop=["<|SPEECH_GENERATION_END|>"],
stream=True,
):
# NOTE(review): assumes each streamed item carries exactly one speech
# token - confirm against llama-cpp streaming behavior.
output_str = item["choices"][0]["text"]
token_cache.append(output_str)
# Wait until a full chunk plus the look-forward context is pending.
if (
len(token_cache[n_decoded_tokens:])
>= self.streaming_frames_per_chunk + self.streaming_lookforward
):
# decode chunk
# Decode window: look-back + overlap before the chunk, and
# look-forward + overlap after it (the slice below clamps to what exists).
tokens_start = max(
n_decoded_tokens - self.streaming_lookback - self.streaming_overlap_frames, 0
)
tokens_end = (
n_decoded_tokens
+ self.streaming_frames_per_chunk
+ self.streaming_lookforward
+ self.streaming_overlap_frames
)
# Position of the chunk inside the decoded window, in samples
# (one token corresponds to hop_length samples here).
sample_start = (n_decoded_tokens - tokens_start) * self.hop_length
sample_end = (
sample_start
+ (self.streaming_frames_per_chunk + 2 * self.streaming_overlap_frames)
* self.hop_length
)
curr_codes = token_cache[tokens_start:tokens_end]
recon = self._decode("".join(curr_codes))
# The watermark is applied to the whole decoded window before cropping.
recon = (
recon
if self.watermarker is None
else self.watermarker.apply_watermark(recon, sample_rate=24_000)
)
recon = recon[sample_start:sample_end]
audio_cache.append(recon)
# postprocess
# NOTE(review): this re-blends every cached chunk on each iteration, so
# the work grows with stream length; only the newest samples are kept.
processed_recon = _linear_overlap_add(
audio_cache, stride=self.streaming_stride_samples
)
# Emit only up to the start of the next chunk; the overlapping tail is
# held back until the next chunk has been blended in.
new_samples_end = len(audio_cache) * self.streaming_stride_samples
processed_recon = processed_recon[n_decoded_samples:new_samples_end]
n_decoded_samples = new_samples_end
n_decoded_tokens += self.streaming_frames_per_chunk
yield processed_recon
# final decoding handled separately as non-constant chunk size
remaining_tokens = len(token_cache) - n_decoded_tokens
if len(token_cache) > n_decoded_tokens:
# Decode the leftover tokens with look-back context; no look-forward exists.
tokens_start = max(
len(token_cache)
- (self.streaming_lookback + self.streaming_overlap_frames + remaining_tokens),
0,
)
# NOTE(review): unlike the loop above, this start offset also subtracts the
# overlap frames - confirm the asymmetry is intentional.
sample_start = (
len(token_cache) - tokens_start - remaining_tokens - self.streaming_overlap_frames
) * self.hop_length
curr_codes = token_cache[tokens_start:]
recon = self._decode("".join(curr_codes))
recon = (
recon
if self.watermarker is None
else self.watermarker.apply_watermark(recon, sample_rate=24_000)
)
recon = recon[sample_start:]
audio_cache.append(recon)
processed_recon = _linear_overlap_add(audio_cache, stride=self.streaming_stride_samples)
# Everything not yet yielded, including the final tail.
processed_recon = processed_recon[n_decoded_samples:]
yield processed_recon
```
## /neutts/phonemizers.py
```py path="/neutts/phonemizers.py"
from typing import Union, List
from phonemizer.backend import EspeakBackend
import platform
import glob
import os
from pathlib import Path
def _configure_espeak_library() -> bool:
    """Point phonemizer at an espeak-ng shared library.

    Three locations are tried in order; every stage is best-effort and any
    exception simply moves on to the next one:

    1. the library bundled next to this module,
    2. a ``neutts`` installation in the interpreter's site-packages,
    3. on macOS only, Homebrew installs of espeak-ng / espeak.

    Returns:
        True if a bundled library (stage 1 or 2) was configured, False otherwise.
    """
    # Stage 1: library shipped in this package's own directory.
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper

        package_dir = Path(__file__).parent
        bundled_patterns = {
            "Windows": ["espeak-ng*.dll"],
            "Darwin": ["libespeak-ng*.dylib"],
        }.get(platform.system(), ["libespeak-ng.so*", "libespeak-ng*.so"])

        bundled_lib = None
        for pattern in bundled_patterns:
            found = list(package_dir.glob(pattern))
            if found:
                bundled_lib = str(found[0])
                break

        if bundled_lib:
            EspeakWrapper.set_library(bundled_lib)
            # Point espeak-ng at the bundled data directory
            espeak_data = package_dir / "espeak-ng-data"
            if espeak_data.exists():
                os.environ["ESPEAK_DATA_PATH"] = str(espeak_data)
            return True
    except Exception:
        pass

    # Stage 2: the bundled library inside an installed neutts wheel. This
    # covers running from a source checkout whose environment has the wheel.
    try:
        import site
        from phonemizer.backend.espeak.wrapper import EspeakWrapper

        wheel_pattern = {
            "Windows": "espeak-ng*.dll",
            "Darwin": "libespeak-ng*.dylib",
        }.get(platform.system(), "libespeak-ng*.so*")

        for site_dir in site.getsitepackages():
            for candidate in Path(site_dir).glob(f"neutts/{wheel_pattern}"):
                EspeakWrapper.set_library(str(candidate))
                espeak_data = candidate.parent / "espeak-ng-data"
                if espeak_data.exists():
                    os.environ["ESPEAK_DATA_PATH"] = str(espeak_data)
                return True
    except Exception:
        pass

    # Stage 3: common Homebrew/system paths on macOS. This is not the bundled
    # build, so the function still reports False afterwards.
    if platform.system() == "Darwin":
        homebrew_globs = (
            "/opt/homebrew/Cellar/espeak-ng/*/lib/libespeak-ng.*.dylib",
            "/usr/local/Cellar/espeak-ng/*/lib/libespeak-ng.*.dylib",
            "/opt/homebrew/Cellar/espeak/*/lib/libespeak.*.dylib",
            "/usr/local/Cellar/espeak/*/lib/libespeak.*.dylib",
        )
        for pattern in homebrew_globs:
            matches = glob.glob(pattern)
            if not matches:
                continue
            try:
                from phonemizer.backend.espeak.wrapper import EspeakWrapper

                EspeakWrapper.set_library(matches[0])
            except Exception:
                pass
            # Only the first pattern that matches anything is tried.
            break

    return False


# Call before using phonemizer. Tracks whether we loaded the bundled espeak-ng.
_using_bundled_espeak = _configure_espeak_library()
class BasePhonemizer:
    """espeak-backed phonemizer with overridable pre- and post-processing hooks."""

    def __init__(self, language_code: str = None):
        """Create the espeak backend for ``language_code``.

        Raises:
            ValueError: If ``language_code`` is empty or None.
        """
        self.code = language_code
        if not self.code:
            raise ValueError(
                "A language code must be provided either via argument or subclass default"
            )

        backend_options = {
            "language": self.code,
            "preserve_punctuation": True,
            "with_stress": True,
            "words_mismatch": "ignore",
            "language_switch": "remove-flags",
        }
        self.g2p = EspeakBackend(**backend_options)
        self.espeak_version = self.g2p.version()  # returns (major, minor, patch)

        # Tell the user when a non-bundled espeak-ng build is in use.
        if not _using_bundled_espeak:
            version_str = ".".join(map(str, self.espeak_version))
            print(
                f"\nWARNING: You are using espeak-ng version {version_str}, which is not the "
                "supported version bundled with NeuTTS. This version is not supported and may "
                "not work as intended, particularly for non-English languages. "
                "To use the correct version, reinstall the package via pip: pip install neutts\n"
            )

    def preprocess(self, text: str) -> str:
        """Hook for language-specific text preprocessing; identity by default."""
        return text

    def clean(self, phonemes: str) -> str:
        """Hook for language-specific phoneme cleanup; identity by default."""
        return phonemes

    def phonemize(self, text: Union[str, List[str]]) -> Union[str, List[str]]:
        """Preprocess, phonemize and clean ``text``.

        A single string yields a single string; a list yields a list.
        """
        is_single = isinstance(text, str)
        batch = [text] if is_single else text

        prepared = [self.preprocess(entry) for entry in batch]
        raw_phonemes = self.g2p.phonemize(prepared)
        cleaned = [self.clean(entry) for entry in raw_phonemes]

        return cleaned[0] if is_single else cleaned
class FrenchPhonemizer(BasePhonemizer):
    """Phonemizer for French that strips dashes from the phoneme output."""

    def __init__(self, language_code: str = "fr-fr"):
        super().__init__(language_code)

    def clean(self, phonemes: str) -> str:
        """Return ``phonemes`` with every ``-`` removed.

        Dashes are common in the French output (they indicate syllables) but
        are not needed.
        """
        return "".join(phonemes.split("-"))


# Language codes that use a dedicated phonemizer instead of BasePhonemizer.
# The instance is created once, when this module is imported.
CUSTOM_PHONEMIZERS = {
    "fr-fr": FrenchPhonemizer(),
}
```
## /neuttsair/__init__.py
```py path="/neuttsair/__init__.py"
from .neutts import NeuTTSAir
__all__ = ["NeuTTSAir"]
```
## /neuttsair/neutts.py
```py path="/neuttsair/neutts.py"
from neutts.neutts import NeuTTS
class NeuTTSAir(NeuTTS):
    """Subclass of NeuTTS exposed under the ``neuttsair`` package.

    Adds no behaviour of its own: constructor arguments are forwarded
    unchanged to ``NeuTTS`` and all methods and attributes are inherited.
    """

    def __init__(self, *args, **kwargs):
        # Pass every positional and keyword argument straight through to NeuTTS.
        super().__init__(*args, **kwargs)
```
## /output.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/output.wav
## /pyproject.toml
```toml path="/pyproject.toml"
[build-system]
requires = ["scikit-build-core>=0.9"]
build-backend = "scikit_build_core.build"
[project]
name = "neutts"
version = "1.2.0"
authors = [
{ name="neuphonic", email="general@neuphonic.com" },
]
description = "NeuTTS - a package for text-to-speech generation using Neuphonic's TTS models."
readme = "README.md"
requires-python = ">=3.10, <3.14"
dependencies = [
"librosa==0.11.0",
"neucodec>=0.0.4",
"numpy~=2.2.6",
"phonemizer>=3.0.0",
"resemble-perth==1.0.1",
"soundfile==0.13.1",
"torch>=2.8.0",
"transformers~=4.56.1",
]
# Optional dependency groups (extras), e.g. `pip install neutts[llama]`
[project.optional-dependencies]
llama = ["llama-cpp-python"]
onnx = ["onnxruntime"]
all = ["llama-cpp-python", "onnxruntime"]
[tool.scikit-build]
# Python packages shipped in the wheel; cmake copies the espeak-ng-data directory into the package
wheel.packages = ["neutts", "neuttsair"]
```
## /requirements-dev.txt
pre-commit
pyaudio
pytest
## /samples/dave.pt
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/dave.pt
## /samples/dave.txt
So I'm live on radio. And I say, well, my dear friend James here clearly, and the whole room just froze. Turns out I'd completely misspoken and mentioned our other friend.
## /samples/dave.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/dave.wav
## /samples/greta.pt
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/greta.pt
## /samples/greta.txt
Es wurde eine Untersuchung zur Aufklärung des Unfalls eingeleitet.
## /samples/greta.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/greta.wav
## /samples/jo.pt
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/jo.pt
## /samples/jo.txt
So I just tried Neuphonic and I’m genuinely impressed. It's super responsive, it sounds clean, supports voice cloning, and the agent feature is fun to play with too. Highly recommend it for podcasts, conversations, or even just messing around with voiceovers.
## /samples/jo.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/jo.wav
## /samples/juliette.pt
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/juliette.pt
## /samples/juliette.txt
Dans les zones rurales où de nombreuses communautés n'ont pas accès à l'électricité, l'énergie solaire peut faire une énorme différence.
## /samples/juliette.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/juliette.wav
## /samples/mateo.pt
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/mateo.pt
## /samples/mateo.txt
Además su eficiencia depende del clima. En días nublados o durante la noche producen menos energía.
## /samples/mateo.wav
Binary file available at https://raw.githubusercontent.com/neuphonic/neutts-air/refs/heads/main/samples/mateo.wav
## /tests/__init__.py
```py path="/tests/__init__.py"
```
## /tests/test_neutts.py
```py path="/tests/test_neutts.py"
import os
import torch
import numpy as np
import pytest
from neutts import NeuTTS, BACKBONE_LANGUAGE_MAP
# Backbone repo ids are the keys of BACKBONE_LANGUAGE_MAP.
_ALL_BACKBONES = [*BACKBONE_LANGUAGE_MAP.keys()]

# Backbones exercised on every run; the remaining ones only run when the
# RUN_SLOW environment variable is set.
_QUICK_BACKBONES = ["neuphonic/neutts-air", "neuphonic/neutts-air-q4-gguf"]
_SLOW_BACKBONES = [repo for repo in _ALL_BACKBONES if repo not in _QUICK_BACKBONES]

# GGUF subsets, used to parametrize the streaming tests.
_SLOW_GGUF_BACKBONES = [repo for repo in _SLOW_BACKBONES if repo.endswith("gguf")]
_QUICK_GGUF_BACKBONES = [repo for repo in _QUICK_BACKBONES if repo.endswith("gguf")]

# Codec repos paired with every backbone under test.
CODECS = [
    "neuphonic/neucodec",
    "neuphonic/distill-neucodec",
    "neuphonic/neucodec-onnx-decoder",
]
@pytest.fixture()
def reference_data() -> tuple[torch.Tensor, str]:
    """Load the "dave" reference codes and transcript from ``samples/``.

    The sample paths are resolved relative to this test file instead of the
    current working directory, so the fixture works no matter where pytest
    is invoked from.

    Returns:
        A ``(ref_codes, ref_text)`` pair: the object stored in ``dave.pt``
        and the contents of ``dave.txt``.
    """
    repo_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    samples_dir = os.path.join(repo_root, "samples")

    ref_codes = torch.load(os.path.join(samples_dir, "dave.pt"))
    # Explicit encoding keeps the read independent of the platform default.
    with open(os.path.join(samples_dir, "dave.txt"), "r", encoding="utf-8") as f:
        ref_text = f.read()
    return ref_codes, ref_text
def _load_model(backbone, codec):
    """Build a CPU NeuTTS for the backbone+codec pair, failing the test if loading raises."""
    try:
        return NeuTTS(
            backbone_repo=backbone,
            backbone_device="cpu",
            codec_repo=codec,
            codec_device="cpu",
        )
    except Exception as e:
        pytest.fail(f"Failed to load combination {backbone} + {codec}: {e}")


def _run_inference_test(backbone, codec, reference_data):
    """Loads a backbone+codec pair and validates the audio output.

    Args:
        backbone: Backbone repo id passed to ``NeuTTS``.
        codec: Codec repo id passed to ``NeuTTS``.
        reference_data: ``(ref_codes, ref_text)`` pair from the fixture.
    """
    ref_codes, ref_text = reference_data
    model = _load_model(backbone, codec)

    audio = model.infer(text="Testing.", ref_codes=ref_codes, ref_text=ref_text)

    assert isinstance(audio, np.ndarray), "Output should be a numpy array"
    assert len(audio) > 0, "Generated audio should not be empty"
    assert not np.isnan(audio).any(), "Audio contains NaN values"
    assert audio.dtype in [np.float32, np.float64]
    print(f"Successfully generated {len(audio) / 24000:.2f}s of audio for {codec}")


def _run_streaming_test(backbone, codec, reference_data):
    """Loads a backbone+codec pair and validates streaming output.

    Args:
        backbone: Backbone repo id passed to ``NeuTTS``.
        codec: Codec repo id passed to ``NeuTTS``.
        reference_data: ``(ref_codes, ref_text)`` pair from the fixture.
    """
    ref_codes, ref_text = reference_data
    model = _load_model(backbone, codec)

    gen = model.infer_stream(
        "This is a streaming test that should be comprised of multiple chunks.",
        ref_codes,
        ref_text,
    )

    chunks = []
    for chunk in gen:
        # Every yielded chunk must be a numpy array.
        assert isinstance(chunk, np.ndarray)
        chunks.append(chunk)
    assert len(chunks) > 0, "Stream yielded no audio chunks"
@pytest.mark.parametrize("backbone", _QUICK_BACKBONES)
@pytest.mark.parametrize("codec", CODECS)
def test_model_loading_and_inference(backbone, codec, reference_data):
    """Run the inference check for every quick backbone and codec combination."""
    _run_inference_test(
        backbone=backbone,
        codec=codec,
        reference_data=reference_data,
    )
@pytest.mark.parametrize("backbone", _SLOW_BACKBONES)
@pytest.mark.parametrize("codec", CODECS)
def test_model_loading_and_inference_slow(backbone, codec, reference_data):
    """Run the inference check for the slow backbones; skipped unless RUN_SLOW is set."""
    slow_enabled = "RUN_SLOW" in os.environ
    if not slow_enabled:
        # pytest.skip raises, so nothing below runs when slow tests are disabled.
        pytest.skip("Skipping slow tests...")
    _run_inference_test(backbone, codec, reference_data)
@pytest.mark.parametrize("backbone", _QUICK_GGUF_BACKBONES)
@pytest.mark.parametrize("codec", CODECS)
def test_streaming_ggml(backbone, codec, reference_data):
    """Run the streaming check for every quick GGUF backbone and codec combination."""
    _run_streaming_test(
        backbone=backbone,
        codec=codec,
        reference_data=reference_data,
    )
@pytest.mark.parametrize("backbone", _SLOW_GGUF_BACKBONES)
@pytest.mark.parametrize("codec", CODECS)
def test_streaming_ggml_slow(backbone, codec, reference_data):
    """Run the streaming check for the slow GGUF backbones; skipped unless RUN_SLOW is set."""
    slow_enabled = "RUN_SLOW" in os.environ
    if not slow_enabled:
        # pytest.skip raises, so nothing below runs when slow tests are disabled.
        pytest.skip("Skipping slow tests...")
    _run_streaming_test(backbone, codec, reference_data)
```
The better and more specific the context, the better the LLM can follow instructions. If the context seems verbose, the user can refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.