```
├── .claude-plugin/
├── marketplace.json (100 tokens)
├── plugin.json (100 tokens)
├── .github/
├── workflows/
├── check-links.yaml (300 tokens)
├── claude-review.yml (400 tokens)
├── downstream-compat.yaml (100 tokens)
├── ghstack_land.yaml (1700 tokens)
├── integration-tests.yaml (400 tokens)
├── nightly.yaml (500 tokens)
├── pre-commit.yaml
├── publish-pypi.yaml (400 tokens)
├── pyright.yaml (100 tokens)
├── pytest.yaml (100 tokens)
├── scripts/
├── ghstack-perm-check.py (2.2k tokens)
├── smoke-test-evals.yaml (200 tokens)
├── smoke-test-recipes.yaml (300 tokens)
├── .gitignore
├── .pre-commit-config.yaml (100 tokens)
├── .sync_state
├── AGENTS.md (1200 tokens)
├── CHANGELOG.md (9.3k tokens)
├── CLAUDE.md
├── CONTRIBUTING.md (1500 tokens)
├── LICENSE (omitted)
├── README.md (1800 tokens)
├── assets/
├── tinker-cover.png
├── pyproject.toml (1200 tokens)
├── skills/
├── debug/
├── SKILL.md (6.9k tokens)
├── references/
├── async-task-dump.md (2000 tokens)
├── error-reference.md (1300 tokens)
├── merge-debugging.md (1200 tokens)
├── renderer-debugging.md (2000 tokens)
├── serialization-test.md (700 tokens)
├── research/
├── SKILL.md (5.7k tokens)
├── references/
├── cli.md (500 tokens)
├── dev.md (1300 tokens)
├── distillation.md (800 tokens)
├── hyperparams.md (600 tokens)
├── models.md (700 tokens)
├── ops.md (2.3k tokens)
├── preferences.md (2.1k tokens)
├── rl.md (2.3k tokens)
├── sdk.md (1800 tokens)
├── sft.md (1700 tokens)
├── tests/
├── __init__.py
├── compare_sampling_training_logprobs.py (1200 tokens)
├── conftest.py (300 tokens)
├── downstream_compat/
├── __init__.py
├── conftest.py (100 tokens)
├── sig_helpers.py (200 tokens)
├── test_checkpoint_utils.py (600 tokens)
├── test_cli_and_hyperparam.py (3.2k tokens)
├── test_completers.py (500 tokens)
├── test_install_skills.py (700 tokens)
├── test_model_info.py (500 tokens)
├── test_package_exports.py (1400 tokens)
├── test_recipes.py (1600 tokens)
├── test_renderers.py (2.2k tokens)
├── test_rl_train.py (600 tokens)
├── test_rl_types.py (1400 tokens)
├── test_supervised.py (400 tokens)
├── test_tokenizer_utils.py (300 tokens)
├── test_utils.py (600 tokens)
├── helpers.py (600 tokens)
├── integration/
├── __init__.py
├── test_math_dataset_builder.py (200 tokens)
├── recipes/
├── __init__.py
├── test_recipe_chat_sl.py (200 tokens)
├── test_recipe_dpo.py
├── test_recipe_guess_number.py (100 tokens)
├── test_recipe_math_rl.py (200 tokens)
├── test_recipe_off_policy_reasoning.py (100 tokens)
├── test_recipe_on_policy_distillation.py (100 tokens)
├── test_recipe_on_policy_multi_teacher.py (100 tokens)
├── test_recipe_rlhf_pipeline.py
├── test_recipe_rolling_checkpoints.py (600 tokens)
├── test_recipe_shorter.py
├── test_recipe_text_arena.py (100 tokens)
├── test_recipe_twenty_questions.py (100 tokens)
├── test_recipe_vlm_classifier.py (100 tokens)
├── test_inspect_eval.py (1100 tokens)
├── test_modal_sandbox.py (900 tokens)
├── third_party/
├── __init__.py
├── test_litellm.py (1100 tokens)
├── validate_temperature_logprobs.py (2.4k tokens)
├── weights/
├── __init__.py
├── conftest.py (1100 tokens)
├── gpu/
├── README.md (1100 tokens)
├── __init__.py
├── conftest.py (1700 tokens)
├── test_deepseek.py (700 tokens)
├── test_gpt_oss.py (600 tokens)
├── test_kimi.py (700 tokens)
├── test_kimi_k25.py (600 tokens)
├── test_nemotron.py (400 tokens)
├── test_qwen3.py (400 tokens)
├── test_qwen3_5.py (700 tokens)
├── test_qwen3_vl.py (600 tokens)
├── test_adapter_deepseek.py (600 tokens)
├── test_adapter_gpt_oss.py (800 tokens)
├── test_adapter_kimi.py (1200 tokens)
├── test_adapter_publish.py (900 tokens)
├── test_adapter_qwen3.py (1900 tokens)
├── test_adapter_qwen3_5.py (2.7k tokens)
├── test_download.py (300 tokens)
├── test_export_deepseek.py (2.5k tokens)
├── test_export_gpt_oss.py (800 tokens)
├── test_export_kimi_common.py (1300 tokens)
├── test_export_kimi_k2.py (1500 tokens)
├── test_export_kimi_k25.py (1700 tokens)
├── test_export_nemotron.py (1300 tokens)
├── test_export_qwen3.py (2.2k tokens)
├── test_export_qwen3_5.py (4.6k tokens)
├── test_lifecycle.py (800 tokens)
├── test_partial_lora_lifecycle.py (2.4k tokens)
├── test_profile_kimi.py (1500 tokens)
├── test_publish.py (1100 tokens)
├── test_quantized.py (4.9k tokens)
├── test_quantized_equivalence.py (3.8k tokens)
├── test_strategy_consistency.py (1200 tokens)
├── test_tokenizer_coverage.py (600 tokens)
├── test_weight_export_e2e.py (2.6k tokens)
├── verify_before_after.py (2000 tokens)
├── vllm_serving/
├── README.md (500 tokens)
├── __init__.py
├── conftest.py (600 tokens)
├── requirements.txt (100 tokens)
├── setup_env.sh (200 tokens)
├── test_deepseek.py (100 tokens)
├── test_gpt_oss.py (400 tokens)
├── test_kimi.py (200 tokens)
├── test_nemotron.py (2.5k tokens)
├── test_qwen3.py (1100 tokens)
├── test_qwen3_5.py (1200 tokens)
├── tinker_cookbook/
├── __init__.py (200 tokens)
├── chat_app/
├── README.md (300 tokens)
├── tinker_chat_cli.py (1200 tokens)
├── checkpoint_utils.py (6.5k tokens)
├── checkpoint_utils_test.py (5.3k tokens)
├── cli_utils.py (500 tokens)
├── cli_utils_test.py (300 tokens)
├── completers.py (1400 tokens)
├── display.py (400 tokens)
├── distillation/
├── __init__.py (300 tokens)
├── datasets.py (1900 tokens)
├── sdft.py (9k tokens)
├── train_off_policy.py (3.7k tokens)
├── train_on_policy.py (3.9k tokens)
├── eval/
├── README.md (3.3k tokens)
├── __init__.py (600 tokens)
├── benchmark_evaluator.py (700 tokens)
├── benchmarks/
├── __init__.py (600 tokens)
├── _arena_hard.py (1500 tokens)
├── _bfcl.py (1800 tokens)
├── _common.py (3.5k tokens)
├── _hmmt.py (1600 tokens)
├── _ifeval_verify.py (2.8k tokens)
├── _livecodebench.py (1900 tokens)
├── _longbench.py (1300 tokens)
├── _runner.py (7.8k tokens)
├── _swe_bench.py (3.2k tokens)
├── _tau2_bench.py (6.6k tokens)
├── _terminal_bench.py (2.8k tokens)
├── _types.py (5.3k tokens)
├── aime.py (1300 tokens)
├── benchmark_test.py (4.6k tokens)
├── ceval.py (1400 tokens)
├── gpqa.py (1100 tokens)
├── gsm8k.py (800 tokens)
├── ifbench.py (1300 tokens)
├── ifeval.py (1000 tokens)
├── math500.py (900 tokens)
├── mbpp.py (1100 tokens)
├── mmlu_pro.py (1000 tokens)
├── mmlu_redux.py (1400 tokens)
├── supergpqa.py (1000 tokens)
├── custom_evaluators.py (600 tokens)
├── custom_inspect_task.py (400 tokens)
├── evaluators.py (400 tokens)
├── inspect_evaluators.py (1100 tokens)
├── inspect_utils.py (3.1k tokens)
├── inspect_utils_test.py (2.7k tokens)
├── run_inspect_evals.py (500 tokens)
├── example_data/
├── conversations.jsonl (4.7k tokens)
├── multilingual.txt (33.3k tokens)
├── exceptions.py (1900 tokens)
├── exceptions_test.py (1000 tokens)
├── hyperparam_utils.py (2.9k tokens)
├── image_processing_utils.py (400 tokens)
├── image_processing_utils_test.py (400 tokens)
├── model_info.py (2.3k tokens)
├── model_info_test.py (800 tokens)
├── preference/
├── __init__.py (100 tokens)
├── comparison_policy_evaluator.py (1000 tokens)
├── dpo_datasets.py (1000 tokens)
├── preference_datasets.py (1900 tokens)
├── train_dpo.py (5.2k tokens)
├── types.py (2.3k tokens)
├── py.typed
├── recipes/
├── README.md (800 tokens)
├── chat_sl/
├── README.md (300 tokens)
├── chat_datasets.py (600 tokens)
├── results/
├── plots/
├── Qwen-Qwen3-235B-A22B-Instruct-2507_nll_curves.png
├── Qwen-Qwen3-30B-A3B-Base_nll_curves.png
├── Qwen-Qwen3-30B-A3B-Instruct-2507_nll_curves.png
├── Qwen-Qwen3-30B-A3B_nll_curves.png
├── Qwen-Qwen3-32B_nll_curves.png
├── Qwen-Qwen3-4B-Instruct-2507_nll_curves.png
├── Qwen-Qwen3-8B-Base_nll_curves.png
├── Qwen-Qwen3-8B_nll_curves.png
├── Qwen-Qwen3-VL-235B-A22B-Instruct_nll_curves.png
├── Qwen-Qwen3-VL-30B-A3B-Instruct_nll_curves.png
├── Qwen-Qwen3.5-27B_nll_curves.png
├── Qwen-Qwen3.5-35B-A3B_nll_curves.png
├── Qwen-Qwen3.5-397B-A17B_nll_curves.png
├── Qwen-Qwen3.5-4B_nll_curves.png
├── deepseek-ai-DeepSeek-V3.1-Base_nll_curves.png
├── meta-llama-Llama-3.1-70B_nll_curves.png
├── meta-llama-Llama-3.1-8B-Instruct_nll_curves.png
├── meta-llama-Llama-3.1-8B_nll_curves.png
├── meta-llama-Llama-3.2-1B_nll_curves.png
├── meta-llama-Llama-3.2-3B_nll_curves.png
├── meta-llama-Llama-3.3-70B-Instruct_nll_curves.png
├── moonshotai-Kimi-K2-Thinking_nll_curves.png
├── moonshotai-Kimi-K2.5_nll_curves.png
├── nvidia-NVIDIA-Nemotron-3-Nano-30B-A3B-BF16_nll_curves.png
├── nvidia-NVIDIA-Nemotron-3-Super-120B-A12B-BF16_nll_curves.png
├── openai-gpt-oss-120b_nll_curves.png
├── openai-gpt-oss-20b_nll_curves.png
├── sft_sweep.md (9.8k tokens)
├── sweep/
├── README.md (400 tokens)
├── __init__.py (100 tokens)
├── __main__.py
├── analyze.py (4.5k tokens)
├── cli.py (1800 tokens)
├── cli_test.py (700 tokens)
├── grid.py (400 tokens)
├── grid_test.py (600 tokens)
├── launch_all.py (2.2k tokens)
├── results.py (800 tokens)
├── results_test.py (1100 tokens)
├── runner.py (1600 tokens)
├── runner_test.py (1200 tokens)
├── train.py (1200 tokens)
├── code_rl/
├── README.md (600 tokens)
├── code_env.py (2.1k tokens)
├── code_grading.py (1200 tokens)
├── deepcoder_tool.py (900 tokens)
├── lcb_utils.py (6.2k tokens)
├── sandbox_config/
├── local.yaml (900 tokens)
├── train.py (900 tokens)
├── distillation/
├── README.md (1600 tokens)
├── harbor_multiturn.py (200 tokens)
├── harbor_multiturn_test.py (300 tokens)
├── off_policy_reasoning.py (1300 tokens)
├── on_policy_distillation.py (1200 tokens)
├── on_policy_distillation_harbor_multi_turn.py (1300 tokens)
├── on_policy_multi_teacher.py (1300 tokens)
├── harbor_rl/
├── README.md (1200 tokens)
├── eval.py (1600 tokens)
├── harbor_env.py (1800 tokens)
├── harbor_tools.py (1000 tokens)
├── harbor_tools_test.py (1900 tokens)
├── scripts/
├── .gitignore
├── eval_harbor_rl.py (600 tokens)
├── train_terminal_bench.py (100 tokens)
├── train.py (800 tokens)
├── math_rl/
├── README.md (700 tokens)
├── arithmetic_env.py (700 tokens)
├── math_env.py (3.1k tokens)
├── math_env_test.py (1000 tokens)
├── math_grading.py (3.4k tokens)
├── train.py (1200 tokens)
├── multiplayer_rl/
├── README.md (200 tokens)
├── guess_number/
├── README.md (900 tokens)
├── env.py (1300 tokens)
├── train.py (500 tokens)
├── text_arena/
├── README.md (1900 tokens)
├── env.py (2.4k tokens)
├── env_test.py (900 tokens)
├── play.py (1200 tokens)
├── tictactoe.py (1200 tokens)
├── train.py (600 tokens)
├── twenty_questions/
├── README.md (1000 tokens)
├── common_english_nouns.txt (200 tokens)
├── env.py (2.1k tokens)
├── train.py (500 tokens)
├── preference/
├── README.md (400 tokens)
├── datasets.py (2.5k tokens)
├── dpo/
├── README.md (200 tokens)
├── train.py (900 tokens)
├── rlhf/
├── README.md (500 tokens)
├── rlhf_pipeline.py (2000 tokens)
├── shorter/
├── README.md (400 tokens)
├── env.py (400 tokens)
├── train.py (500 tokens)
├── prompt_distillation/
├── README.md (800 tokens)
├── create_data.py (1600 tokens)
├── train.py (800 tokens)
├── rl_basic.py (300 tokens)
├── rl_loop.py (2000 tokens)
├── rubric/
├── README.md (900 tokens)
├── assets/
├── prometheus-reward.png
├── test-reward.png
├── data.py (1400 tokens)
├── debug_env.py (500 tokens)
├── env.py (2.1k tokens)
├── generate_data.py (400 tokens)
├── prometheus_experimental.py (1100 tokens)
├── train.py (1200 tokens)
├── sdft/
├── README.md (1800 tokens)
├── __init__.py
├── benchmark.py (2.9k tokens)
├── datasets.py (3.8k tokens)
├── eval.py (1800 tokens)
├── run_continual_learning.py (2.6k tokens)
├── sdft_test.py (4.7k tokens)
├── train.py (1100 tokens)
├── search_tool/
├── README.md (900 tokens)
├── chroma_pickle_test.py (200 tokens)
├── embedding.py (900 tokens)
├── offline_eval.py (1500 tokens)
├── search_env.py (1900 tokens)
├── tools.py (2k tokens)
├── train.py (900 tokens)
├── sl_basic.py (400 tokens)
├── sl_loop.py (1200 tokens)
├── true_thinking_score/
├── README.md (2.4k tokens)
├── __init__.py
├── analyze.py (1900 tokens)
├── run_small_experiment.py (1100 tokens)
├── tts.py (3.8k tokens)
├── tts_test.py (1000 tokens)
├── verifiers_rl/
├── README.md (500 tokens)
├── evaluate.py (1100 tokens)
├── tinker_openai.py (2000 tokens)
├── train.py (1100 tokens)
├── verifiers_env.py (1400 tokens)
├── verifiers_pickle_test.py (200 tokens)
├── vlm_classifier/
├── README.md (400 tokens)
├── data.py (3.5k tokens)
├── eval.py (3k tokens)
├── eval_sweep.py (1800 tokens)
├── sweep.py (1500 tokens)
├── train.py (1100 tokens)
├── renderers/
├── README.md (400 tokens)
├── __init__.py (2.6k tokens)
├── base.py (14.8k tokens)
├── deepseek_v3.py (4.8k tokens)
├── deepseek_v3_test.py (2.1k tokens)
├── gpt_oss.py (6k tokens)
├── gpt_oss_test.py (1700 tokens)
├── image_token_count_test.py (700 tokens)
├── kimi_k2.py (5.7k tokens)
├── kimi_k25.py (1600 tokens)
├── kimi_k25_test.py (6.1k tokens)
├── kimi_k26.py (500 tokens)
├── kimi_k26_test.py (2.5k tokens)
├── kimi_k2_5_tool_declaration_ts.py (3.2k tokens)
├── kimi_k2_test.py (5.9k tokens)
├── kimi_k2_tool_declaration_test.py (2000 tokens)
├── llama3.py (700 tokens)
├── nemotron3.py (4.6k tokens)
├── nemotron3_test.py (8.7k tokens)
├── parsing_test.py (3.4k tokens)
├── qwen3.py (5.4k tokens)
├── qwen3_5.py (2.6k tokens)
├── qwen3_test.py (5.5k tokens)
├── qwen3_tool_declaration_test.py (2.7k tokens)
├── renderer_pickle_test.py (1300 tokens)
├── renderers_test.py (12.1k tokens)
├── role_colon.py (1200 tokens)
├── role_colon_test.py (700 tokens)
├── testing_utils.py (200 tokens)
├── tool_calling_test.py (2.5k tokens)
├── rl/
├── __init__.py (200 tokens)
├── builder_pickle_test.py (900 tokens)
├── data_processing.py (1800 tokens)
├── interleaved.py (3.2k tokens)
├── interleaved_test.py (4.2k tokens)
├── message_env.py (1400 tokens)
├── message_env_test.py (5.6k tokens)
├── metric_util.py (3k tokens)
├── metric_util_test.py (1200 tokens)
├── metrics.py (2.2k tokens)
├── multiturn_weight_assignment_test.py (2k tokens)
├── play_w_env.py (800 tokens)
├── preference_envs.py (3.9k tokens)
├── problem_env.py (1300 tokens)
├── problem_env_test.py (800 tokens)
├── rollout_error_resilience_test.py (3.2k tokens)
├── rollout_logging.py (1900 tokens)
├── rollout_logging_test.py (300 tokens)
├── rollout_strategy.py (2.4k tokens)
├── rollouts.py (3.5k tokens)
├── shutdown_test.py (2k tokens)
├── train.py (16.9k tokens)
├── types.py (3.2k tokens)
├── sandbox/
├── README.md (500 tokens)
├── __init__.py (100 tokens)
├── modal_sandbox.py (2.6k tokens)
├── sandbox_interface.py (600 tokens)
├── sandboxfusion.py (800 tokens)
├── scripts/
├── copy_checkpoint.py (900 tokens)
├── merge_tinker_adapter_to_hf_model.py (500 tokens)
├── save_audit_log.py (1100 tokens)
├── test_tool_calling_e2e.py (1400 tokens)
├── stores/
├── __init__.py (200 tokens)
├── _incremental.py (800 tokens)
├── eval_store.py (2.7k tokens)
├── eval_store_test.py (900 tokens)
├── registry.py (1600 tokens)
├── storage.py (4.8k tokens)
├── storage_test.py (2.4k tokens)
├── training_store.py (2.8k tokens)
├── training_store_test.py (3.5k tokens)
├── supervised/
├── __init__.py (200 tokens)
├── common.py (1600 tokens)
├── common_test.py (900 tokens)
├── data.py (3.5k tokens)
├── interleaved_test.py (1400 tokens)
├── nll_evaluator.py (500 tokens)
├── resume_test.py (1200 tokens)
├── train.py (4.9k tokens)
├── types.py (800 tokens)
├── viz_sft_dataset.py (400 tokens)
├── third_party/
├── __init__.py
├── litellm/
├── README.md (1400 tokens)
├── __init__.py
├── provider.py (3.4k tokens)
├── provider_test.py (3.6k tokens)
├── openai_compat.py (300 tokens)
├── openai_compat_test.py (800 tokens)
├── tokenizer_utils.py (1100 tokens)
├── tokenizer_utils_test.py (600 tokens)
├── tool_use/
├── README.md (600 tokens)
├── __init__.py (100 tokens)
├── agent_tool_message_env.py (1300 tokens)
├── agent_tool_message_env_test.py (2.5k tokens)
├── tools.py (2.2k tokens)
├── types.py (300 tokens)
├── utils/
├── __init__.py
├── code_state.py (1000 tokens)
├── deprecation.py (1600 tokens)
├── deprecation_test.py (1500 tokens)
├── file_utils.py (100 tokens)
├── format_colorized.py (400 tokens)
├── git_rev.py (100 tokens)
├── git_rev_test.py (100 tokens)
├── logtree.py (6.9k tokens)
├── logtree_formatters.py (1300 tokens)
├── logtree_test.py (5.2k tokens)
├── lr_scheduling.py (300 tokens)
├── misc_utils.py (1200 tokens)
├── ml_log.py (4.7k tokens)
├── ml_log_test.py (300 tokens)
├── trace.py (7k tokens)
├── trace_test.py (5.3k tokens)
├── weights/
├── README.md (1600 tokens)
├── __init__.py (300 tokens)
├── _adapter.py (4.5k tokens)
├── _artifacts.py (2.3k tokens)
├── _download.py (800 tokens)
├── _export/
├── __init__.py (3k tokens)
├── _full.py (600 tokens)
├── _quant_format.py (900 tokens)
├── _quantized.py (4.4k tokens)
├── _shard.py (600 tokens)
├── _shard_engine.py (3k tokens)
├── _shard_mx_block.py (1700 tokens)
├── _shard_packed_int4.py (1500 tokens)
├── _merge.py (5k tokens)
├── _merge_deepseek.py (600 tokens)
├── _merge_default.py (700 tokens)
├── _merge_gpt_oss.py (700 tokens)
├── _merge_kimi_k25.py (1100 tokens)
├── _merge_nemotron.py (400 tokens)
├── _merge_qwen3_5.py (1500 tokens)
├── _merge_utils.py (3.7k tokens)
├── _model_card.py (1100 tokens)
├── _mxfp4.py (1200 tokens)
├── _packed_int4.py (900 tokens)
├── _publish.py (500 tokens)
├── adapter_test.py (8.7k tokens)
├── artifacts_test.py (1700 tokens)
├── download_test.py (900 tokens)
├── export_test.py (5k tokens)
├── merge_test.py (15.3k tokens)
├── model_card_test.py (1700 tokens)
├── mxfp4_test.py (1100 tokens)
├── publish_test.py (400 tokens)
├── quantized_test.py (9.9k tokens)
├── stress_test.py (3.5k tokens)
├── xmux/
├── README.md (500 tokens)
├── __init__.py
├── control.py (3.6k tokens)
├── core.py (4.8k tokens)
├── examples/
├── async_rl_sweep.py (600 tokens)
├── fake_train.py (500 tokens)
├── ml_sweep.py (1700 tokens)
├── run_job.py (300 tokens)
├── utils.py (1700 tokens)
├── tutorials/
├── 101_hello_tinker.py (1300 tokens)
├── 102_first_sft.py (3.9k tokens)
├── 103_async_patterns.py (2k tokens)
├── 104_first_rl.py (2.7k tokens)
├── 201_rendering.py (2.8k tokens)
├── 202_loss_functions.py (2.4k tokens)
├── 203_completers.py (2000 tokens)
├── 204_weights.py (2.3k tokens)
├── 205_evaluations.py (2.5k tokens)
├── 301_cookbook_abstractions.py (3.8k tokens)
├── 302_custom_environment.py (4k tokens)
├── 303_sft_with_config.py (1300 tokens)
├── 304_rl_with_config.py (1500 tokens)
├── 401_sl_hyperparams.py (1500 tokens)
├── 402_rl_hyperparams.py (1400 tokens)
├── 403_dpo_preferences.py (1400 tokens)
├── 404_sequence_extension.py (1500 tokens)
├── 405_multi_agent.py (1800 tokens)
├── 406_prompt_distillation.py (2.7k tokens)
├── 407_rlhf_pipeline.py (2.8k tokens)
├── 501_export_hf.py (1300 tokens)
├── 502_lora_adapter.py (1100 tokens)
├── 503_publish_hub.py (900 tokens)
├── README.md (1700 tokens)
```
## /.claude-plugin/marketplace.json
```json path="/.claude-plugin/marketplace.json"
{
"name": "tinker-skills",
"owner": {
"name": "Thinking Machines Lab",
"email": "tinker@thinkingmachines.ai"
},
"metadata": {
"description": "Claude Code skills for fine-tuning language models with the Tinker API.",
"version": "0.3.0"
},
"plugins": [
{
"name": "tinker",
"description": "Skills for fine-tuning language models with the Tinker API — research, debugging, and more.",
"source": "./",
"strict": true
}
]
}
```
## /.claude-plugin/plugin.json
```json path="/.claude-plugin/plugin.json"
{
"name": "tinker",
"description": "Skills for fine-tuning language models with the Tinker API — research, debugging, and more.",
"version": "0.3.0",
"author": {
"name": "Thinking Machines Lab",
"email": "tinker@thinkingmachines.ai"
},
"homepage": "https://thinkingmachines.ai/tinker/",
"repository": "https://github.com/thinking-machines-lab/tinker-cookbook",
"license": "MIT",
"keywords": ["tinker", "fine-tuning", "rl", "sft", "dpo", "llm"],
"skills": "skills/"
}
```
## /.github/workflows/check-links.yaml
```yaml path="/.github/workflows/check-links.yaml"
# Checks that relative links in Markdown files resolve to paths that exist in the checkout.
name: check-links
# Runs on pushes to main and on pull requests, in both cases only when a *.md file changed.
on:
push:
branches: [main]
paths: ["**/*.md"]
pull_request:
paths: ["**/*.md"]
jobs:
check-links:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
# The checker is an inline Python script handed to python3 -c inside a
# double-quoted shell string, so the script itself uses only single quotes.
- name: Check relative links in Markdown files
run: |
python3 -c "
# NOTE: this script is passed to python3 -c inside a double-quoted shell
# string, so it must not contain double-quote, backtick or dollar characters;
# docstrings therefore use triple single quotes.
import glob
import os
import re
import sys
from urllib.parse import unquote

# Inline Markdown link [text](destination); group 2 is the raw destination.
LINK_RE = re.compile(r'\[([^\]]*)\]\(([^)]+)\)')
# Optional quoted title that may follow the destination, as in
# [text](guide.md 'Guide'). \x22 and \x27 are the double and single quote
# characters, spelled as escapes because of the shell quoting noted above.
TITLE_RE = re.compile(r'\s+(?:\x22[^\x22]*\x22|\x27[^\x27]*\x27)\s*\Z')
# Destinations that never refer to a file in the checkout.
SKIPPED_PREFIXES = ('http://', 'https://', '#', 'mailto:')


def link_path(link):
    '''Return the file path part of a Markdown link destination.

    Strips surrounding whitespace, an optional quoted title, optional
    <angle brackets> and any #fragment. Returns an empty string for external
    links, pure anchors and empty destinations, i.e. whenever there is
    nothing to look up on disk.
    '''
    dest = TITLE_RE.sub('', link.strip())
    if len(dest) > 1 and dest.startswith('<') and dest.endswith('>'):
        dest = dest[1:-1]
    if dest.startswith(SKIPPED_PREFIXES):
        return ''
    return dest.split('#')[0]


def target_exists(md_dir, path, cache):
    '''Return True if path, resolved relative to md_dir, exists on disk.

    The path is tried as written and then percent-decoded
    (my%20file.md -> my file.md), so a link is only reported when neither
    spelling exists. Lookups are memoized in cache, keyed by normalized path.
    '''
    for candidate in (path, unquote(path)):
        resolved = os.path.normpath(os.path.join(md_dir, candidate))
        if resolved not in cache:
            cache[resolved] = os.path.exists(resolved)
        if cache[resolved]:
            return True
    return False


def find_broken_links(pattern='**/*.md'):
    '''Scan Markdown files matching pattern (relative to the current directory).

    Returns a list of (markdown_file, line_number, raw_link) tuples, one per
    relative link whose target does not exist.
    '''
    cache = {}
    broken = []
    for md in glob.glob(pattern, recursive=True):
        md_dir = os.path.dirname(md)
        # Decode as UTF-8 regardless of the runner locale; a stray undecodable
        # byte must not abort the whole check.
        with open(md, encoding='utf-8', errors='replace') as f:
            for lineno, line in enumerate(f, 1):
                for m in LINK_RE.finditer(line):
                    link = m.group(2)
                    path = link_path(link)
                    if path and not target_exists(md_dir, path, cache):
                        broken.append((md, lineno, link))
    return broken


def main():
    '''Print one GitHub error annotation per broken link; return the exit code.'''
    broken = find_broken_links()
    if not broken:
        print('All relative links OK')
        return 0
    for src, lineno, link in broken:
        print(f'::error file={src},line={lineno}::{src}:{lineno}: broken link -> {link}')
    return 1


# python3 -c runs the script with __name__ set to __main__, so the workflow
# still executes the check; importing the module (for tests) does not.
if __name__ == '__main__':
    sys.exit(main())
"
```
## /.github/workflows/claude-review.yml
```yml path="/.github/workflows/claude-review.yml"
# Runs the Claude Code action when a trusted user mentions @claude in a new
# issue, an issue/PR comment, or a PR review comment.
name: Claude Code
permissions:
contents: write # allow Claude to edit files & push commits
pull-requests: write # allow PR comments/reviews & PR creation
issues: write # allow issue comments & labels
actions: read
on:
# Respond to @claude mentions in PRs & issues (trusted users only)
issue_comment:
types: [created]
pull_request_review_comment:
types: [created]
issues:
types: [opened]
env:
# Arguments forwarded to the action through `claude_args` below: the model,
# a cap on the number of turns, and the allowlist of tools Claude may use.
CLAUDE_ARGS: >
--model claude-opus-4-5-20251101
--max-turns 50
--allowedTools "Read" "Write" "Edit" "MultiEdit"
"Glob" "Grep" "LS"
"Bash(git:*)" "Bash(gh:*)"
"mcp__github_inline_comment__create_inline_comment"
jobs:
claude_mention:
name: Respond to @claude in issues & PRs
runs-on: ubuntu-latest
# Only trusted users (OWNER, MEMBER, COLLABORATOR) can trigger Claude
# One clause per event type: the two comment events carry the text and the
# author association under `comment`, the `issues` event under `issue`.
if: >
(github.event_name == 'issue_comment' &&
contains(github.event.comment.body, '@claude') &&
contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)) ||
(github.event_name == 'pull_request_review_comment' &&
contains(github.event.comment.body, '@claude') &&
contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)) ||
(github.event_name == 'issues' &&
contains(github.event.issue.body, '@claude') &&
contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association))
steps:
# Shallow checkout: only the latest commit is fetched.
- name: Checkout repository
uses: actions/checkout@v5
with:
fetch-depth: 1
- name: Claude on @mention
uses: anthropics/claude-code-action@v1
with:
anthropic_api_key: ${{ secrets.ANTHROPIC_API_KEY }}
github_token: ${{ secrets.GITHUB_TOKEN }}
claude_args: ${{ env.CLAUDE_ARGS }}
```
## /.github/workflows/downstream-compat.yaml
```yaml path="/.github/workflows/downstream-compat.yaml"
# Runs the tests under tests/downstream_compat/ on every pull request, on
# pushes to main, and on manual dispatch.
name: downstream-compat
on:
workflow_dispatch:
push:
branches: [main]
pull_request:
jobs:
downstream-compat:
runs-on: ubuntu-latest
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
# Create the virtualenv and install the project with every optional extra.
- name: venv
run: uv venv && uv sync --all-extras
- name: downstream compat tests
run: uv run pytest tests/downstream_compat/ -v
env:
# Passed to the test process as an environment variable.
HF_TOKEN: ${{ secrets.HF_TOKEN }}
```
## /.github/workflows/ghstack_land.yaml
```yaml path="/.github/workflows/ghstack_land.yaml"
name: ghstack land
# Land a PR by commenting `/land` on it.
#
# /land Land a ghstack stack bottom-up once every PR is
# approved and CI is green (waits for in-flight CI).
# /land --force <reason> Skip CI. For a ghstack PR this lands the stack
# anyway; for a regular PR it does an admin squash
# merge. A reason is mandatory.
#
# Landing pushes to `main` (a protected branch) and must be able to trigger the
# normal push CI, so the default `GITHUB_TOKEN` is not enough. Configure a
# GitHub App with `contents: write` + `pull_requests: write` that is allowed to
# bypass branch protection, and store its credentials as repo secrets:
# BOT_APP_ID the App's id
# BOT_APP_PRIVATE_KEY the App's private key (PEM)
on:
issue_comment:
types: [created]
jobs:
ghstack_land:
# `issue_comment` runs in the base repo with access to secrets no matter who
# commented, so on a public repo the trigger must be gated to trusted users.
# `OWNER`/`MEMBER` means a member of the org that owns this repo
# (thinking-machines-lab); `COLLABORATOR` is intentionally excluded so that
# outside collaborators cannot land.
# The first clause restricts the job to comments made on pull requests.
# NOTE(review): `startsWith(..., '/land')` is a prefix match, so a comment
# such as `/landing ...` also passes this gate — confirm that is acceptable.
if: >
github.event.issue.pull_request &&
startsWith(github.event.comment.body, '/land') &&
contains(fromJSON('["OWNER", "MEMBER"]'), github.event.comment.author_association)
runs-on: ubuntu-latest
steps:
# Mint a token for the bot App from the two repo secrets; every later step
# that talks to GitHub authenticates with this token.
- uses: actions/create-github-app-token@v1
id: app-token
with:
app-id: ${{ secrets.BOT_APP_ID }}
private-key: ${{ secrets.BOT_APP_PRIVATE_KEY }}
# React to the triggering comment with a rocket emoji.
- name: Acknowledge the comment
run: |
curl -X POST \
-H "Authorization: token ${{ steps.app-token.outputs.token }}" \
-H "Accept: application/vnd.github.v3+json" \
"${{ github.api_url }}/repos/${{ github.repository }}/issues/comments/${{ github.event.comment.id }}/reactions" \
-d '{"content":"rocket"}'
# Sets two step outputs: `force` (true/false) and `reason` (one line of text,
# empty unless forcing). The comment body reaches the script through the
# COMMENT environment variable rather than being interpolated into it.
- name: Parse --force flag
id: parse
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
COMMENT: ${{ github.event.comment.body }}
run: |
# Parse the `/land` comment (in $COMMENT) into two step outputs:
#   force   true when `--force` was given as a flag, false otherwise
#   reason  the single-line force reason (empty when not forcing)
# Fails the step, after commenting on the PR, when --force has no reason.

# Collapse the comment to a single line before anything else: newlines in the
# reason would otherwise inject extra `key=value` pairs into $GITHUB_OUTPUT.
FLAT=$(printf '%s' "$COMMENT" | tr '\n\r' ' ')
# `--force` only counts as the flag when it is a whitespace-delimited word.
# A plain substring test would also fire on e.g. `--forceful`, which the sed
# below does not strip, so the leftover text would be accepted as the reason
# and the landing would take the force path by accident.
FORCE_RE='(^|[[:space:]])--force([[:space:]]|$)'
if [[ "$FLAT" =~ $FORCE_RE ]]; then
  # Everything after `/land --force` is the (mandatory) reason: drop the flag,
  # drop the leading `/land`, then squeeze and trim whitespace.
  REASON=$(printf '%s' "$FLAT" \
    | sed -E 's/(^|[[:space:]])--force([[:space:]]|$)/ /g' \
    | sed -E 's/^[[:space:]]*\/land[[:space:]]*//' \
    | sed -E 's/[[:space:]]+/ /g; s/^ //; s/ $//')
  if [[ -z "$REASON" ]]; then
    echo "::error::--force requires a reason"
    gh pr comment "${{ github.event.issue.html_url }}" \
      --body "⚠️ Force landing failed: provide a reason, e.g. \`/land --force <reason>\`"
    exit 1
  fi
  echo "force=true" >> "$GITHUB_OUTPUT"
  echo "reason=$REASON" >> "$GITHUB_OUTPUT"
else
  echo "force=false" >> "$GITHUB_OUTPUT"
  echo "reason=" >> "$GITHUB_OUTPUT"
fi
# Looks up the PR's head branch through the REST API and sets the step outputs
# pr_number, pr_branch, pr_url and is_ghstack.
# NOTE(review): curl runs with -s but without --fail, so an API error is not
# detected here; jq would then presumably print `null` for the branch and the
# PR would be classified as non-ghstack — confirm that is acceptable.
- name: Get PR details
id: get-pr
run: |
PR_NUMBER=${{ github.event.issue.number }}
PR_DATA=$(curl -s \
-H "Authorization: token ${{ steps.app-token.outputs.token }}" \
-H "Accept: application/vnd.github.v3+json" \
"${{ github.api_url }}/repos/${{ github.repository }}/pulls/$PR_NUMBER")
PR_HEAD_REF=$(echo "$PR_DATA" | jq -r .head.ref)
PR_URL="${{ github.server_url }}/${{ github.repository }}/pull/$PR_NUMBER"
echo "pr_number=$PR_NUMBER" >> "$GITHUB_OUTPUT"
echo "pr_branch=$PR_HEAD_REF" >> "$GITHUB_OUTPUT"
echo "pr_url=$PR_URL" >> "$GITHUB_OUTPUT"
# ghstack PRs use the gh/<user>/<n>/head branch convention.
if [[ "$PR_HEAD_REF" =~ ^gh/[^/]+/[0-9]+/head$ ]]; then
echo "is_ghstack=true" >> "$GITHUB_OUTPUT"
else
echo "is_ghstack=false" >> "$GITHUB_OUTPUT"
fi
# --- Regular (non-ghstack) PRs ---------------------------------------
# Without --force a regular PR is refused: explain why on the PR and fail the job.
- name: Reject /land on a regular PR without --force
if: ${{ steps.get-pr.outputs.is_ghstack == 'false' && steps.parse.outputs.force == 'false' }}
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
run: |
gh pr comment "${{ steps.get-pr.outputs.pr_url }}" \
--body "ℹ️ \`/land\` is only for ghstack PRs. For a regular PR, use the merge button or \`/land --force <reason>\` to admin-merge."
exit 1
# Full-history checkout with the App token; runs only on the forced regular-PR path.
- uses: actions/checkout@v4
if: ${{ steps.get-pr.outputs.is_ghstack == 'false' && steps.parse.outputs.force == 'true' }}
with:
token: ${{ steps.app-token.outputs.token }}
fetch-depth: 0
# Posts who forced the merge and why, then squash-merges with --admin. If the
# merge fails, comments a link to this workflow run and fails the job.
- name: Force admin-merge regular PR
if: ${{ steps.get-pr.outputs.is_ghstack == 'false' && steps.parse.outputs.force == 'true' }}
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
FORCE_REASON: ${{ steps.parse.outputs.reason }}
run: |
gh pr comment "${{ steps.get-pr.outputs.pr_url }}" \
--body "⚠️ **Force landing** (admin merge) by @${{ github.event.comment.user.login }}: $FORCE_REASON"
if ! gh pr merge "${{ steps.get-pr.outputs.pr_number }}" --admin --squash; then
gh pr comment "${{ steps.get-pr.outputs.pr_url }}" \
--body "❌ Admin merge failed. See: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
exit 1
fi
# --- ghstack PRs ------------------------------------------------------
# Full-history checkout with the App token for the ghstack path.
- uses: actions/checkout@v4
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' }}
with:
token: ${{ steps.app-token.outputs.token }}
fetch-depth: 0
- uses: actions/setup-python@v5
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' }}
with:
python-version: "3.12"
- name: Install uv
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' }}
uses: astral-sh/setup-uv@v6
# Runs the permission-check script with the PR number, head branch and repo.
# This step is skipped entirely when --force was given.
- name: Require approvals and green CI
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' && steps.parse.outputs.force == 'false' }}
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
# The branch name is attacker-influenced, so pass it via env and quote
# it below rather than interpolating into the run block directly.
PR_NUMBER: ${{ github.event.issue.number }}
PR_BRANCH: ${{ steps.get-pr.outputs.pr_branch }}
REPO: ${{ github.repository }}
run: |
uvx --with requests python .github/workflows/scripts/ghstack-perm-check.py \
"$PR_NUMBER" "$PR_BRANCH" "$REPO"
# Records who forced the landing and why as a PR comment.
- name: Log force landing
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' && steps.parse.outputs.force == 'true' }}
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
FORCE_REASON: ${{ steps.parse.outputs.reason }}
run: |
gh pr comment "${{ steps.get-pr.outputs.pr_url }}" \
--body "⚠️ **Force landing** by @${{ github.event.comment.user.login }}: $FORCE_REASON"
# Sets the git identity, writes ~/.ghstackrc containing the App token, then
# runs `ghstack land` (pinned to 0.11.0) on the PR URL. If landing fails,
# comments a link to this workflow run and fails the job.
- name: Land it
if: ${{ steps.get-pr.outputs.is_ghstack == 'true' }}
env:
GITHUB_TOKEN: ${{ steps.app-token.outputs.token }}
run: |
git config --global user.email "bot@users.noreply.github.com"
git config --global user.name "ghstack bot"
cat <<EOF > ~/.ghstackrc
[ghstack]
github_url = github.com
github_oauth = $GITHUB_TOKEN
github_username = ghstack-bot
remote_name = origin
EOF
if ! uvx --with ghstack==0.11.0 ghstack land "${{ steps.get-pr.outputs.pr_url }}"; then
gh pr comment "${{ steps.get-pr.outputs.pr_url }}" \
--body "Failed to land via ghstack (likely a merge conflict). See: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
exit 1
fi
```
## /.github/workflows/integration-tests.yaml
```yaml path="/.github/workflows/integration-tests.yaml"
# Runs the integration tests and the per-file weights tests after each push to
# main, or on manual dispatch.
name: integration-tests
on:
push:
branches: [main]
workflow_dispatch:
jobs:
# Runs everything under tests/integration/ in a single job.
integration:
runs-on: ubuntu-latest
name: integration tests
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
- name: pytest (integration)
run: uv run pytest tests/integration/ -v
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
TINKER_API_KEY: ${{ secrets.TINKER_API_KEY }}
# One job per tests/weights/<test>.py file listed in the matrix. Only runs in
# the upstream repository; each job is capped at 20 minutes, and fail-fast is
# off so one failing file does not cancel the others.
weights:
if: github.repository == 'thinking-machines-lab/tinker-cookbook'
runs-on: ubuntu-latest
timeout-minutes: 20
strategy:
fail-fast: false
matrix:
test:
- test_export_gpt_oss
- test_export_qwen3
- test_export_qwen3_5
- test_export_deepseek
- test_adapter_qwen3
- test_adapter_gpt_oss
- test_adapter_qwen3_5
- test_adapter_kimi
- test_adapter_deepseek
- test_strategy_consistency
- test_quantized
- test_quantized_equivalence
- test_lifecycle
- test_publish
- test_adapter_publish
name: ${{ matrix.test }}
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
# NOTE(review): --timeout presumably comes from the pytest-timeout plugin —
# confirm it is installed by `uv sync --all-extras`.
- name: run test
run: uv run pytest tests/weights/${{ matrix.test }}.py -v --timeout=120
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
TINKER_API_KEY: ${{ secrets.TINKER_API_KEY }}
```
## /.github/workflows/nightly.yaml
```yaml path="/.github/workflows/nightly.yaml"
# Builds the package and republishes the rolling `nightly` prerelease, then
# force-moves the `nightly` branch to the built commit.
name: nightly
# Triggered when the smoke-test-recipes workflow completes, or manually.
on:
workflow_run:
workflows: ["smoke-test-recipes"]
types: [completed]
workflow_dispatch:
permissions:
contents: write # needed to create/delete releases
jobs:
build-and-release:
runs-on: ubuntu-latest
# Only run on the upstream repo (forks lack secrets and shouldn't publish releases)
# and only if: manually triggered, or smoke tests passed on schedule
if: >
github.repository == 'thinking-machines-lab/tinker-cookbook' &&
(github.event_name == 'workflow_dispatch' ||
(github.event.workflow_run.conclusion == 'success' &&
github.event.workflow_run.event == 'schedule'))
steps:
# Full history (fetch-depth: 0) rather than a shallow clone.
- name: checkout
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: build
run: uv build
# NOTE(review): this imports the package through `uv run`, which presumably
# uses the project environment rather than the artifacts built into dist/
# above — confirm that is the intended smoke test.
- name: smoke test
run: |
uv run python -c "import tinker_cookbook; print(f'Version: {tinker_cookbook.__version__}')"
# Exposes the package's __version__ as the step output `version`.
- name: get version
id: version
run: |
VERSION=$(uv run python -c "import tinker_cookbook; print(tinker_cookbook.__version__)")
echo "version=$VERSION" >> "$GITHUB_OUTPUT"
- name: upload artifacts
uses: actions/upload-artifact@v4
with:
name: tinker-cookbook-nightly
path: dist/
retention-days: 7
# `|| true` keeps the job going if the delete fails, for example when there
# is no nightly release yet.
- name: delete existing nightly release
run: gh release delete nightly --yes --cleanup-tag || true
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Writes the release notes (build time, version, commit link, pip install
# command) to /tmp/release-notes.md, then creates the `nightly` prerelease
# with everything in dist/ attached.
- name: create nightly release
run: |
VERSION="${{ steps.version.outputs.version }}"
REPO="${{ github.repository }}"
SHORT_SHA="${GITHUB_SHA::8}"
{
echo "Automated nightly build from \`main\` at $(date -u '+%Y-%m-%d %H:%M UTC')."
echo ""
echo "**Version:** \`${VERSION}\`"
echo "**Commit:** [\`${SHORT_SHA}\`](https://github.com/${REPO}/commit/${GITHUB_SHA})"
echo ""
echo "### Install"
echo "\`\`\`bash"
echo "pip install 'tinker_cookbook @ https://github.com/${REPO}/releases/download/nightly/tinker_cookbook-${VERSION}-py3-none-any.whl'"
echo "\`\`\`"
} > /tmp/release-notes.md
gh release create nightly dist/* \
--prerelease \
--title "Nightly Build (${VERSION})" \
--notes-file /tmp/release-notes.md
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
# Force-pushes the checked-out commit to the `nightly` branch.
- name: Update nightly branch
run: git push origin HEAD:refs/heads/nightly --force
```
## /.github/workflows/pre-commit.yaml
```yaml path="/.github/workflows/pre-commit.yaml"
name: pre-commit
on:
push:
branches: [ main ]
pull_request:
jobs:
pre-commit:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: pre-commit
uses: pre-commit/action@v3.0.1
```
## /.github/workflows/publish-pypi.yaml
```yaml path="/.github/workflows/publish-pypi.yaml"
# Build the package from a release tag, verify the built version matches the
# tag, smoke-test the import, and publish to PyPI.
name: publish-pypi
on:
  push:
    tags: ["v[0-9]+.[0-9]+.[0-9]+"] # only semver tags like v1.2.3
  workflow_dispatch:
    inputs:
      tag:
        description: "Git tag to publish (e.g. v0.2.0). Must already exist."
        required: true
jobs:
  publish:
    runs-on: ubuntu-latest
    if: github.repository == 'thinking-machines-lab/tinker-cookbook'
    steps:
      - name: determine ref
        id: ref
        # The user-supplied tag is passed through the environment instead of
        # being interpolated into the script text, so it can never be parsed
        # as shell syntax.
        env:
          DISPATCH_TAG: ${{ inputs.tag }}
        run: |
          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            echo "ref=$DISPATCH_TAG" >> "$GITHUB_OUTPUT"
          else
            echo "ref=$GITHUB_REF" >> "$GITHUB_OUTPUT"
          fi
      - name: checkout
        uses: actions/checkout@v4
        with:
          ref: ${{ steps.ref.outputs.ref }}
          fetch-depth: 0 # hatch-vcs needs full history for version
      - name: install-uv
        uses: astral-sh/setup-uv@v6
        with:
          enable-cache: true
      - name: build
        run: uv build
      - name: verify version matches tag
        env:
          DISPATCH_TAG: ${{ inputs.tag }}
        run: |
          BUILT_VERSION=$(ls dist/*.tar.gz | sed 's/.*tinker_cookbook-//;s/\.tar\.gz//')
          TAG_VERSION="${GITHUB_REF_NAME#v}"
          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            # On manual dispatch GITHUB_REF_NAME is the branch the workflow was
            # started from, so compare against the requested tag instead.
            TAG_VERSION="${DISPATCH_TAG#v}"
          fi
          echo "Built version: $BUILT_VERSION"
          echo "Tag version: $TAG_VERSION"
          if [ "$BUILT_VERSION" != "$TAG_VERSION" ]; then
            echo "ERROR: Built version ($BUILT_VERSION) does not match tag ($TAG_VERSION)"
            exit 1
          fi
      - name: run smoke test
        run: |
          uv run python -c "import tinker_cookbook; print(f'Version: {tinker_cookbook.__version__}')"
      - name: publish
        run: uv publish --token="$PYPI_TOKEN"
        env:
          PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
```
## /.github/workflows/pyright.yaml
```yaml path="/.github/workflows/pyright.yaml"
name: pyright
on:
push:
branches: [main]
pull_request:
jobs:
type-check:
runs-on: ubuntu-latest
name: type-check
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
- name: pyright
run: uv run pyright tinker_cookbook
```
## /.github/workflows/pytest.yaml
```yaml path="/.github/workflows/pytest.yaml"
name: pytest
on:
workflow_dispatch:
push:
branches: [main]
pull_request:
jobs:
test:
runs-on: ubuntu-latest
name: test
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
- name: pytest (unit)
run: uv run pytest tinker_cookbook/
env:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
```
## /.github/workflows/scripts/ghstack-perm-check.py
```py path="/.github/workflows/scripts/ghstack-perm-check.py"
#!/usr/bin/env python3
"""Gate a ghstack `/land` on approvals and green CI.
Given the top PR of a ghstack stack, this:
1. Reconstructs the stack by reading the `Pull-Request-resolved` trailers from
the commits on the PR's `orig` branch.
2. Requires every not-yet-merged PR in the stack to have at least one approval.
3. Waits for the top PR's checks to settle and only succeeds when GitHub
reports the PR as mergeable (clean). Fails fast on conflicts or failing
required checks.
Relies only on GitHub-native status/check-runs, so it works for any CI setup.
"""
import argparse
import os
import re
import subprocess
import time
from typing import Any, Literal
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
# Only count approvals from people with write access to the repo, so two
# outside accounts can't approve each other's PRs to clear the gate.
TRUSTED_ASSOCIATIONS = {"OWNER", "MEMBER", "COLLABORATOR"}
def classify_checks(
    statuses: list[dict[str, Any]],
    check_runs: list[dict[str, Any]],
) -> tuple[Literal["pending", "failed", "success"], list[str]]:
    """Summarise a commit's statuses and check-runs into one verdict.

    No context or check names are hardcoded; every entry passed in counts.

    Args:
        statuses: Legacy commit status dicts; ``state`` and ``context`` are read.
        check_runs: Check-run dicts; ``status``, ``conclusion`` and ``name``
            are read.

    Returns:
        ``("failed", names)`` if anything failed, otherwise
        ``("pending", names)`` if anything is still running, otherwise
        ``("success", [])``. Failures take precedence over pending entries.
        Names from ``statuses`` come before names from ``check_runs``.
    """
    # Legacy commit statuses (e.g. third-party integrations). A missing
    # ``context`` is reported under the placeholder name "status".
    failed: list[str] = [
        entry.get("context", "status")
        for entry in statuses
        if entry.get("state") in ("failure", "error")
    ]
    pending: list[str] = [
        entry.get("context", "status")
        for entry in statuses
        if entry.get("state") == "pending"
    ]

    # GitHub Actions / check-runs: anything not yet completed is pending; a
    # completed run counts as failed unless its conclusion is a passing one.
    passing_conclusions = ("success", "neutral", "skipped")
    for check in check_runs:
        label = check.get("name", "check")
        if check.get("status") != "completed":
            pending.append(label)
        elif check.get("conclusion") not in passing_conclusions:
            failed.append(label)

    if failed:
        return "failed", failed
    if pending:
        return "pending", pending
    return "success", []
def main() -> None:
    """Gate a ghstack land on approvals and on the top PR being mergeable.

    Reads ``pr_number``, ``head_ref`` and ``repo`` (plus the optional
    ``--max-wait-time``) from the command line and the API token from the
    ``GITHUB_TOKEN`` environment variable. Any failed check is printed, posted
    as a comment on the PR, and ends the process with exit status 1.
    """
    parser = argparse.ArgumentParser(description="Check ghstack PR approvals and CI status")
    parser.add_argument("pr_number", type=int, help="PR number to check")
    parser.add_argument("head_ref", help="Head reference of the PR")
    parser.add_argument("repo", help="Repository in owner/repo format")
    parser.add_argument(
        "--max-wait-time",
        type=int,
        default=1800,
        help="Maximum wait time in seconds for checks to settle",
    )
    args = parser.parse_args()

    gh = requests.Session()
    gh.headers.update(
        {
            "Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}",
            "Accept": "application/vnd.github+json",
            "X-GitHub-Api-Version": "2022-11-28",
        }
    )
    # Retry transient GitHub-side 5xx / rate-limit responses. urllib3's Retry
    # ships with `requests`, so this needs no extra dependency.
    retry = Retry(
        total=5,
        backoff_factor=2.0,  # exponential backoff between attempts
        status_forcelist=(429, 500, 502, 503, 504),
        # POST is listed so the comment-posting calls are retried as well.
        allowed_methods=frozenset(["GET", "POST"]),
        raise_on_status=False,
        respect_retry_after_header=True,
    )
    adapter = HTTPAdapter(max_retries=retry)
    gh.mount("https://", adapter)
    gh.mount("http://", adapter)

    NUMBER, head_ref, REPO = args.pr_number, args.head_ref, args.repo
    MAX_WAIT_TIME = args.max_wait_time

    def must(cond: Any, msg: str) -> None:
        """Exit with status 1 when ``cond`` is falsy, after reporting ``msg`` on the PR."""
        if not cond:
            print(msg)
            gh.post(
                f"https://api.github.com/repos/{REPO}/issues/{NUMBER}/comments",
                json={"body": f"ghstack land check failed: {msg}"},
            )
            # SystemExit directly: the `exit` builtin is only injected by `site`.
            raise SystemExit(1)

    print(head_ref)
    must(
        head_ref and re.match(r"^gh/[\w-]+/\d+/head$", head_ref),
        "Not a ghstack PR",
    )
    # Swap only the trailing component; a plain str.replace would also rewrite
    # a "/head" occurring earlier in the ref (e.g. a username starting with "head").
    orig_ref = head_ref.removesuffix("/head") + "/orig"

    def git_fetch_with_retry(ref: str, *, attempts: int = 5, initial_backoff: float = 2.0) -> bool:
        """Run ``git fetch origin <ref>``, retrying with doubling backoff.

        Returns True on the first successful fetch, False once all attempts fail.
        """
        backoff = initial_backoff
        for attempt in range(1, attempts + 1):
            # Argument list instead of a shell string: the ref is never
            # interpreted by a shell.
            rc = subprocess.run(["git", "fetch", "origin", ref]).returncode
            if rc == 0:
                return True
            print(f"git fetch origin {ref} failed (attempt {attempt}/{attempts}, exit={rc})")
            if attempt < attempts:
                print(f"  retrying in {backoff:.0f}s...")
                time.sleep(backoff)
                backoff *= 2
        return False

    print(":: Fetching newest main...")
    must(git_fetch_with_retry("main"), "Can't fetch main")
    print(":: Fetching orig branch...")
    must(git_fetch_with_retry(orig_ref), "Can't fetch orig branch")

    # Resolve the merge base separately so its failure is detected. Inside a
    # shell `$(...)` a failed merge-base would expand to nothing and `git log`
    # would silently run over a different range.
    merge_base = subprocess.run(
        ["git", "merge-base", "FETCH_HEAD", "origin/main"],
        stdout=subprocess.PIPE,
    )
    must(merge_base.returncode == 0, "`git merge-base` command failed!")
    base_sha = merge_base.stdout.decode("utf-8").strip()
    log = subprocess.run(
        ["git", "log", f"FETCH_HEAD...{base_sha}"],
        stdout=subprocess.PIPE,
    )
    must(log.returncode == 0, "`git log` command failed!")

    # Each stack commit carries a trailer pointing at its PR; `git log` prints
    # newest first, so the first match must be the PR being landed.
    pr_numbers = [
        int(number)
        for number in re.findall(
            r"Pull[- ]Request(?:[- ]resolved)?: https://github.com/.*?/pull/([0-9]+)",
            log.stdout.decode("utf-8"),
        )
    ]
    print(pr_numbers)
    must(pr_numbers and pr_numbers[0] == NUMBER, "Extracted PR numbers don't seem right!")

    # Every not-yet-merged PR in the stack needs an approval.
    print(":: Checking approvals for all PRs...")
    for n in pr_numbers:
        resp = gh.get(f"https://api.github.com/repos/{REPO}/pulls/{n}")
        must(resp.ok, f"Error checking merge status for PR #{n}!")
        pr_data = resp.json()
        if pr_data["merged"]:
            continue
        head_sha = pr_data["head"]["sha"]
        print(f"Checking approvals for PR #{n}... ", end="")
        # Ask for the largest page so a recent approval is not cut off by the
        # API's default page size on PRs with many reviews.
        resp = gh.get(
            f"https://api.github.com/repos/{REPO}/pulls/{n}/reviews",
            params={"per_page": 100},
        )
        must(resp.ok, f"Error getting reviews for PR #{n}!")
        # The approval must be from a user with write access AND on the current
        # head commit. The repo's ruleset does not dismiss stale reviews on push,
        # so without the commit_id check an approval of a benign commit would
        # still clear the gate after the PR is updated with different code.
        has_approval = any(
            review["state"] == "APPROVED"
            and review.get("author_association") in TRUSTED_ASSOCIATIONS
            and review.get("commit_id") == head_sha
            for review in resp.json()
        )
        must(
            has_approval,
            f"PR #{n} has no current approval from a user with write access "
            f"(an approval on the latest commit {head_sha[:7]} is required).",
        )
        print("APPROVED!")

    def check_pr_status(pr_number: int):
        """Poll the PR until it is mergeable and return its API object.

        Exits through ``must`` on conflicts, a blocked PR, failing checks, any
        other non-clean state, or when the PR stays unstable longer than
        ``MAX_WAIT_TIME`` seconds.
        """
        waiting_comment_posted = False
        start_time = time.time()

        def post_success_comment() -> None:
            gh.post(
                f"https://api.github.com/repos/{REPO}/issues/{pr_number}/comments",
                json={"body": f"PR #{pr_number} checks have completed successfully!"},
            )

        while True:
            resp = gh.get(
                f"https://api.github.com/repos/{REPO}/pulls/{pr_number}",
                headers={"Accept": "application/vnd.github.v3+json"},
            )
            must(resp.ok, f"Error getting PR #{pr_number}!")
            pr_obj = resp.json()
            mergeable_state = pr_obj.get("mergeable_state", "unknown")

            if mergeable_state == "unknown":
                # GitHub is still computing mergeability; give it a moment.
                time.sleep(2)
                resp = gh.get(
                    f"https://api.github.com/repos/{REPO}/pulls/{pr_number}",
                    headers={"Accept": "application/vnd.github.v3+json"},
                )
                must(resp.ok, f"Error getting PR #{pr_number} on retry!")
                pr_obj = resp.json()
                mergeable_state = pr_obj.get("mergeable_state", "unknown")

            if mergeable_state == "unstable":
                # Non-required checks are still running (or a required one is
                # pending). Inspect the checks and wait until they settle.
                if time.time() - start_time > MAX_WAIT_TIME:
                    must(
                        False,
                        f"PR #{pr_number} stayed unstable for over {MAX_WAIT_TIME // 60} minutes!",
                    )
                sha = pr_obj["head"]["sha"]
                status_resp = gh.get(f"https://api.github.com/repos/{REPO}/commits/{sha}/status")
                must(status_resp.ok, f"Error getting statuses for PR #{pr_number}!")
                check_resp = gh.get(
                    f"https://api.github.com/repos/{REPO}/commits/{sha}/check-runs",
                    params={"per_page": 100},
                )
                must(check_resp.ok, f"Error getting check runs for PR #{pr_number}!")
                result, relevant = classify_checks(
                    status_resp.json().get("statuses", []),
                    check_resp.json().get("check_runs", []),
                )
                if result == "failed":
                    must(False, f"PR #{pr_number} has failing checks: {', '.join(relevant)}")
                elif result == "success":
                    post_success_comment()
                    return pr_obj

                message = f"PR #{pr_number} has pending checks: {', '.join(relevant)}"
                # Comment on the PR only once; later polls just print a dot.
                if not waiting_comment_posted:
                    print(f"\n{message}. Waiting for checks to settle...")
                    gh.post(
                        f"https://api.github.com/repos/{REPO}/issues/{pr_number}/comments",
                        json={"body": message},
                    )
                    waiting_comment_posted = True
                time.sleep(30)
                print(".", end="", flush=True)
                continue

            if mergeable_state == "blocked":
                must(
                    False,
                    f"PR #{pr_number} is blocked from merging (failing or missing "
                    f"required checks)! Use `/land --force <reason>` to bypass CI.",
                )
            elif mergeable_state == "dirty":
                must(False, f"PR #{pr_number} has merge conflicts that need to be resolved!")
            elif mergeable_state == "clean":
                if waiting_comment_posted:
                    post_success_comment()
                return pr_obj
            else:
                must(False, f"PR #{pr_number} is not ready to merge (state: {mergeable_state})!")

    if pr_numbers:
        print(f":: Checking status for primary PR #{pr_numbers[0]}... ", end="")
        check_pr_status(pr_numbers[0])
        print("SUCCESS!")
    print(":: All PRs are ready to be landed!")


if __name__ == "__main__":
    main()
```
## /.github/workflows/smoke-test-evals.yaml
```yaml path="/.github/workflows/smoke-test-evals.yaml"
name: smoke-test-evals
on:
workflow_dispatch: # manual trigger
schedule:
- cron: "0 7 * * *" # daily at 7am UTC (1h after recipes)
# Only one eval smoke test run at a time to avoid API contention
concurrency:
group: smoke-test-evals
cancel-in-progress: true
jobs:
smoke-test:
if: github.repository == 'thinking-machines-lab/tinker-cookbook'
runs-on: ubuntu-latest
timeout-minutes: 10
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
- name: run eval smoke tests
env:
TINKER_API_KEY: ${{ secrets.TINKER_API_KEY }}
run: uv run pytest tests/test_inspect_eval.py -v -x -s
```
## /.github/workflows/smoke-test-recipes.yaml
```yaml path="/.github/workflows/smoke-test-recipes.yaml"
name: smoke-test-recipes
on:
workflow_dispatch: # manual trigger
schedule:
- cron: "0 6 * * *" # daily at 6am UTC
# Only one smoke test run at a time to avoid API contention
concurrency:
group: smoke-test-recipes
cancel-in-progress: true
jobs:
# Discover all smoke test files so the matrix is auto-generated.
# Adding a new test file in tests/recipes/ automatically adds a CI job.
discover:
if: github.repository == 'thinking-machines-lab/tinker-cookbook'
runs-on: ubuntu-latest
outputs:
tests: ${{ steps.find.outputs.tests }}
steps:
- name: checkout
uses: actions/checkout@v4
- name: find smoke tests
id: find
run: |
tests=$(find tests/recipes -maxdepth 1 -name 'test_*.py' -printf '%f\n' \
| sed 's/\.py$//' \
| jq -R -s -c 'split("\n") | map(select(length > 0))')
echo "tests=$tests" >> "$GITHUB_OUTPUT"
smoke-test:
if: github.repository == 'thinking-machines-lab/tinker-cookbook'
needs: discover
runs-on: ubuntu-latest
timeout-minutes: 35
strategy:
fail-fast: false
matrix:
test: ${{ fromJson(needs.discover.outputs.tests) }}
name: ${{ matrix.test }}
steps:
- name: checkout
uses: actions/checkout@v4
- name: install-uv
uses: astral-sh/setup-uv@v6
with:
enable-cache: true
- name: venv
run: uv venv && uv sync --all-extras
- name: run smoke test
env:
TINKER_API_KEY: ${{ secrets.TINKER_API_KEY }}
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: uv run pytest tests/recipes/${{ matrix.test }}.py -v -x -s
```
## /.gitignore
```gitignore path="/.gitignore"
**/__pycache__
tinker_cookbook/_version.py
.DS_Store
.env
.env.*
.venv
uv.lock
skills-workspace/
tinker-debug-workspace/
eval_qwen35_35b.md
run_eval_qwen35.py
**/__marimo__
```
## /.pre-commit-config.yaml
```yaml path="/.pre-commit-config.yaml"
default_install_hook_types: [pre-commit, pre-push]
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: check-added-large-files
args: ["--maxkb=500"]
- id: end-of-file-fixer
exclude: |
(?x)
^(
\.sync_state
)$
- id: trailing-whitespace
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.13.2
hooks:
# Run the linter.
- id: ruff-check
exclude: tool_declaration_ts\.py$
# Run the formatter.
- id: ruff-format
exclude: tool_declaration_ts\.py$
```
## /.sync_state
```sync_state path="/.sync_state"
{
"last_synced_sha": "b4fee215e812ae5a6b0096ba37b3d9edc4f99cd5",
"last_sync_time": "2025-10-09T00:09:30.116486"
}
```
## /AGENTS.md
# Tinker Cookbook Agent Guide
Quick reference for agents working on `tinker-cookbook`. Detailed guidance is in the skills under `skills/`.
`tinker-cookbook` is a client library with training and eval code built on the Tinker service (hosted by Thinking Machines Lab) and the Tinker SDK (a separate repo with just the API). You author training/eval loops that run on a CPU machine; Tinker executes the heavy GPU work.
**Skills:** This repo ships two Claude Code skills in `skills/`: `research` (SFT, RL, DPO, distillation, evaluation, model selection, experiment methodology) and `debug` (performance, correctness, renderer, and error triage). Install via `/plugin marketplace add thinking-machines-lab/tinker-cookbook`, then use `/tinker:research` or `/tinker:debug`.
## Composing Types
Agents often struggle with the nested type hierarchy.
**Core types:**
- `Datum` = `model_input` (ModelInput) + `loss_fn_inputs` (dict of TensorData)
- `ModelInput` = list of chunks (EncodedTextChunk, ImageChunk)
- `TensorData` = wrapper for numpy/torch arrays with shape info
**Helper functions** (use these instead of manual construction):
- `datum_from_model_input_weights(model_input, weights, max_length)` - SL datum creation (`supervised/common.py`)
- `conversation_to_datum(messages, renderer, max_length, train_on_what)` - Full pipeline (`supervised/data.py`)
- `renderer.build_supervised_example(messages)` - Returns (ModelInput, weights)
- `ModelInput.from_ints(tokens)` - Create from token list
- `TensorData.from_numpy(arr)` / `TensorData.from_torch(tensor)` - Wrap arrays
---
## Architecture
**Builder pattern:** Config objects are `chz` dataclasses (SupervisedDatasetBuilder, RLDatasetBuilder, EnvGroupBuilder). They expose `.build()`/`__call__()` returning runtime objects.
**Key code locations:**
- SL: `tinker_cookbook/supervised/train.py`
- RL: `tinker_cookbook/rl/train.py`
- DPO: `tinker_cookbook/preference/train_dpo.py`
- Renderers: `tinker_cookbook/renderers/`
- Completers: `tinker_cookbook/completers.py`
- RL types: `tinker_cookbook/rl/types.py`
- Rollout strategies: `tinker_cookbook/rl/rollout_strategy.py` (FailFast, RetryOnFailure)
- Logging: `tinker_cookbook/utils/logtree.py`, `tinker_cookbook/rl/rollouts.py`
- Recipes: `tinker_cookbook/recipes/`
**Training outputs:** RL and SL training write human-readable HTML reports and machine-readable JSON files (metrics, rollout transcripts, per-trajectory summaries) to `log_path`. Point agents at a `log_path` directory to analyze training runs — `metrics.jsonl` for scalar metrics, `*_rollout_summaries.jsonl` for per-trajectory data, and `*_logtree.json` for full rollout transcripts including model responses.
---
## Conventions
**Subscript suffixes** for tensor names: `_P` (problems), `_G` (groups), `_T` (tokens), `_D` (datums). Example: `tokens_P_G_T[p][g][t]`
**Code style:**
- Explicit typing; avoid `Any` / `type: ignore`
- Use `safezip`, `timed`, `scope` helpers
- `@chz.chz` decorator for config serialization
- `ml_log.log_metrics` for metrics; `logtree` for transcripts
**Env lifecycle:** `Env` objects are single-use (no reset). Create via `EnvGroupBuilder`.
---
## Public-repo discipline
`tinker-cookbook` is a public repository. Keep internal context out of commits (even intermediate commits), and write with an external audience in mind.
---
## Finding supported models
For an identifier to pass to the SDK, call `service_client.get_server_capabilities().supported_models` — authoritative, includes `:peft:` long-context variants, but requires `TINKER_API_KEY`. For a read-only lookup with pricing and context length (no auth), browse <https://tinker-docs.thinkingmachines.ai/tinker/models/>.
---
## Common Pitfalls
1. **Sequential API calls:** The #1 performance mistake. Always use `_async` variants and submit calls back-to-back before awaiting. Use `asyncio.gather` for concurrent evaluation — never sequential loops over API calls. Tinker is designed for high request concurrency from a single Python process; when exact limits matter, check the current SDK/client configuration rather than inventing small caps. For very high parallelism, shard work across multiple Python processes and pass pickled sampling clients where appropriate.
2. **Sampler desync:** Create a **new** sampling client after saving weights. A stale client silently samples from old weights.
3. **LoRA LR:** Use `hyperparam_utils.get_lr(model_name)` - LoRA needs ~10x higher LR than full fine-tuning.
4. **Renderer mismatch:** Use `model_info.get_recommended_renderer_name()` — never hardcode renderer names.
**Never call `tokenizer.encode(prompt)` directly on a chat-tuned model** (gpt-oss, Llama-3-Instruct, Qwen-Instruct, etc.). Raw encoding skips the chat template, producing OOD prompt tokens. The sampler and trainer then take subtly different code paths on those OOD inputs, and per-token sampler/trainer logprob KL can inflate by 5×+ (max ratios in the tens), silently breaking PPO/CISPO/GRPO importance ratios. Use `tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True)` (take `["input_ids"]` from the returned `BatchEncoding`) or a cookbook renderer for the prompt. Raw `encode` is correct only for base / continued-pretraining / NLL-eval workflows where there is no conversation.
5. **Type construction:** Use helper functions, not manual dict construction. See `supervised/data.py` and `supervised/common.py`.
6. **Group semantics:** RL advantages are centered within each group.
7. **DPO:** Start with `dpo_beta=0.1`, LR~1e-5.
---
## Testing
```bash
# Unit tests (no API needed, colocated *_test.py files)
pytest tinker_cookbook/
# Smoke tests (requires TINKER_API_KEY + network)
pytest tests/
```
For debugging, shrink workloads via `n_batches`, `batch_size`, `group_size` in dataset builders.
## /CHANGELOG.md
# Changelog
A curated feed of notable changes to `tinker-cookbook`. Small bugfixes and minor argument additions are omitted—this is for changes worth knowing about.
## Format
Each entry includes:
- **Title**: A short, human-readable summary (not the commit message)
- **Date**: When it was merged
- **Type**: `new` (feature), `improvement` (enhancement to existing functionality), or `fix`
- **Tags**: What area it touches (e.g., `renderers`, `rl`, `supervised`, `eval`, `datasets`)
- **PR**: Link to the pull request
---
### [cookbook] Fix base-model single-turn evals + replace `parse_success` bool with `ParseTermination` enum ([#688](https://github.com/thinking-machines-lab/tinker-cookbook/pull/688))
**Date:** 2026-04-30
**Type:** fix
**Tags:** renderers, eval, rl
Fixes #685: every base model × every single-turn benchmark scored 0% because `RoleColonRenderer.parse_response` reported `parse_success=False` on EOS-terminated responses, and `EnvFromMessageEnv.step` short-circuits with `failed_parse_reward=0` in that case — so the grader was never invoked. Re-running AIME 2025 against `Qwen/Qwen3-8B-Base` now gives 6/30 (20.0%) instead of 0/30.
**Breaking API change.** `Renderer.parse_response` now returns `tuple[Message, ParseTermination]` instead of `tuple[Message, bool]`. `ParseTermination` is a `StrEnum` with `STOP_SEQUENCE` / `EOS` / `MALFORMED` values plus `is_clean` and `is_stop_sequence` properties. Note that `ParseTermination` is truthy in all three states (it's a `StrEnum`), so `if not parse_success:` patterns will silently always be False without raising `TypeError`. Migration: replace with `if not termination.is_clean:` (lenient, what eval grading reads) or `if not termination.is_stop_sequence:` (strict, what RL format-reward shaping reads). All in-tree call sites are updated. R1-Zero / math RL training behavior is preserved by default via a new `ProblemEnv.require_stop_sequence_for_format=True` knob.
---
### [cookbook] Fix grading bugs in IFEval, MATH-500, and GPQA benchmarks ([#643](https://github.com/thinking-machines-lab/tinker-cookbook/pull/643))
**Date:** 2026-04-08
**Type:** fix
**Tags:** eval
Fixes IFEval `letter_frequency` constraint using wrong kwargs key (always passed), `should_allow_eval` inverted regex check in math grading (blocked safe expressions from sympy), and GPQA silent fallback to answer "A" when correct_answer doesn't match choices. IFEval +3.9pp, MATH-500 +3.2pp on Qwen3.5-35B-A3B.
---
### [cookbook] Guard against edge-case crashes in RL, preference, and xmux ([#644](https://github.com/thinking-machines-lab/tinker-cookbook/pull/644))
**Date:** 2026-04-08
**Type:** fix
**Tags:** rl, preference, xmux
Adds defensive guards for division-by-zero in pairwise preference rewards (`group_size=1`), empty nonzero tensor in preference types, `max()`/`min()` on empty list in sampling metrics, `total_steps=0` in LR scheduling, model names without `org/` prefix, and empty jobs list in xmux control panel.
---
### [cookbook] Parallel evaluation in async training loop ([#630](https://github.com/thinking-machines-lab/tinker-cookbook/pull/630))
**Date:** 2026-04-07
**Type:** improvement
**Tags:** rl, eval
Training evaluation loop now runs evaluators in parallel via `run_evaluations_parallel`, reducing eval overhead in RL and distillation training.
---
### [cookbook] Public API exports for distillation and eval exceptions ([#639](https://github.com/thinking-machines-lab/tinker-cookbook/pull/639))
**Date:** 2026-04-07
**Type:** improvement
**Tags:** distillation, eval
Distillation module and eval exceptions are now importable from the top-level package.
---
### [cookbook] Remove v0.3.0-deprecated parameters ([#635](https://github.com/thinking-machines-lab/tinker-cookbook/pull/635))
**Date:** 2026-04-07
**Type:** improvement
**Tags:** rl
Removes deprecated parameters that were marked for removal in v0.3.0, including old metric names and legacy function signatures.
---
### [cookbook] Fix DPO training crash with odd batch size ([#637](https://github.com/thinking-machines-lab/tinker-cookbook/pull/637))
**Date:** 2026-04-07
**Type:** fix
**Tags:** preference
Fixes crash when DPO batch has odd number of datums after filtering invalid rows.
---
### [cookbook] Cloud storage backends — S3, GCS, Azure ([#623](https://github.com/thinking-machines-lab/tinker-cookbook/pull/623))
**Date:** 2026-04-06
**Type:** new
**Tags:** stores, infra
`FsspecStorage` enables writing training logs, checkpoints, and eval results to cloud URIs (`s3://`, `gs://`, `az://`). All log writers, rollout summaries, logtrees, and the eval runner migrated to the `Storage` protocol.
---
### [cookbook] Stores module — typed stores and run registry ([#617](https://github.com/thinking-machines-lab/tinker-cookbook/pull/617))
**Date:** 2026-04-06
**Type:** new
**Tags:** stores
New `stores/` module with `TrainingRunStore` (typed access to checkpoints, metrics, rollouts) and `RunRegistry` for discovering and comparing runs across multiple storage backends.
---
### [cookbook] Dynamic max_tokens capping via context_window ([#618](https://github.com/thinking-machines-lab/tinker-cookbook/pull/618))
**Date:** 2026-04-06
**Type:** improvement
**Tags:** eval
Benchmark runner now caps `max_tokens` to the model's `context_window`, preventing wasted generation budget on shorter-context models.
---
### [cookbook] Thread TrainingRunStore through all training loops ([#620](https://github.com/thinking-machines-lab/tinker-cookbook/pull/620), [#621](https://github.com/thinking-machines-lab/tinker-cookbook/pull/621), [#627](https://github.com/thinking-machines-lab/tinker-cookbook/pull/627))
**Date:** 2026-04-06
**Type:** improvement
**Tags:** rl, distillation, stores
All training loops (SL, RL, on-policy distillation, SDFT) now accept an optional `TrainingRunStore` for structured checkpoint and timing persistence.
---
### [cookbook] Cloud URI support for logging ([#625](https://github.com/thinking-machines-lab/tinker-cookbook/pull/625), [#628](https://github.com/thinking-machines-lab/tinker-cookbook/pull/628))
**Date:** 2026-04-06
**Type:** improvement
**Tags:** logging, stores
`JsonLogger` and `setup_logging` now accept cloud URIs as `log_dir`, writing metrics and configs directly to S3/GCS/Azure.
---
### [cookbook] Fix IncrementalReader locking and FsspecStorage flush re-upload ([#629](https://github.com/thinking-machines-lab/tinker-cookbook/pull/629))
**Date:** 2026-04-06
**Type:** fix
**Tags:** stores
Fixes thread-safety in `IncrementalReader` and a bug where `FsspecStorage.flush()` re-uploaded stale data.
---
### [cookbook] Benchmark evaluation framework with 21 benchmarks ([#569](https://github.com/thinking-machines-lab/tinker-cookbook/pull/569))
**Date:** 2026-04-05
**Type:** new
**Tags:** eval
Full benchmark framework reusing the RL `Env` abstraction. Includes GSM8K, MATH-500, MMLU-Pro, MMLU-Redux, GPQA, IFEval, MBPP, C-Eval, SuperGPQA, IFBench, AIME 2025/2026, plus experimental benchmarks (LiveCodeBench, Terminal Bench, SWE-bench, Arena Hard, LongBench, TAU2-Bench, HMMT, BFCL). Single-turn and multi-turn, with programmatic and sandbox grading.
---
### [cookbook] Generalize SandboxFactory for non-Modal backends ([#588](https://github.com/thinking-machines-lab/tinker-cookbook/pull/588))
**Date:** 2026-04-03
**Type:** new
**Tags:** sandbox
`SandboxFactory` now supports pluggable backends beyond Modal, enabling custom sandbox implementations for code execution benchmarks.
---
### [cookbook] Fix FP8 expert quantization for fused 3D expert tensors ([#597](https://github.com/thinking-machines-lab/tinker-cookbook/pull/597))
**Date:** 2026-04-03
**Type:** fix
**Tags:** weights
Fixes FP8 quantization producing wrong shapes for fused 3D expert tensors (e.g., gate+up projections).
---
### [cookbook] Pluggable quantization formats + GPU acceleration for weight export ([#583](https://github.com/thinking-machines-lab/tinker-cookbook/pull/583))
**Date:** 2026-04-02
**Type:** new
**Tags:** weights
Refactored weight export to support pluggable quantization formats. Adds GPU-accelerated quantization for faster export.
---
### [cookbook] True-Thinking Score (TTS) recipe ([#578](https://github.com/thinking-machines-lab/tinker-cookbook/pull/578))
**Date:** 2026-04-02
**Type:** new
**Tags:** recipes
New recipe for computing True-Thinking Score, measuring how much a model's thinking chain actually contributes to answer quality.
---
### [cookbook] MXFP4 shard merge support for GPT-OSS models ([#590](https://github.com/thinking-machines-lab/tinker-cookbook/pull/590))
**Date:** 2026-04-02
**Type:** new
**Tags:** weights, models
Adds MXFP4 block-scaled format for shard-by-shard LoRA merge, enabling weight export for GPT-OSS models.
---
### [cookbook] Fix unembed_tokens merge for models with tied embeddings ([#593](https://github.com/thinking-machines-lab/tinker-cookbook/pull/593))
**Date:** 2026-04-02
**Type:** fix
**Tags:** weights
Fixes LoRA merge producing incorrect output embeddings for models that share input/output embedding weights.
---
### [cookbook] Fix load_config_dict not forwarding trust_remote_code ([#586](https://github.com/thinking-machines-lab/tinker-cookbook/pull/586))
**Date:** 2026-04-02
**Type:** fix
**Tags:** weights
Fixes weight merge failing for models requiring `trust_remote_code` (e.g., Kimi K2.5).
---
### [cookbook] Fix tokenizer_class corrupted to TokenizersBackend during export ([#582](https://github.com/thinking-machines-lab/tinker-cookbook/pull/582))
**Date:** 2026-04-02
**Type:** fix
**Tags:** weights
Fixes a bug where `tokenizer_class` in `tokenizer_config.json` was being overwritten with `TokenizersBackend` during weight export, breaking downstream tokenizer loading.
---
### [cookbook] Fix experts-fp8 compression_config for non-DeepSeek models ([#580](https://github.com/thinking-machines-lab/tinker-cookbook/pull/580))
**Date:** 2026-04-02
**Type:** fix
**Tags:** weights
Fixes FP8 expert quantization config generation for non-DeepSeek MoE models (e.g., Qwen3.5).
---
### [cookbook] Kimi K2 / K2.5 shard-by-shard merge with INT4 expert dequant/requant ([#573](https://github.com/thinking-machines-lab/tinker-cookbook/pull/573))
**Date:** 2026-04-02
**Type:** new
**Tags:** weights, models
Adds shard-by-shard LoRA merge support for Kimi K2 and K2.5, including INT4 expert dequantization and requantization during merge.
---
### [cookbook] SDFT (Self-Distillation Fine-Tuning) recipe with top-K distillation ([#524](https://github.com/thinking-machines-lab/tinker-cookbook/pull/524))
**Date:** 2026-04-01
**Type:** new
**Tags:** recipes, distillation
New SDFT recipe implementing self-distillation fine-tuning with top-K forward KL loss, enabling efficient knowledge distillation without a separate teacher deployment.
---
### [cookbook] Off-policy top-K distillation for multi-teacher knowledge merging ([#572](https://github.com/thinking-machines-lab/tinker-cookbook/pull/572))
**Date:** 2026-04-01
**Type:** new
**Tags:** distillation, supervised
Adds off-policy top-K distillation support, enabling knowledge merging from multiple teacher models into a single student.
---
### [cookbook] InterleavedRLDatasetBuilder for multi-domain RL training ([#570](https://github.com/thinking-machines-lab/tinker-cookbook/pull/570))
**Date:** 2026-04-01
**Type:** new
**Tags:** rl, datasets
New `InterleavedRLDatasetBuilder` that interleaves examples from multiple RL dataset sources, enabling multi-domain RL training in a single run.
---
### [cookbook] Fix InterleavedRLDataset crash on ragged last source batch ([#574](https://github.com/thinking-machines-lab/tinker-cookbook/pull/574))
**Date:** 2026-04-01
**Type:** fix
**Tags:** rl, datasets
Fixes a crash in `InterleavedRLDataset` when the last batch from a source dataset is smaller than expected.
---
### [cookbook] Extend SFT LR sweep to full Tinker model lineup ([#575](https://github.com/thinking-machines-lab/tinker-cookbook/pull/575))
**Date:** 2026-04-01
**Type:** improvement
**Tags:** supervised, recipes
Extends the SFT learning rate sweep results to cover all models in the Tinker lineup.
---
### [cookbook] Make image_processor optional for Qwen3VL renderers ([#566](https://github.com/thinking-machines-lab/tinker-cookbook/pull/566))
**Date:** 2026-03-31
**Type:** improvement
**Tags:** renderers, models
The Qwen3VL renderer no longer requires `image_processor` when used for text-only workloads.
---
### [cookbook] Fix slug-to-HF-name parsing for orgs with hyphens ([#567](https://github.com/thinking-machines-lab/tinker-cookbook/pull/567))
**Date:** 2026-03-31
**Type:** fix
**Tags:** weights
Fixes HuggingFace model name parsing for organizations with hyphens in their name.
---
### [cookbook] 22 marimo tutorials (101–503) ([#562](https://github.com/thinking-machines-lab/tinker-cookbook/pull/562))
**Date:** 2026-03-31
**Type:** new
**Tags:** tutorials
Adds 22 interactive marimo notebook tutorials covering core workflows from SFT basics through advanced RL and distillation.
---
### [cookbook] Fix weight merging and adapter export for Nemotron fused Mamba projections ([#548](https://github.com/thinking-machines-lab/tinker-cookbook/pull/548), [#549](https://github.com/thinking-machines-lab/tinker-cookbook/pull/549))
**Date:** 2026-03-27
**Type:** fix
**Tags:** weights, adapters
Fixes `build_hf_model` and `build_lora_adapter` for Nemotron models with fused Mamba projections. `build_hf_model` now correctly handles the backbone prefix and fused projection key mapping. `build_lora_adapter` now handles empty expert LoRA weights and fused Mamba projection splitting.
---
### [cookbook] Auto-generated model cards for HuggingFace Hub publishing ([#543](https://github.com/thinking-machines-lab/tinker-cookbook/pull/543))
**Date:** 2026-03-26
**Type:** new
**Tags:** weights
`publish_to_hf_hub` now accepts an optional `ModelCardConfig` to auto-generate a HuggingFace-compatible `README.md` with YAML frontmatter, usage snippets, and framework versions. Adapter vs merged model format is auto-detected from `adapter_config.json`. A standalone `generate_model_card` function is also available for previewing cards before publishing.
---
### [cookbook] Group per-iteration output files into subdirectories ([#517](https://github.com/thinking-machines-lab/tinker-cookbook/pull/517))
**Date:** 2026-03-25
**Type:** improvement
**Tags:** infrastructure, logging
Training output files (rollout summaries, logtree JSONs, HTML reports) are now grouped into per-iteration subdirectories under `log_path`, keeping the output directory clean.
---
### [cookbook] PEFT-format adapter serving with `build_lora_adapter` ([#533](https://github.com/thinking-machines-lab/tinker-cookbook/pull/533))
**Date:** 2026-03-24
**Type:** new
**Tags:** weights, adapters
New `build_lora_adapter` function exports trained LoRA weights in PEFT format, enabling adapter serving without merging into the base model. Includes Nemotron-3 adapter serving support and vLLM e2e tests ([#539](https://github.com/thinking-machines-lab/tinker-cookbook/pull/539)).
---
### [cookbook] ActionExtra TypedDict for Env.step extensibility ([#538](https://github.com/thinking-machines-lab/tinker-cookbook/pull/538))
**Date:** 2026-03-24
**Type:** new
**Tags:** rl
Introduces `ActionExtra` TypedDict so environments can pass additional structured data through `Env.step()` without breaking the existing interface.
---
### [cookbook] Multimodal tool result support ([#526](https://github.com/thinking-machines-lab/tinker-cookbook/pull/526))
**Date:** 2026-03-24
**Type:** new
**Tags:** renderers, tools
Tool results can now include images and other multimodal content, not just text.
---
### [cookbook] Unified training telemetry with `trace_iteration` and `scope_span` ([#522](https://github.com/thinking-machines-lab/tinker-cookbook/pull/522), [#477](https://github.com/thinking-machines-lab/tinker-cookbook/pull/477))
**Date:** 2026-03-24
**Type:** improvement
**Tags:** infrastructure, logging
All training loops (SL, RL, DPO, distillation) now share a unified telemetry system built on `trace_iteration` and `scope_span`. Generates per-iteration Gantt charts and W&B metrics via `@scope` ([#453](https://github.com/thinking-machines-lab/tinker-cookbook/pull/453)).
---
### [cookbook] Add version lower bounds and exclude compromised litellm ([#532](https://github.com/thinking-machines-lab/tinker-cookbook/pull/532))
**Date:** 2026-03-24
**Type:** fix
**Tags:** infrastructure, dependencies
Adds minimum version bounds for key dependencies and excludes compromised `litellm` versions, protecting users from known supply chain issues.
---
### [cookbook] Handle context limits and max-tokens truncation in multi-turn RL ([#506](https://github.com/thinking-machines-lab/tinker-cookbook/pull/506))
**Date:** 2026-03-24
**Type:** improvement
**Tags:** rl
Multi-turn RL environments now gracefully handle context limit exhaustion and max-tokens truncation rather than crashing.
---
### [cookbook] Fix LoRA merging for Qwen3.5 models ([#528](https://github.com/thinking-machines-lab/tinker-cookbook/pull/528), [#529](https://github.com/thinking-machines-lab/tinker-cookbook/pull/529))
**Date:** 2026-03-23
**Type:** fix
**Tags:** weights
Fixes split-QKV fusion and tied vision embedding issues when merging LoRA adapters for Qwen3.5 models. Also refactors `_merge.py` into per-model merge modules for maintainability.
---
### [cookbook] SFT hyperparameter sweep with results for 3 models ([#496](https://github.com/thinking-machines-lab/tinker-cookbook/pull/496))
**Date:** 2026-03-23
**Type:** new
**Tags:** supervised, recipes
Published SFT hyperparameter sweep results covering learning rate, batch size, and schedule across 3 model families. Useful as a starting point for tuning.
---
### [cookbook] Clean up public API surface for release ([#516](https://github.com/thinking-machines-lab/tinker-cookbook/pull/516))
**Date:** 2026-03-23
**Type:** improvement
**Tags:** infrastructure
Audited and cleaned up the public API: removed internal symbols from `__all__`, consolidated re-exports, and ensured a clean `import tinker_cookbook` surface.
---
### [cookbook] Diagnostic logs for MessageEnv and AgentToolMessageEnv ([#518](https://github.com/thinking-machines-lab/tinker-cookbook/pull/518), [#521](https://github.com/thinking-machines-lab/tinker-cookbook/pull/521))
**Date:** 2026-03-22
**Type:** new
**Tags:** rl, logging
`MessageEnv.step()` can now return diagnostic logs via `MessageStepResult`, and `AgentToolMessageEnv` populates them automatically. Useful for debugging multi-turn agent training.
---
### [cookbook] Warn when renderer is not recommended for the model ([#509](https://github.com/thinking-machines-lab/tinker-cookbook/pull/509))
**Date:** 2026-03-21
**Type:** improvement
**Tags:** renderers
A warning is now emitted when a renderer that isn't recommended for the given model is used, helping catch renderer mismatch bugs early.
---
### [cookbook] Rollout error resilience for RL training ([#497](https://github.com/thinking-machines-lab/tinker-cookbook/pull/497))
**Date:** 2026-03-20
**Type:** improvement
**Tags:** rl
RL training can now survive individual rollout failures (e.g., sandbox timeouts) without aborting the entire batch. Failed rollouts are logged and skipped.
---
### [cookbook] Slim core deps and split recipe extras ([#437](https://github.com/thinking-machines-lab/tinker-cookbook/pull/437))
**Date:** 2026-03-20
**Type:** improvement
**Tags:** infrastructure
Core `tinker-cookbook` dependencies are significantly slimmed. Recipe-specific deps (Modal, Inspect AI, etc.) are now under optional extras like `pip install tinker-cookbook[inspect]` ([#380](https://github.com/thinking-machines-lab/tinker-cookbook/pull/380)).
---
### [cookbook] `cleanup()` lifecycle method on EnvGroupBuilder ([#505](https://github.com/thinking-machines-lab/tinker-cookbook/pull/505))
**Date:** 2026-03-20
**Type:** new
**Tags:** rl
`EnvGroupBuilder` now has a `cleanup()` method called at the end of training, so environment backends (e.g., sandbox pools) can release resources gracefully.
---
### [cookbook] Fix crash when all RL advantages are zero ([#507](https://github.com/thinking-machines-lab/tinker-cookbook/pull/507))
**Date:** 2026-03-20
**Type:** fix
**Tags:** rl
Fixed a crash when an entire batch has zero advantage (i.e., all trajectories have identical reward). The batch is now skipped with a warning.
---
### [cookbook] Fix async RL training hang on data exhaustion ([#480](https://github.com/thinking-machines-lab/tinker-cookbook/pull/480))
**Date:** 2026-03-18
**Type:** fix
**Tags:** rl
Fixed a deadlock where async RL training would hang when the dataset was exhausted. Uses cascading shutdown to cleanly terminate worker threads.
---
### [cookbook] Nemotron-3 model and renderer ([#492](https://github.com/thinking-machines-lab/tinker-cookbook/pull/492))
**Date:** 2026-03-18
**Type:** new
**Tags:** models, renderers
Adds support for NVIDIA Nemotron-3 model family with a dedicated renderer and LR config. Includes downstream compatibility tests ([#495](https://github.com/thinking-machines-lab/tinker-cookbook/pull/495)).
---
### [cookbook] Centralized exception hierarchy ([#489](https://github.com/thinking-machines-lab/tinker-cookbook/pull/489))
**Date:** 2026-03-18
**Type:** new
**Tags:** infrastructure
New structured exception hierarchy (`TinkerCookbookError`, `RolloutError`, `RendererError`, etc.) with picklability guarantees for distributed execution.
---
### [cookbook] Deprecation framework for API evolution ([#486](https://github.com/thinking-machines-lab/tinker-cookbook/pull/486))
**Date:** 2026-03-18
**Type:** new
**Tags:** infrastructure
New `@deprecated()` decorator and `warn_deprecated()` helper with `removal_version` enforcement. Enables smooth API transitions with clear migration paths.
---
### [cookbook] Quantized export with FP8 expert quantization ([#478](https://github.com/thinking-machines-lab/tinker-cookbook/pull/478))
**Date:** 2026-03-17
**Type:** new
**Tags:** weights
New quantized weight export supporting FP8 quantization of MoE expert layers, reducing model size for deployment.
---
### [cookbook] Shard-by-shard merging and modular merge architecture ([#476](https://github.com/thinking-machines-lab/tinker-cookbook/pull/476))
**Date:** 2026-03-17
**Type:** improvement
**Tags:** weights
LoRA merge now processes one shard at a time instead of loading all weights into memory. Modular architecture with per-model merge modules makes adding new model families easier.
---
### [cookbook] Tag-based versioning with hatch-vcs and nightly builds ([#439](https://github.com/thinking-machines-lab/tinker-cookbook/pull/439))
**Date:** 2026-03-17
**Type:** new
**Tags:** infrastructure
Package version is now derived from git tags via hatch-vcs. Nightly builds publish dev versions automatically. PyPI publishing triggers on `v*` tags via GitHub Actions ([#430](https://github.com/thinking-machines-lab/tinker-cookbook/pull/430)).
---
### [cookbook] PEP 561 `py.typed` marker ([#483](https://github.com/thinking-machines-lab/tinker-cookbook/pull/483))
**Date:** 2026-03-17
**Type:** new
**Tags:** infrastructure
Added `py.typed` marker so downstream projects using mypy/pyright get type information from `tinker-cookbook`.
---
### [cookbook] Consolidate streaming and response normalization into base Renderer ([#451](https://github.com/thinking-machines-lab/tinker-cookbook/pull/451))
**Date:** 2026-03-17
**Type:** improvement
**Tags:** renderers
Streaming token parsing and response normalization logic moved from individual renderers into the base `Renderer` class. Reduces per-model boilerplate significantly.
---
### [cookbook] Downstream compatibility tests for public API contracts ([#474](https://github.com/thinking-machines-lab/tinker-cookbook/pull/474))
**Date:** 2026-03-16
**Type:** new
**Tags:** testing
New test suite that imports and exercises the public API surface, catching accidental breakage of downstream consumers.
---
### [cookbook] `include_reasoning` option for Inspect AI integration ([#456](https://github.com/thinking-machines-lab/tinker-cookbook/pull/456))
**Date:** 2026-03-16
**Type:** new
**Tags:** eval
Inspect AI evaluators can now optionally include model reasoning (thinking traces) in their evaluation context.
---
### [cookbook] `weights/` subpackage for weight lifecycle ([#461](https://github.com/thinking-machines-lab/tinker-cookbook/pull/461))
**Date:** 2026-03-16
**Type:** new
**Tags:** weights
New `tinker_cookbook/weights/` subpackage consolidating weight download, merge, quantization, and publishing into a cohesive module.
---
### [cookbook] Support `HF_TRUST_REMOTE_CODE` env var for custom tokenizers ([#460](https://github.com/thinking-machines-lab/tinker-cookbook/pull/460))
**Date:** 2026-03-14
**Type:** new
**Tags:** models, renderers
Renderers now respect the `HF_TRUST_REMOTE_CODE` environment variable, enabling use of custom tokenizers that require `trust_remote_code=True` without code changes.
---
### [cookbook] Fix `gate_up_proj` interleave index in merge script ([#459](https://github.com/thinking-machines-lab/tinker-cookbook/pull/459))
**Date:** 2026-03-14
**Type:** fix
**Tags:** weights
Fixed a bug where the interleave index for `gate_up_proj` was always 0 during LoRA merging, which could silently produce incorrect merged weights for gated MLP models.
---
### [cookbook] LiteLLM custom provider for Tinker sampling ([#458](https://github.com/thinking-machines-lab/tinker-cookbook/pull/458))
**Date:** 2026-03-14
**Type:** new
**Tags:** infrastructure
New LiteLLM custom provider that routes sampling requests through Tinker, allowing existing LiteLLM-based tooling (e.g., agent frameworks) to use Tinker-hosted models.
---
### [cookbook] CheckpointRecord dataclass for typed checkpoint bookkeeping ([#450](https://github.com/thinking-machines-lab/tinker-cookbook/pull/450))
**Date:** 2026-03-13
**Type:** new
**Tags:** infrastructure
New `CheckpointRecord` dataclass replaces ad-hoc dicts for tracking checkpoint metadata. Backward-compatible with external checkpoint formats ([#471](https://github.com/thinking-machines-lab/tinker-cookbook/pull/471)).
---
### [cookbook] Fix final checkpoint batch field that breaks resume ([#448](https://github.com/thinking-machines-lab/tinker-cookbook/pull/448))
**Date:** 2026-03-13
**Type:** fix
**Tags:** infrastructure
Fixed a bug where the batch field in the final checkpoint was incorrect, causing training resume to start from the wrong position.
---
### [cookbook] Fix checkpoint loading for DPO and on-policy distillation ([#446](https://github.com/thinking-machines-lab/tinker-cookbook/pull/446), [#447](https://github.com/thinking-machines-lab/tinker-cookbook/pull/447))
**Date:** 2026-03-13
**Type:** fix
**Tags:** supervised, rl
DPO and on-policy distillation checkpoint loading now matches SFT/RL behavior, properly restoring optimizer state and step count.
---
### [cookbook] Extract shared streaming parser and restructure renderer tests ([#431](https://github.com/thinking-machines-lab/tinker-cookbook/pull/431))
**Date:** 2026-03-13
**Type:** improvement
**Tags:** renderers, testing
Common streaming parsing logic extracted into a shared `StreamingParser`. Renderer tests reorganized by model family for better maintainability.
---
### [cookbook] Keep final SFT and RL checkpoints indefinitely ([#424](https://github.com/thinking-machines-lab/tinker-cookbook/pull/424))
**Date:** 2026-03-12
**Type:** improvement
**Tags:** infrastructure
The final checkpoint from a training run is now saved with no TTL expiry, ensuring trained weights aren't auto-deleted.
---
### [cookbook] Pluggable rollout executor via `concurrent.futures.Executor` ([#425](https://github.com/thinking-machines-lab/tinker-cookbook/pull/425))
**Date:** 2026-03-12
**Type:** new
**Tags:** rl
RL rollout execution can now use any `concurrent.futures.Executor`, enabling distributed rollout computation across multiple machines.
---
### [cookbook] Rollout summary JSONL and logtree JSON exports ([#389](https://github.com/thinking-machines-lab/tinker-cookbook/pull/389), [#428](https://github.com/thinking-machines-lab/tinker-cookbook/pull/428))
**Date:** 2026-03-12
**Type:** new
**Tags:** rl, logging
RL training now writes machine-readable `*_rollout_summaries.jsonl` (per-trajectory metadata) and `*_logtree.json` (full rollout transcripts) alongside the existing HTML reports.
---
### [cookbook] Pickle support for Renderer and env builders ([#422](https://github.com/thinking-machines-lab/tinker-cookbook/pull/422), [#423](https://github.com/thinking-machines-lab/tinker-cookbook/pull/423))
**Date:** 2026-03-11
**Type:** new
**Tags:** infrastructure
Renderers, `ChromaTool`, and `VerifiersEnvGroupBuilder` are now picklable, enabling distributed rollout execution with process pools.
---
### [cookbook] Multi-turn on-policy distillation for Harbor environments ([#411](https://github.com/thinking-machines-lab/tinker-cookbook/pull/411))
**Date:** 2026-03-10
**Type:** new
**Tags:** rl, recipes
On-policy distillation now supports multi-turn environments (e.g., Harbor terminal tasks), distilling teacher behavior across interactive trajectories.
---
### [cookbook] Standardize recipe entrypoints, log paths, and CLI config ([#405](https://github.com/thinking-machines-lab/tinker-cookbook/pull/405))
**Date:** 2026-03-09
**Type:** improvement
**Tags:** recipes
All recipes now share a consistent CLI config pattern, standardized log path structure, and unified entrypoint conventions.
---
### [cookbook] Fix TinkerMessageCompleter dropping tool_calls ([#403](https://github.com/thinking-machines-lab/tinker-cookbook/pull/403))
**Date:** 2026-03-09
**Type:** fix
**Tags:** renderers, tools
Fixed a bug where `TinkerMessageCompleter` was silently dropping tool calls from model responses.
---
### [cookbook] Qwen3.5 support ([#397](https://github.com/thinking-machines-lab/tinker-cookbook/pull/397))
**Date:** 2026-03-06
**Type:** new
**Tags:** models
Adds Qwen3.5 to the model lineup with renderer support, LR config, and LoRA merge compatibility.
---
### [cookbook] Harbor RL recipe for sandboxed terminal-bench training ([#377](https://github.com/thinking-machines-lab/tinker-cookbook/pull/377))
**Date:** 2026-03-04
**Type:** new
**Tags:** recipes, rl
New recipe for training agents on terminal tasks using Harbor sandboxed environments. Includes eval standardization ([#463](https://github.com/thinking-machines-lab/tinker-cookbook/pull/463)).
---
### [cookbook] Strip thinking from history for Kimi K2 and K2.5 renderers ([#384](https://github.com/thinking-machines-lab/tinker-cookbook/pull/384), [#393](https://github.com/thinking-machines-lab/tinker-cookbook/pull/393))
**Date:** 2026-03-01 to 2026-03-03
**Type:** new
**Tags:** renderers
Kimi K2 and K2.5 renderers now support `strip_thinking_from_history`, matching the existing Qwen3 option. Controls whether `<think>` blocks are preserved in multi-turn history.
---
### [cookbook] Persist renderer metadata on training runs ([#382](https://github.com/thinking-machines-lab/tinker-cookbook/pull/382))
**Date:** 2026-02-24
**Type:** improvement
**Tags:** infrastructure, eval
Training runs now save renderer metadata to the checkpoint. Evals auto-resolve the correct renderer from checkpoint metadata, eliminating manual renderer selection.
---
### [cookbook] Support Qwen3VL in adapter merge ([#360](https://github.com/thinking-machines-lab/tinker-cookbook/pull/360))
**Date:** 2026-02-23
**Type:** new
**Tags:** weights, models
`merge_tinker_adapter_to_hf_model` now supports Qwen3VL vision-language models, handling their unique weight structure during LoRA merge.
---
### [cookbook] ifBench RLVR recipe for instruction following ([#276](https://github.com/thinking-machines-lab/tinker-cookbook/pull/276))
**Date:** 2026-02-22
**Type:** new
**Tags:** recipes, rl
New recipe for RLVR training on instruction following benchmarks using the ifBench dataset.
---
### [cookbook] Fix empty token chunk causing 400 errors ([#376](https://github.com/thinking-machines-lab/tinker-cookbook/pull/376))
**Date:** 2026-02-19
**Type:** fix
**Tags:** supervised, rl
Fixed a bug where empty token chunks in model inputs would cause 400 errors from the Tinker API. Empty chunks are now filtered out before submission.
---
### [cookbook] Kimi K2.5 support ([#352](https://github.com/thinking-machines-lab/tinker-cookbook/pull/352), [#357](https://github.com/thinking-machines-lab/tinker-cookbook/pull/357), [#359](https://github.com/thinking-machines-lab/tinker-cookbook/pull/359))
**Date:** 2026-02-05 to 2026-02-10
**Type:** new
**Tags:** models, renderers
Adds Kimi K2.5 model family with text and vision rendering support.
---
### [cookbook] Library for training tool-use agents ([#311](https://github.com/thinking-machines-lab/tinker-cookbook/pull/311))
**Date:** 2026-02-05
**Type:** new
**Tags:** tools, rl
New library for training tool-use agents with structured tool calling, conversation management, and evaluation.
---
### [cookbook] Remove ToolCallPart/UnparsedToolCallPart from ContentPart ([#353](https://github.com/thinking-machines-lab/tinker-cookbook/pull/353))
**Date:** 2026-02-05
**Type:** improvement
**Tags:** renderers
**Breaking:** `ToolCallPart` and `UnparsedToolCallPart` are no longer part of the `ContentPart` union type. Tool calls now live exclusively in `message["tool_calls"]` / `message["unparsed_tool_calls"]`, simplifying content iteration.
---
### [cookbook] Custom renderer and tokenizer registration ([#349](https://github.com/thinking-machines-lab/tinker-cookbook/pull/349))
**Date:** 2026-02-05
**Type:** new
**Tags:** renderers
Users can now register custom renderers and tokenizers, enabling support for models not in the built-in lineup.
---
### [cookbook] Support structured content in ConversationFormatter ([#343](https://github.com/thinking-machines-lab/tinker-cookbook/pull/343))
**Date:** 2026-02-05
**Type:** improvement
**Tags:** renderers
`ConversationFormatter` now handles structured content (thinking parts, tool calls) alongside plain text.
---
### [cookbook] Temperature parameter for TinkerMessageCompleter ([#336](https://github.com/thinking-machines-lab/tinker-cookbook/pull/336))
**Date:** 2026-02-05
**Type:** new
**Tags:** rl, tools
`TinkerMessageCompleter` now accepts a `temperature` parameter, giving users control over sampling temperature during multi-turn RL and evaluation.
---
### [cookbook] Fix XSS vulnerability in logtree HTML ([#337](https://github.com/thinking-machines-lab/tinker-cookbook/pull/337))
**Date:** 2026-02-05
**Type:** fix
**Tags:** infrastructure
Fixed a cross-site scripting vulnerability in logtree HTML reports where model output could contain executable scripts.
---
### [cookbook] `build_supervised_examples` and `LAST_ASSISTANT_TURN` ([#341](https://github.com/thinking-machines-lab/tinker-cookbook/pull/341))
**Date:** 2026-02-03
**Type:** new
**Tags:** supervised, renderers
New `build_supervised_examples` (plural) helper generates multiple training examples from a single conversation by splitting at assistant turns. `LAST_ASSISTANT_TURN` trains only on the final assistant response.
---
### [cookbook] Streaming parsing for Kimi K2 renderer ([#319](https://github.com/thinking-machines-lab/tinker-cookbook/pull/319))
**Date:** 2026-01-30
**Type:** new
**Tags:** renderers
Kimi K2 renderer now supports streaming token-by-token parsing, matching the existing capability in Qwen3 and DeepSeek V3 renderers.
---
### [cookbook] Reuse KL reference client instead of recreating per minibatch ([#332](https://github.com/thinking-machines-lab/tinker-cookbook/pull/332))
**Date:** 2026-01-30
**Type:** fix
**Tags:** rl
DPO and RL training now reuse the KL penalty reference sampling client across minibatches instead of creating a new one each time, reducing overhead.
---
### [cookbook] Cap training steps with `max_step` parameter ([#328](https://github.com/thinking-machines-lab/tinker-cookbook/pull/328))
**Date:** 2026-01-28
**Type:** new
**Tags:** rl, supervised
Adds optional `max_step` config parameter to cap training steps in on-policy distillation. When set, trains for `min(max_step, dataset_length)`. Default `None` preserves existing behavior.
---
### [cookbook] Configurable KL penalty reference model ([#326](https://github.com/thinking-machines-lab/tinker-cookbook/pull/326))
**Date:** 2026-01-27
**Type:** new
**Tags:** rl
Makes the KL penalty reference model configurable in RL training. Users can now specify a different base model or a checkpoint for the KL penalty computation, rather than using the default.
---
### [cookbook] Checkpoints now have 7-day TTL by default ([#324](https://github.com/thinking-machines-lab/tinker-cookbook/pull/324))
**Date:** 2026-01-27
**Type:** improvement
**Tags:** infrastructure
Checkpoints are now set to auto-expire after 7 days by default, helping users avoid unexpected storage costs.
---
### [cookbook] Support for dedicated capacity ([#315](https://github.com/thinking-machines-lab/tinker-cookbook/pull/315))
**Date:** 2026-01-21
**Type:** new
**Tags:** infrastructure
Adds support for dedicated capacity in training configurations.
---
### [cookbook] Configurable loss function parameters with `loss_fn_config` ([#156](https://github.com/thinking-machines-lab/tinker-cookbook/pull/156))
**Date:** 2026-01-16
**Type:** new
**Tags:** rl, supervised
New `loss_fn_config` parameter allows passing additional configuration to loss functions (e.g., KL penalty coefficients, clipping thresholds) without changing the function signature.
---
### [cookbook] Modal sandbox backend for code execution ([#278](https://github.com/thinking-machines-lab/tinker-cookbook/pull/278), [#291](https://github.com/thinking-machines-lab/tinker-cookbook/pull/291), [#300](https://github.com/thinking-machines-lab/tinker-cookbook/pull/300), [#302](https://github.com/thinking-machines-lab/tinker-cookbook/pull/302))
**Date:** 2026-01-07 to 2026-01-15
**Type:** new
**Tags:** sandboxes, rl
Adds Modal as an alternative sandbox backend for code execution alongside SandboxFusion. Includes:
- `ModalSandbox` and `ModalSandboxPool` for managing sandboxes
- Warm pool maintenance with configurable timeouts
- Rate limiting to respect Modal account limits
- Async API calls for better performance
- Documentation for both sandbox backends

See `tinker_cookbook/sandbox/` for the new module structure.
---
### [cookbook] Fix streaming dataset batch skipping ([#295](https://github.com/thinking-machines-lab/tinker-cookbook/pull/295))
**Date:** 2026-01-19
**Type:** fix
**Tags:** supervised
Batch skipping now works correctly with streaming datasets. Because HuggingFace's shuffle is deterministic, skipping forward through batches reproduces the same data order and no longer causes data inconsistencies.
---
### [cookbook] Fix supervised metrics from OptimStepResponse ([#286](https://github.com/thinking-machines-lab/tinker-cookbook/pull/286))
**Date:** 2026-01-20
**Type:** fix
**Tags:** supervised
Previously, optimization metrics (like gradient norms) from `OptimStepResponse` were being dropped in `finish_batch`. Metrics are now properly captured and merged into the step's metrics dictionary.
---
### [cookbook] Adapter to base-model merge script ([#292](https://github.com/thinking-machines-lab/tinker-cookbook/pull/292))
**Date:** 2026-01-08
**Type:** new
**Tags:** tools
New script to merge LoRA/adapter weights back into the base model.
---
### [cookbook] Fix inspect_utils for list content from parse_response ([#299](https://github.com/thinking-machines-lab/tinker-cookbook/pull/299))
**Date:** 2026-01-12
**Type:** fix
**Tags:** eval
Fixed `inspect_utils.py`, which assumed that `parse_response` always returns string content. Renderers like `Qwen3Renderer` return list content (with `ThinkingPart`, `ToolCallPart`, etc.) when responses contain `<think>` or `<tool_call>` blocks. It now uses `renderers.get_text_content()`, which handles both formats.
---
### [cookbook] Fix Kimi K2 and DeepSeek V3 renderer parsing ([#279](https://github.com/thinking-machines-lab/tinker-cookbook/pull/279), [#285](https://github.com/thinking-machines-lab/tinker-cookbook/pull/285))
**Date:** 2026-01-05 to 2026-01-07
**Type:** fix
**Tags:** renderers
Fixes tool declaration rendering for Kimi K2 and Qwen3 to match HuggingFace templates. Also fixes `DeepSeekV3ThinkingRenderer` to properly parse thinking traces, verified by a round-trip test that ensures `build_supervised_example` and `parse_response` correspond.
---
### [sdk] Torch is now an optional dependency ([#15](https://github.com/thinking-machines-lab/tinker/pull/15))
**Date:** 2026-01-20
**Type:** improvement
**Tags:** dependencies
Moves torch to an optional dependency in the SDK. Applications that don't need torch for training can now use the SDK without installing it. Import guards added to `training_client.py`.
---
### Major renderer overhaul: tool calling, structured content ([#220](https://github.com/thinking-machines-lab/tinker-cookbook/pull/220), [#221](https://github.com/thinking-machines-lab/tinker-cookbook/pull/221), [#238](https://github.com/thinking-machines-lab/tinker-cookbook/pull/238), [#243](https://github.com/thinking-machines-lab/tinker-cookbook/pull/243), [#244](https://github.com/thinking-machines-lab/tinker-cookbook/pull/244), [#250](https://github.com/thinking-machines-lab/tinker-cookbook/pull/250))
**Date:** 2025-12-26 to 2025-12-28
**Type:** improvement
**Tags:** renderers, rl
A series of PRs that significantly improve the renderer system:
**Tool calling support:** New `ToolSpec` type for defining tools and `create_conversation_prefix_with_tools()` API on all renderers. Tool call parsing supported for Qwen3, DeepSeek V3, and Kimi K2. `UnparsedToolCall` captures tool calls that fail to parse.
**Structured message content:** The `Message.thinking` field is removed (**breaking**). Thinking content is now represented as `ThinkingPart` in the content list, alongside `TextPart`, `ImagePart`, and `ToolCallPart`. Use `get_text_content(message)` to extract text after `parse_response`.
**Clearer field names:** `RenderedMessage` fields renamed (**breaking**): `prefix` → `header`, `content` → `output`, `suffix` → `stop_overlap`. `Renderer` changed from Protocol to ABC.
**Sequence extension property:** New `has_extension_property` on `Renderer` indicates whether consecutive timesteps can be merged for O(T) instead of O(T²) compute in multi-turn RL.
**Modular architecture:** `renderers.py` split into `tinker_cookbook/renderers/` package with per-model modules (`qwen3.py`, `deepseek_v3.py`, `kimi_k2.py`, etc.). Imports unchanged.
**HF compatibility:** Various fixes to match HuggingFace chat templates, with expanded test coverage using random conversation generation.
---
### Qwen3 thinking blocks can now be preserved in history ([#142](https://github.com/thinking-machines-lab/tinker-cookbook/pull/142))
**Date:** 2025-12-06
**Type:** new
**Tags:** renderers, rl
The Qwen3Renderer now has a `strip_thinking_from_history` option. By default (`True`), it strips `<think>...</think>` blocks from previous assistant turns—matching how Qwen3 was trained. Set it to `False` if you're doing multi-turn RL and want to use sequence extension: preserving thinking lets turns merge into one sequence, reducing compute cost.
---
### Disable checkpoint saving with `save_every=0` ([#149](https://github.com/thinking-machines-lab/tinker-cookbook/pull/149))
**Date:** 2025-12-06
**Type:** improvement
**Tags:** supervised, rl
Setting `save_every=0` now disables checkpoint saving entirely (previously it crashed with a divide-by-zero). Useful for quick test runs where you don't need checkpoints.
---
### xmux: launch experiment sweeps in tmux ([#138](https://github.com/thinking-machines-lab/tinker-cookbook/pull/138))
**Date:** 2025-12-02
**Type:** new
**Tags:** tools
New `xmux` utility for running experiment sweeps. It spawns parallel jobs in a tmux session where you can monitor each experiment's progress in separate windows. See `tinker_cookbook/xmux/examples/` for usage.
---
### Optimizer state now loads correctly on resume ([#140](https://github.com/thinking-machines-lab/tinker-cookbook/pull/140), [#141](https://github.com/thinking-machines-lab/tinker-cookbook/pull/141))
**Date:** 2025-12-02
**Type:** fix
**Tags:** supervised, rl
Training resumption now properly loads optimizer state (momentum, etc.) alongside model weights. Previously, `load_state()` didn't restore the optimizer, which could affect training dynamics after a checkpoint resume.
---
### Tracing support for supervised training ([#88](https://github.com/thinking-machines-lab/tinker-cookbook/pull/88))
**Date:** 2025-11-21
**Type:** new
**Tags:** supervised, tools
Set `enable_trace=True` to generate trace events during supervised training. Visualize with Perfetto to see where time is spent. Run `python -m tinker_cookbook.utils.trace` to convert the trace file.
---
### Code RL recipe with DeepCoder ([#83](https://github.com/thinking-machines-lab/tinker-cookbook/pull/83))
**Date:** 2025-11-18
**Type:** new
**Tags:** recipes, rl
New recipe for RL on competitive programming problems using the DeepCoder dataset. Code execution is sandboxed via Sandbox Fusion. See `tinker_cookbook/recipes/code_rl/`.
---
### Configurable temperature for RL sampling ([#86](https://github.com/thinking-machines-lab/tinker-cookbook/pull/86))
**Date:** 2025-11-17
**Type:** new
**Tags:** rl
Temperature is now a configurable parameter in RL configs. Previously hardcoded to 1.0.
---
### Per-message training control with `TrainOnWhat.CUSTOMIZED` ([#85](https://github.com/thinking-machines-lab/tinker-cookbook/pull/85))
**Date:** 2025-11-14
**Type:** new
**Tags:** supervised, renderers
New `TrainOnWhat.CUSTOMIZED` option lets you set a `trainable: bool` field on each message to control which messages get loss applied. Useful for training on specific turns in a conversation.
---
### Interactive environment debugging with `play_w_env` ([#76](https://github.com/thinking-machines-lab/tinker-cookbook/pull/76))
**Date:** 2025-11-07
**Type:** new
**Tags:** rl, tools
New utility to "role-play" as the policy and interact with an Environment. Useful for debugging reward functions and environment logic. See `tinker_cookbook/recipes/multiplayer_rl/twenty_questions/play.py` for an example.
---
## /CLAUDE.md
AGENTS.md
## /CONTRIBUTING.md
# Contributing to Tinker Cookbook
We welcome contributions! This project is built in the spirit of open science and collaborative development.
## Development setup
```bash
git clone https://github.com/thinking-machines-lab/tinker-cookbook.git
cd tinker-cookbook
uv sync --extra dev
pre-commit install
```
This installs dev dependencies and registers pre-commit hooks that run `ruff` formatting and linting on every commit.
## Running tests
```bash
# Unit tests (no API key needed, colocated *_test.py files)
uv run pytest tinker_cookbook/
# Integration tests (requires TINKER_API_KEY)
uv run pytest tests/
```
## Code style
We use [ruff](https://docs.astral.sh/ruff/) for linting and formatting (line length: 100). Pre-commit hooks run automatically on each commit.
```bash
uv run ruff check tinker_cookbook/
uv run ruff format tinker_cookbook/
```
## Type checking
We use [pyright](https://github.com/microsoft/pyright) for static type analysis. Please use typing wherever possible; avoid `Any` and `type: ignore`; prefer casting. However, avoid convoluted generics or overly verbose code just to satisfy the type checker. Prefer single types over union types.
```bash
uv run pyright tinker_cookbook
```
## Pull request process
1. Create a feature branch from `main`
2. Make your changes with tests if applicable
3. Ensure all checks pass: `pre-commit run --all-files`
4. Open a PR with a clear description of the change
CI runs pre-commit, pyright, and pytest on every PR.
## Project structure
- `tinker_cookbook/` — Library code (supervised learning, RL, renderers, utilities)
- `tinker_cookbook/recipes/` — Example training scripts
- `tests/` — Integration tests (require API key)
- `skills/` — Claude Code skills for Tinker workflows
---
# Design conventions
## Organization of training scripts
We're designing the codebase with the following goals:
1. Low barrier to entry: it should be dead simple to run something and see numbers go up.
2. Extensible: it should be possible to pass in custom datasets and evals and control all the hyperparameters.
3. Science-friendly: it should be easy to run sweeps, and analyze the results.
To achieve this, we'll use the following structure around training scripts:
- There's a main training function, such as [rl/train.py](tinker_cookbook/rl/train.py) or [supervised/train.py](tinker_cookbook/supervised/train.py), which contains the main loop.
- This function takes a detailed config object (`Config`), which isn't constructible from the command line.
- The config contains members that specify things like datasets and evals. However, these should be chz configs (with a `.build` method that constructs the actual object) or callables (we recommend using functools.partial). This way, the config is serializable, which is useful for sweeps.
- There are launch scripts that assemble training configs (e.g., [recipes/math_rl/train.py](tinker_cookbook/recipes/math_rl/train.py)), which construct a smaller config object (`CLIConfig`) from the command line.
## Async
Async is very useful for RL, where it allows us to make many queries in parallel (e.g., sampling calls). For all of the interfaces used in RL (such as the `Env` class), all the methods that take nontrivial amounts of time should be async. For some of the other code, such as [recipes/sl_loop.py](tinker_cookbook/recipes/sl_loop.py), we've chosen not to use async methods, just to make it more beginner-friendly, as many Python programmers are not familiar with async.
## Classes
There are a lot of different classes, which might make the code feel less approachable. However, they follow *the builder pattern*, and the code should be less confusing when you know the pattern.
We can illustrate the pattern with the two main examples:
- A `SupervisedDatasetBuilder` is a configuration object which builds a `SupervisedDataset`.
- An `RLDatasetBuilder` is a configuration object which builds an `RLDataset`, which generates batches of `EnvGroupBuilder` objects, which each generate a group of `Env` objects.
Here, the `SupervisedDatasetBuilder`, `RLDatasetBuilder`, and `EnvGroupBuilder` are all configuration objects, which have a `__call__` method that builds another object. You can see these objects in [supervised/types.py](tinker_cookbook/supervised/types.py) and [rl/types.py](tinker_cookbook/rl/types.py).
In general, we use a lot of configuration objects, with a `__call__` method that returns a heavyweight object (like a dataset). We use `chz` for the configuration objects -- it's similar to a dataclass but with some extra features that are nice for configs. We use either dataclasses or regular python classes for the heavyweight objects.
## Envs
An `Env` is an RL environment. For those with an RL background, it roughly corresponds to an MDP or a POMDP; however, we use it in more general cases (such as multi-agent settings) that don't strictly correspond to the MDP/POMDP formalism. It's roughly analogous to the concept of an Env in OpenAI Gym, but unlike OpenAI Gym, we don't have a `reset` method; rather, the env should be discarded after a rollout. Any shared resources should be maintained by whatever object is creating the envs.
The `Env`s are created by `EnvGroupBuilder`s. The envs in a group returned by an `EnvGroupBuilder` have something in common: either they correspond to the same task (in which case we can use this information for variance reduction, as in GRPO, which centers per group), or the group is used to define a multi-agent environment.
- One common multi-agent environment is where we use a pairwise preference model to compare pairs of completions.
- We can also use the group to define a two-player game. Some two-player games, such as tic-tac-toe, are currently supported through the [text_arena](tinker_cookbook/recipes/multiplayer_rl/text_arena/env.py) environments.
## Notation
We'll use subscripts to indicate the shapes of objects. For example, `tokens_P_G_T` indicates a three-dimensional array of tokens, with `P` problems, `G` attempts per problem (the group dimension), and `T` tokens per attempt, so `tokens_P_G_T[p][g][t]` should refer to a single token. In many cases, the arrays will be ragged; e.g., the `T` axis will have different lengths for different `(p, g)`. Sometimes, a given dimension will be flattened from two dimensions. If we write `tokens_PG_T`, that means that we have a two-dimensional array, where the 0th dimension is flattened from the `P` and `G` dimensions.
### Common Dimension Names
Here are the standard dimension subscripts used throughout the codebase:
- `_D`: Data/Datum dimension (for training data items)
- `_G`: Group dimension (for multiple attempts/rollouts of the same problem)
- `_P`: Problem dimension (for different problems/prompts)
- `_T`: Token/Time dimension (for sequences)
The relationship between dimensions in RL:
- A batch contains multiple problems (`_P`)
- Each problem spawns multiple attempts/environments (`_G`), forming a group
- Each attempt produces one trajectory
- Advantages are normalized within each group (across the `_G` dimension)
Examples:
- `env_group_builders_P`: A list of environment builders, one per problem
- `trajectories_G`: Multiple trajectories from attempts at the same problem
- `rewards_G`: Rewards for each attempt within a group
- `tokens_P_G_T`: Tokens with problem, group, and time dimensions
- `data_D`: A list of training data items
## Questions?
Email us at tinker@thinkingmachines.ai.
## /README.md
<h1 align="center">Tinker Cookbook</h1>
<div align="center">
<img src="assets/tinker-cover.png" width="60%" />
</div>
<div align="center">
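[![pytest](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/pytest.yaml/badge.svg)](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/pytest.yaml)
[![pyright](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/pyright.yaml/badge.svg)](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/pyright.yaml)
[![smoke-test-recipes](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/smoke-test-recipes.yaml/badge.svg)](https://github.com/thinking-machines-lab/tinker-cookbook/actions/workflows/smoke-test-recipes.yaml)
[![PyPI](https://img.shields.io/pypi/v/tinker-cookbook)](https://pypi.org/project/tinker-cookbook/)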
</div>
We provide two libraries for the broader community to customize their language models: `tinker` and `tinker-cookbook`.
- `tinker` is a training SDK for researchers and developers to fine-tune language models. You send API requests to us and we handle the complexities of distributed training.
- `tinker-cookbook` includes realistic examples of fine-tuning language models. It builds on the Tinker API and provides common abstractions to fine-tune language models.
## Installation
1. Sign up for Tinker [here](https://auth.thinkingmachines.ai/sign-up).
2. Once you have access, create an API key from the [console](https://tinker-console.thinkingmachines.ai) and export it as the environment variable `TINKER_API_KEY`.
3. Install `tinker-cookbook` (includes the `tinker` SDK as a dependency):
```bash
# Latest stable release from PyPI
uv pip install tinker-cookbook
# Or install the nightly build
uv pip install 'tinker-cookbook @ git+https://github.com/thinking-machines-lab/tinker-cookbook.git@nightly'
```
## Tinker
Here we introduce a few Tinker primitives — the basic components to fine-tune LLMs (see the [quickstart guide](https://tinker-docs.thinkingmachines.ai/tinker/quickstart/) for more details):
```python
import tinker
service_client = tinker.ServiceClient()
training_client = service_client.create_lora_training_client(
base_model="meta-llama/Llama-3.2-1B", rank=32,
)
training_client.forward_backward(...)
training_client.optim_step(...)
training_client.save_state(...)
training_client.load_state(...)
sampling_client = training_client.save_weights_and_get_sampling_client()
sampling_client.sample(...)
```
See [tinker_cookbook/recipes/sl_loop.py](tinker_cookbook/recipes/sl_loop.py) and [tinker_cookbook/recipes/rl_loop.py](tinker_cookbook/recipes/rl_loop.py) for minimal examples of using these primitives to fine-tune LLMs.
### Tutorials
New to Tinker? The [`tutorials/`](tutorials/) directory contains 20+ progressive [marimo](https://marimo.io/) notebooks that walk through core concepts — rendering, loss functions, completers, weight management — and advanced topics such as custom RL environments, DPO, RLHF, and weight export. Run any tutorial with `marimo edit tutorials/101_hello_tinker.py`. See the [tutorials README](tutorials/README.md) for the full list, or browse rendered versions on the [Tinker docs site](https://tinker-docs.thinkingmachines.ai/tutorials).
To download the weights of any model:
```python
rest_client = service_client.create_rest_client()
future = rest_client.get_checkpoint_archive_url_from_tinker_path(sampling_client.model_path)
with open("model-checkpoint.tar.gz", "wb") as f:
    f.write(future.result())
```
### Tinker Cookbook
Besides these primitives, we also offer **Tinker Cookbook** (a.k.a. this repo), a library that provides a wide range of abstractions to help you customize training environments.
[`tinker_cookbook/recipes/sl_basic.py`](tinker_cookbook/recipes/sl_basic.py) and [`tinker_cookbook/recipes/rl_basic.py`](tinker_cookbook/recipes/rl_basic.py) contain minimal examples to configure supervised learning and reinforcement learning.
We also include more complete examples in the [`tinker_cookbook/recipes/`](tinker_cookbook/recipes/) folder:
- **[Chat SFT](tinker_cookbook/recipes/chat_sl/)**: supervised fine-tuning on conversational datasets (e.g., Tulu3).
- **[Math RL](tinker_cookbook/recipes/math_rl/)**: reinforcement learning for mathematical reasoning with verifiable rewards.
- **[Code RL](tinker_cookbook/recipes/code_rl/)**: RL on competitive programming with sandboxed code execution (DeepCoder replication).
- **[Preference learning](tinker_cookbook/recipes/preference/)**: DPO and a three-stage RLHF pipeline (SFT, reward model, RL).
- **[Distillation](tinker_cookbook/recipes/distillation/)**: on-policy and off-policy knowledge distillation with single- and multi-teacher configurations.
- **[Tool use](tinker_cookbook/recipes/search_tool/)**: RL for retrieval-augmented generation (Search-R1 replication).
- **[Multi-agent](tinker_cookbook/recipes/multiplayer_rl/)**: multi-agent RL with self-play and cross-play.
The [recipes README](tinker_cookbook/recipes/README.md) covers all available recipes, including Harbor RL, rubric-based grading, VLM classification, and SDFT. Each recipe includes a `README.md` with implementation details, launch commands, and expected results.
### Evaluation (experimental)
Tinker Cookbook includes a [benchmark framework](tinker_cookbook/eval/) for evaluating trained models:
```python
from tinker_cookbook.eval.benchmarks import run_benchmarks, BenchmarkConfig
results = await run_benchmarks(
["gsm8k", "mmlu_pro", "ifeval"],
sampling_client, renderer,
BenchmarkConfig(save_dir="evals/step500"),
)
```
The framework currently supports 12 benchmarks (GSM8K, MATH-500, MMLU-Pro, MMLU-Redux, GPQA, IFEval, MBPP, C-Eval, SuperGPQA, IFBench, AIME 2025, AIME 2026) with verified scores against published results, plus experimental benchmarks such as LiveCodeBench, Terminal Bench, and SWE-bench. Benchmarks can also serve as inline training evaluators via `BenchmarkEvaluator`.
**Note:** Benchmark scores are sensitive to evaluation configuration — system prompts, `max_tokens`, temperature, and timeout settings can shift results significantly. We document our exact settings alongside all reported scores. This framework is under active development; feedback and contributions are welcome. See the [eval README](tinker_cookbook/eval/README.md) for verified scores, configuration details, and instructions for adding new benchmarks.
### Documentation
For the full Tinker documentation, visit [tinker-docs.thinkingmachines.ai](https://tinker-docs.thinkingmachines.ai).
### Utilities
Tinker Cookbook also provides reusable building blocks:
- [`renderers`](tinker_cookbook/renderers/) — bidirectional conversion between token sequences and structured chat messages
- [`hyperparam_utils`](tinker_cookbook/hyperparam_utils.py) — learning rate and hyperparameter scaling for LoRA training
- [`eval`](tinker_cookbook/eval/) — benchmark framework and inline training evaluators (see [Evaluation](#evaluation-experimental) above)
## Claude Code Skills
Tinker Cookbook ships with [Claude Code skills](https://docs.anthropic.com/en/docs/claude-code/skills) that teach Claude how to use the Tinker API. Install them so Claude can help you write training code in any project:
```
/plugin marketplace add thinking-machines-lab/tinker-cookbook
```
Then install the **tinker** plugin from the Discover tab (`/plugin` → Discover). Once installed, two skills are available:
| Command | What it does |
|---|---|
| `/tinker:research` | Plan and run post-training experiments — SFT, RL, DPO, distillation, evaluation, hyperparameters, model selection, and more |
| `/tinker:debug` | Diagnose slow training, hangs, output mismatches, renderer issues, and errors |
Skills also trigger automatically based on context — ask Claude to "set up SFT training" and it will load the right skill without a slash command. Skills update automatically when the repo is updated.
## Development Setup
```bash
uv sync --extra dev
pre-commit install
```
This installs dev dependencies and registers pre-commit hooks that run `ruff` formatting and linting on every commit. CI enforces these checks on all pull requests.
## Contributing
This project is built in the spirit of open science and collaborative development. We believe that the best tools emerge through community involvement and shared learning.
We welcome PR contributions after our private beta is over. If you have any feedback, please email us at tinker@thinkingmachines.ai.
## Citation
If you use Tinker for your research, please cite it as:
```
Thinking Machines Lab, 2026. Tinker. https://thinkingmachines.ai/tinker/.
```
Or use this BibTeX citation:
```
@misc{tml2026tinker,
author = {Thinking Machines Lab},
title = {Tinker},
year = {2026},
url = {https://thinkingmachines.ai/tinker/},
}
```
## /assets/tinker-cover.png
Binary file available at https://raw.githubusercontent.com/thinking-machines-lab/tinker-cookbook/refs/heads/main/assets/tinker-cover.png
## /pyproject.toml
```toml path="/pyproject.toml"
[project]
name = "tinker_cookbook"
dynamic = ["version"]
description = "Implementations of post-training algorithms using the Tinker API"
readme = "README.md"
authors = [
{ name = "Tinker authors", email = "tinker@thinkingmachines.ai" },
]
license = "Apache-2.0"
requires-python = ">=3.11"
dependencies = [
"aiohttp>=3.9.0",
"anyio>=4.0.0",
"blobfile>=3.0.0",
"chz>=0.4.0",
"cloudpickle>=3.0.0",
"datasets>=2.14.0",
"huggingface_hub>=0.20.0",
"numpy>=1.24.0",
"pillow>=10.0.0",
"pydantic>=2.0.0",
"rich>=13.0.0",
"safetensors>=0.4.0",
"termcolor>=2.0.0",
"tiktoken>=0.12.0", # Required for Kimi tokenizer
"tinker>=0.22.3",
"torch>=2.0",
"tqdm>=4.60.0",
"transformers>=4.57.6,<=5.5.3",
]
[project.urls]
Homepage = "https://thinkingmachines.ai/tinker"
Repository = "https://github.com/thinking-machines-lab/tinker-cookbook"
Documentation = "https://tinker-docs.thinkingmachines.ai/"
[project.optional-dependencies]
dev = [
"pytest>=8.0.0",
"pytest-asyncio>=0.23.0",
"pytest-timeout>=2.0.0",
"ruff>=0.4.0",
"pyright>=1.1.300",
]
math-rl = [
"math-verify>=0.5.0",
"pylatexenc>=2.0",
"sympy>=1.12.0",
]
modal = [
"modal>=1.0.0",
]
multiplayer-rl = [
"textarena>=0.7.4",
]
tutorials = [
"marimo>=0.23.8",
"matplotlib>=3.7.0",
"tinker_cookbook[math-rl]",
]
vector-search = [
"chromadb>=1.0.0",
"google-genai>=1.0.0",
"huggingface_hub>=0.20.0",
]
cloud = [
"fsspec>=2023.1.0",
"gcsfs>=2023.1.0",
"s3fs>=2023.1.0",
"adlfs>=2023.1.0",
]
wandb = [
"wandb>=0.16.0",
"plotly>=5.0.0",
]
neptune-scale = [
"neptune-scale>=0.27.0",
]
trackio = [
"trackio<1.0.0",
]
verifiers = [
"verifiers>=0.1.9,<0.1.10",
"openai>=1.0.0",
]
inspect = [
"inspect-ai>=0.3.100",
"inspect-evals>=0.3.106",
]
litellm = [
"litellm>=1.80.0,!=1.82.7,!=1.82.8",
]
eval-math500 = [
"tinker_cookbook[math-rl]",
]
eval-hmmt = [
"antlr4-python3-runtime>=4.11,<4.14",
]
eval-mbpp = [
"tinker_cookbook[modal]",
]
eval-livecodebench = [
"tinker_cookbook[modal]",
]
eval-terminal-bench = [
"tinker_cookbook[modal]",
]
eval-swe-bench = [
"tinker_cookbook[modal]",
]
eval-ifbench = [
# The ifbench package must also be installed separately (not on PyPI):
# uv pip install 'ifbench @ git+https://github.com/allenai/IFBench.git'
"nltk",
"emoji",
"syllapy",
"langdetect",
]
eval = [
"tinker_cookbook[eval-math500]",
"tinker_cookbook[eval-hmmt]",
"tinker_cookbook[eval-mbpp]",
"tinker_cookbook[eval-livecodebench]",
"tinker_cookbook[eval-terminal-bench]",
"tinker_cookbook[eval-swe-bench]",
"tinker_cookbook[eval-ifbench]",
]
all = [
"tinker_cookbook[eval]",
"tinker_cookbook[math-rl]",
"tinker_cookbook[modal]",
"tinker_cookbook[multiplayer-rl]",
"tinker_cookbook[vector-search]",
"tinker_cookbook[wandb]",
"tinker_cookbook[neptune-scale]",
"tinker_cookbook[trackio]",
"tinker_cookbook[verifiers]",
"tinker_cookbook[inspect]",
"tinker_cookbook[litellm]",
]
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"
[tool.hatch.version]
source = "vcs"
tag-pattern = "v(?P<version>.*)"
[tool.hatch.build.hooks.vcs]
version-file = "tinker_cookbook/_version.py"
[tool.hatch.build.targets.wheel]
packages = ["tinker_cookbook"]
exclude = ["*_test.py", "**/results/"]
[tool.pytest.ini_options]
testpaths = ["tinker_cookbook", "tests"]
python_files = ["*_test.py", "test_*.py"]
norecursedirs = ["tinker_cookbook/scripts"]
markers = [
"slow: marks tests as slow (deselect with '-m \"not slow\"')",
"integration: marks tests requiring TINKER_API_KEY and network access",
"downstream_compat: marks tests that verify public API contracts for downstream consumers",
"timeout: per-test timeout in seconds (requires pytest-timeout)",
]
[tool.ruff]
line-length = 100
exclude = [
# Vendored from HuggingFace, kept identical to upstream
"kimi-k2.5-hf-tokenizer/tool_declaration_ts.py",
]
[tool.ruff.lint]
select = [
"E", # pycodestyle errors
"F", # pyflakes (unused imports, undefined names)
"I", # isort (import sorting)
"UP", # pyupgrade (modernize Python syntax)
"B", # flake8-bugbear (common bugs and design problems)
"SIM", # flake8-simplify (simplifiable code)
"RUF", # Ruff-specific rules
"C4", # flake8-comprehensions (unnecessary comprehensions)
]
ignore = [
"E501", # line too long (handled by formatter)
"B028", # no explicit stacklevel in warnings
"SIM108", # ternary operator (can reduce readability)
"UP007", # use X | Y for union types (not always clearer)
"RUF001", # ambiguous unicode character in string (intentional in tokenizer code)
"RUF002", # ambiguous unicode character in docstring
"RUF003", # ambiguous unicode character in comment
"RUF005", # collection-literal-concatenation (stylistic, not a bug)
"RUF006", # asyncio-dangling-task (false positives with our patterns)
"RUF012", # mutable-class-default (conflicts with chz dataclass patterns)
"RUF046", # unnecessary-cast-to-int (explicit casts aid readability)
"B008", # function-call-in-default-argument (to be fixed in a follow-up)
"B905", # zip-without-explicit-strict (to be tightened in a follow-up)
"B023", # function-uses-loop-variable (to be fixed in a follow-up)
"B027", # empty-method-without-abstract-decorator (intentional in base classes)
"RUF059", # unused-unpacked-variable (to be fixed in a follow-up)
"SIM117", # multiple-with-statements (can reduce readability with context managers)
"RUF022", # unsorted-dunder-all (we group __all__ by category, not alphabetically)
]
[tool.pyright]
include = ["tinker_cookbook"]
exclude = [
".venv",
# Vendored from HuggingFace, kept identical to upstream
"kimi-k2.5-hf-tokenizer/tool_declaration_ts.py",
]
# PyTorch's __init__.py doesn't declare __all__, so recent pyright versions
# flag every torch.* symbol as a private import. See pytorch/pytorch#50798.
reportPrivateImportUsage = "none"
```
## /skills/debug/SKILL.md
---
name: debug
description: Diagnose training issues with Tinker — slow steps, hanging sessions, output mismatches, error messages, renderer problems, and deployment issues. Use this skill whenever a user reports that training is slow, steps take too long, sessions are hanging, model outputs differ between Tinker and external engines (vLLM, SGLang), they get a confusing error message, training quality is poor (high KL, bad outputs), or they suspect something is wrong. Also trigger when users ask "is this a Tinker issue or my issue?", "is Tinker down?", report unexpected wait times, see output quality regressions, get opaque errors, or want to profile/debug their training or deployment pipeline. This skill walks through systematic triage to determine root cause.
---
# Tinker Debug
Systematic triage for training and deployment issues. Five triage paths:
1. **Performance issues** — slow steps, hanging sessions, throughput problems
2. **Output correctness issues** — mismatches between Tinker sampling and external inference engines
3. **Service availability** — "is Tinker down?" quick diagnostics
4. **Renderer issues** — wrong tokens, training quality degradation, prompt mismatches
5. **Error message decoder** — mapping opaque errors to root causes
Identify which category the user's problem falls into, then follow the appropriate triage.
## How the Tinker SDK works (essential context)
Understanding the SDK's threading model is key to diagnosing most issues. The SDK runs a **background thread** with its own asyncio event loop. All network I/O, heartbeats, and API result polling happen on this thread.
```
┌─────────────────────┐ ┌──────────────────────────────┐
│ Main Thread │ │ SDK Background Thread │
│ (user code) │ │ (asyncio event loop) │
│ │ │ │
│ fb = tc.fwd_bwd_ │────>│ HTTP POST /forward_backward │
│ async(data) │ │ → returns request_id │
│ │ │ │
│ # prepare next │ │ Long-poll /retrieve_future │
│ # batch here... │ │ (HTTP 408 = not ready yet) │
│ │ │ │
│ result = fb.result()│<────│ Result arrives → resolve │
│ # blocks until done │ │ │
│ │ │ Heartbeat every 10s │
└─────────────────────┘ └──────────────────────────────┘
```
When you call `forward_backward_async()`, the SDK:
1. Submits the request coroutine to the background thread
2. Returns a future immediately (main thread continues)
3. Background thread sends HTTP request, starts long-polling for result
4. Calling `.result()` on the future blocks main thread until the background thread resolves it
**Why this matters for debugging:** The background thread shares the Python GIL with the main thread. If user code holds the GIL for extended periods (heavy numpy/torch computation, CPU-bound data processing, slow serialization), the background thread **cannot**:
- Send heartbeats (sessions can expire after missed heartbeats)
- Poll for API results (futures appear to "hang")
- Submit new requests (pipelining breaks)
This means "my training is slow/hanging" is often caused by the user's own code blocking the SDK's background thread via GIL contention — not a network or server issue.
## Triage order
Work through these steps in order. Most issues are caught in steps 1-3 and never need deep profiling.
### Step 1: Environment check
Bad dependency versions are a silent killer. Check these first because they're fast to verify and cause mysterious slowdowns that look like service issues.
```python
import sys, pydantic, tinker
print(f"Python: {sys.version}")
print(f"pydantic: {pydantic.__version__}")
print(f"tinker SDK: {tinker.__version__}")
try:
    import torch; print(f"torch: {torch.__version__}")
except ImportError: pass
try:
    import numpy; print(f"numpy: {numpy.__version__}")
except ImportError: pass
try:
    import transformers; print(f"transformers: {transformers.__version__}")
except ImportError: pass
```
**Known problem versions:**
- `pydantic >= 2.13.0b1` (beta): Serialization regression makes `model_dump()` extremely slow on large payloads (tokens/tensors). Symptom: SDK thread stalls for minutes on `forward_backward` submission. Fix: pin `pydantic<2.13` or use a stable release.
- `transformers == 5.3.0`: Incorrect `tokenizer_class` for DeepSeek V2/V3 models (huggingface/transformers#44801, fixed in 5.3.1). Causes tokenizer loading failures. Upgrade or skip this version.
- `transformers < 5.0`: Bug in `Qwen2VLImageProcessor` that miscounts image tokens for VL models. Fix: upgrade to `>=5.0`.
- Always check if the user is on the **latest stable** tinker SDK. Suggest `pip install --upgrade tinker`.
If the user has a beta or pre-release of any core dependency, that's the likely culprit. Suggest downgrading before deeper investigation.
### Step 2: Async pipelining check
The single most common performance mistake. If the user's code awaits each API call before submitting the next, the GPU sits idle between steps.
**Ask the user to share their training loop code**, then look for these anti-patterns:
```python
# BAD: sequential — GPU idle while client prepares next call
result = tc.forward_backward(data=batch, loss_fn="cross_entropy") # blocks
tc.optim_step(adam_params=params) # blocks
# BAD: async but still sequential
result = await tc.forward_backward_async(data=batch, loss_fn="cross_entropy")
await tc.optim_step_async(adam_params=params)
# GOOD: pipelined — submit both before awaiting
fb_future = tc.forward_backward_async(data=batch, loss_fn="cross_entropy")
optim_future = tc.optim_step_async(adam_params=params)
# ... prepare next batch while GPU works ...
fb_result = fb_future.result()
optim_result = optim_future.result()
```
If the user is using the cookbook's `supervised.train` or `rl.train`, pipelining is handled automatically. If they have a custom script, check their loop carefully. If they're struggling with async patterns, suggest switching to the cookbook's training scripts which handle pipelining, checkpointing, and logging out of the box.
**Quick test:** If the user reports "first step is slow but later steps are faster," that's often normal warm-up (model loading, JIT compilation). If *every* step is slow, the issue is likely pipelining or serialization.
### Step 3: Quick timing breakdown
Before reaching for heavy profiling tools, get a rough breakdown of where time goes. The cookbook's built-in tracing does this automatically.
If the user is running the cookbook's train scripts, check the `log_path` output:
```python
# Read timing metrics from a training run
import json
with open("path/to/metrics.jsonl") as f:
    # One JSON object per line; skip blank lines so a stray empty line
    # does not make json.loads raise.
    metrics = [json.loads(line) for line in f if line.strip()]
# Look at timing keys for the last few steps
for m in metrics[-3:]:
    timing = {k: v for k, v in m.items() if k.startswith("time/")}
    print(f"step {m.get('progress/batch')}: {timing}")
```
**What to look for:**
- `time/forward_backward` >> `time/optim_step` — Normal, fwd/bwd is heavier
- `time/get_batch` is large — Data loading is the bottleneck, not Tinker
- `time/total` >> sum of individual times — There are gaps between operations (pipelining issue)
- Large `time/forward_backward` on step 0, then normal — Warm-up, not a bug
If the user has a custom script without cookbook tracing, suggest wrapping key sections:
```python
import time
t0 = time.perf_counter()
fb_future = tc.forward_backward_async(data=batch, loss_fn="cross_entropy")
t_submit = time.perf_counter() - t0
t0 = time.perf_counter()
result = fb_future.result()
t_wait = time.perf_counter() - t0
print(f"submit: {t_submit:.2f}s, wait: {t_wait:.2f}s")
```
For a more detailed view, `pyinstrument` with async mode shows exactly where time goes:
```bash
pip install pyinstrument
pyinstrument --async-mode=enabled your_script.py
```
**Interpreting results:**
- **High submit time** → Client-side bottleneck (serialization, data prep). Go to Step 4.
- **High wait time** → GIL contention, network, or server-side. Rule out GIL contention first (Step 5), then check network vs. server (Step 6).
- **Both reasonable but steps are slow** → Check for gaps between steps (pipelining). Go back to Step 2.
### Step 4: Profile the request lifecycle
The key is to understand where in the request lifecycle time is being spent. Every Tinker API call goes through: **submit → serialize → network send → server compute → long-poll → resolve**. GIL contention can block steps on the SDK background thread silently.
#### Option A: Cookbook tracing (if using cookbook train scripts)
The cookbook automatically produces Gantt charts and Perfetto traces:
```python
# View the Gantt chart — shows each operation as a timeline bar
# Open: <log_path>/iteration_000000/timing_gantt.html
# View Perfetto trace (more detail, shows both threads)
python -m tinker_cookbook.utils.trace <log_path>/trace_events.jsonl -o trace.json
# Open https://ui.perfetto.dev/ and load trace.json
```
In the Gantt chart, look for **gaps between bars** — these are periods where neither the main thread nor the SDK is doing useful work. Common patterns:
- **Long bar for `forward_backward`** with gaps before/after → Pipelining issue
- **Long bar for `get_batch`** → Data loading bottleneck
- **Gaps with no bars at all** → GIL contention (background thread blocked)
#### Option B: pyinstrument (for custom scripts)
```bash
pip install pyinstrument
pyinstrument --async-mode=enabled your_script.py
```
The `--async-mode=enabled` flag is critical — without it, time spent in `await` all gets attributed to `epoll.poll` which tells you nothing.
**What to look for:**
- Time in `pydantic` serialization (`model_dump`, `__repr__`) → Dependency issue (Step 1)
- Time in `AwaitableConcurrentFuture.result_async` → Waiting for server (network or server-side)
- Time in data preparation, tokenization, numpy/torch ops → GIL contention risk
#### Option C: Thread stack watchdog (for hung/slow sessions)
When a session is actively slow or hung, use the async task dump + thread stack watchdog to see what both threads are doing in real-time. Read `references/async-task-dump.md` for ready-to-paste diagnostic code.
**Interpreting the output:**
- SDK thread has pending `_forward_backward_async` tasks → Work submitted, waiting for server
- SDK thread has pending `_result_async` → Normal long-polling behavior
- SDK thread stopped logging entirely → **GIL contention** — the background thread can't wake up
- Main thread stuck in numpy/torch/data code while SDK thread is stalled → GIL is the bottleneck
### Step 5: GIL contention and background thread blocking
This is the most subtle and most common cause of "mysterious" slowdowns. Because the SDK's background thread shares Python's GIL with the main thread, CPU-heavy work in the user's code blocks the SDK from:
- **Sending heartbeats** → Session may expire (warning: "Session heartbeat failed")
- **Polling for results** → Futures appear to hang for minutes
- **Submitting new requests** → Pipelining breaks even when using `_async` variants
**Symptoms of GIL contention:**
- Steps are slow but server-side traces show the GPU finished quickly
- Heartbeat warnings in the logs
- Inconsistent step times (varies with data processing load)
- `pyinstrument` shows most time in `epoll.poll` (even with async mode) — the event loop couldn't run
**Common GIL-heavy operations in training scripts:**
- Large numpy array operations (tokenization, data preprocessing)
- Torch tensor operations on CPU (not GPU — GPU ops release the GIL)
- Heavy JSON/pydantic serialization
- File I/O for large datasets
- `transformers` tokenizer calls on large batches
**Fixes:**
1. **Move heavy work out of the hot loop**: Preprocess data before training starts
2. **Use the cookbook's training scripts**: They pipeline data prep with async API calls
3. **Offload to subprocess**: For sampling-heavy workloads, the SDK supports subprocess isolation via `TINKER_SUBPROCESS_SAMPLING=1`, which gives sampling its own event loop in a separate process (no GIL sharing)
4. **Break up long CPU operations**: Insert `await asyncio.sleep(0)` or small yields between heavy processing to let the background thread run
**Diagnostic: Is GIL the problem?**
```python
import threading, time
def gil_monitor(interval=2.0):
    """Report whenever a sleep on this thread overruns its requested duration.

    Loops forever: sleeps for ``interval`` seconds, measures how long the
    sleep actually took on the monotonic clock, and prints a warning when
    the overrun exceeds 0.5 seconds.

    Args:
        interval: Requested sleep length, in seconds, for each iteration.
    """
    expected = interval
    while True:
        started = time.monotonic()
        time.sleep(interval)
        actual = time.monotonic() - started
        jitter = actual - expected
        if jitter <= 0.5:
            # Woke up close to on time; nothing to report.
            continue
        # More than 500ms late: this thread could not resume promptly.
        print(f"[GIL monitor] slept {actual:.2f}s instead of {expected:.1f}s "
              f"(jitter: {jitter:.2f}s) — likely GIL contention")
threading.Thread(target=gil_monitor, daemon=True).start()
# ... rest of training script ...
```
This runs a monitoring thread that detects when `time.sleep()` takes significantly longer than requested — a sign that the GIL was held by another thread.
### Step 6: Network vs. server-side
If GIL is not the issue and the client submits quickly, check whether the wait is network or server:
```python
import time, tinker
svc = tinker.ServiceClient()
t0 = time.perf_counter()
caps = svc.get_server_capabilities()
print(f"API round-trip: {time.perf_counter() - t0:.2f}s")
# >2s suggests network issues; <1s rules out network
```
If network is fast, share the session ID (`tc.get_info()`) with the Tinker team for server-side investigation.
### Step 7: Escalation
If you've verified that:
1. Dependencies are on stable, recent versions
2. The training loop uses proper async pipelining
3. GIL contention is not the issue
4. Client-side profiling shows most time in SDK `await` calls
5. Network round-trip is fast
Then the issue is likely server-side. Help the user file a report with:
- **Session ID** (from `tc.get_info()`)
- **Step timing** (from metrics.jsonl or manual timing)
- **pyinstrument profile** (with `--async-mode=enabled`)
- **Tinker SDK version** and other dependency versions
- **What machine they're running on** (cloud instance type, region)
---
## Output correctness triage
When the user reports that their fine-tuned model behaves differently in external inference engines (vLLM, SGLang, TGI) compared to Tinker's sampling client, the problem is almost always in the **weight merge/export** step, not in training.
Tinker's sampling client serves the unmerged LoRA adapter directly on top of the base model — it's the ground truth. If the model produces correct outputs through Tinker sampling but wrong outputs after export, the merge is the first suspect.
### Step 1: Use the cookbook merge, not a custom script
The most common cause of merge issues is users writing their own merge scripts. The cookbook's `weights.build_hf_model()` handles model-specific weight layouts automatically — MoE expert fusion, VL model prefixes, split QKV projections, and more.
```python
from tinker_cookbook import weights
# Download adapter
adapter_dir = weights.download(
tinker_path="tinker://session-id/sampler_weights/step_N",
output_dir="./adapter",
)
# Merge LoRA into base model (handles all model-specific layouts)
weights.build_hf_model(
base_model="Qwen/Qwen3-8B",
adapter_path=adapter_dir,
output_path="./merged_model",
dtype="bfloat16",
)
```
If the user has a custom merge script, strongly recommend switching to the cookbook's implementation first. It handles:
- **MoE gate_up_proj fusion** — Correctly detects concatenated vs interleaved layouts per model family
- **VL model prefixes** — Adds `model.language_model.*` prefix for vision-language models
- **Split QKV projections** — Handles Qwen3.5 fused `in_proj_qkv` with unequal Q/K/V dimensions
- **Shard-by-shard processing** — Low memory merge for large models
### Step 2: Verify prompt equivalence
Even with correct weights, different inference engines may tokenize or render prompts differently. Verify the model receives identical input:
```python
# Compare token IDs between Tinker and external engine
# Tinker side: flatten the per-chunk token lists into one flat list so it is
# comparable with the flat ID list produced by the external tokenizer.
# (Chunks without a `tokens` attribute, e.g. image chunks, are skipped.)
model_input = renderer.build_supervised_example(messages)[0]
tinker_tokens = [
    token
    for chunk in model_input.chunks
    if hasattr(chunk, "tokens")
    for token in chunk.tokens
]
# External engine side:
external_tokens = list(tokenizer.apply_chat_template(messages, tokenize=True))
# Compare
assert tinker_tokens == external_tokens, "Token mismatch — check chat template and renderer"
```
For vision models, also verify:
- Image tokens are in the same positions
- Image preprocessing (resize, normalization) matches
- The number of image tokens per image matches
### Step 3: Check for known merge pitfalls
If the user must use a custom merge, the most common issues are:
- **MoE gate_up_proj fusion convention** — Concatenated (Qwen3.5, Qwen3-VL) vs interleaved (GPT-OSS). Wrong convention silently corrupts weights.
- **Precision loss** — LoRA merge math must be done in float32, then cast to bfloat16. Direct bfloat16 matmul introduces errors.
- **Tinker weight naming** — `w1`=gate, `w2`=down, `w3`=up. Swapping `w1`/`w3` is a common bug.
- **VL model prefix** — Vision-language models add `model.language_model.*` prefix that custom scripts often miss.
For full details (weight layout diagrams, validation scripts, tensor comparison code), read `references/merge-debugging.md`.
### Step 4: Try PEFT adapter as workaround
If merge issues persist, skip the merge entirely and serve the unmerged adapter:
```python
weights.build_lora_adapter(
base_model="Qwen/Qwen3-8B",
adapter_path=adapter_dir,
output_path="./peft_adapter",
)
# Then serve with vLLM: --lora-modules my_adapter=./peft_adapter
```
This lets the engine apply the LoRA at inference time, sidestepping merge-related precision and layout bugs.
### Step 5: Escalation for correctness issues
If the cookbook merge + correct prompts still produce wrong outputs, gather: model name/size, session ID, merge method, engine version, example input/output comparison, and token IDs confirming prompt equivalence.
---
## Service availability triage
When the user asks "is Tinker down?" or operations hang/fail unexpectedly, run a quick smoke test before deeper investigation. Many users can't distinguish a service outage from a bug in their code.
### Quick smoke test
Help the user run a two-step diagnostic to isolate where things break:
```python
import time
import tinker

svc = tinker.ServiceClient()

# Step 1: API reachable?
t0 = time.perf_counter()
try:
    caps = svc.get_server_capabilities()
    print(f"API reachable ({time.perf_counter() - t0:.2f}s), {len(caps.models)} models available")
except Exception as e:
    # Broad catch is deliberate: this is a diagnostic that reports any failure type.
    print(f"API unreachable: {type(e).__name__}: {e}")

# Step 2: Can we create a training session? (small model for speed)
try:
    tc = svc.create_lora_training_client(base_model="Qwen/Qwen3.5-9B-Base", rank=32)
    print(f"Training client created: {tc.get_info()}")
except Exception as e:
    print(f"Training client failed: {type(e).__name__}: {e}")
```
**Interpreting:** Step 1 fails → service down or network. Step 1 passes but Step 2 fails → model-specific capacity issue (try smaller model). Both pass but user's script fails → issue is in user's code.
### Common server-side errors
| Error | Meaning |
|-------|---------|
| `APIConnectionError` | Service down or network issue |
| `APITimeoutError` | Service overloaded |
| HTTP 402 | Billing blocked — check Tinker console |
| HTTP 429 | Rate limited — reduce concurrency |
| HTTP 500 | Server bug — gather session ID and report |
| Client creation hangs | Capacity shortage — SDK may not surface error clearly |
---
## Renderer triage
Renderer issues cause **silent training degradation** — the model trains on wrong tokens, producing poor results without any error messages. This is the most common source of subtle quality bugs.
The renderer converts chat-style messages into model-specific token sequences. If the renderer produces different tokens than the model expects, training teaches the wrong associations. The model may still train (loss goes down) but learn garbage.
### When to suspect a renderer issue
- Training loss decreases but model outputs are poor quality
- High KL divergence at step 0 (before any training) — indicates the prompt tokens don't match what the model expects
- Model produces garbled or off-distribution outputs
- Tool calling works in some models but not others
- Thinking/reasoning blocks appear or disappear unexpectedly
### Step 1: Verify you're using the right renderer
```python
from tinker_cookbook import model_info
renderer_name = model_info.get_recommended_renderer_name("Qwen/Qwen3-8B")
print(f"Recommended renderer: {renderer_name}")
```
Never hardcode renderer names. Each model family has specific token formats, and using the wrong renderer silently produces incorrect training data.
**Hybrid models (thinking + non-thinking)** are especially tricky. These models support both reasoning (`<think>` blocks) and direct responses. Using the wrong variant causes token-level mismatches:
| Model family | Thinking renderer | Non-thinking renderer | Notes |
|-------------|-------------------|----------------------|-------|
| Qwen3 | `qwen3` | `qwen3_disable_thinking` | Default is thinking-enabled. `qwen3_instruct` for instruction-only. |
| Qwen3.5 | `qwen3_5` | `qwen3_5_disable_thinking` | Hybrid attention; also has VL variants. |
| DeepSeek V3 | `deepseekv3_thinking` | `deepseekv3` | Default is non-thinking. Thinking adds `<think>` prefill. |
| Kimi K2.6 | `kimi_k26` | `kimi_k26_disable_thinking` | Vision-capable. |
| Nemotron3 | `nemotron3` | `nemotron3_disable_thinking` | |
**Common hybrid model mistakes:**
- Training on data with `<think>` blocks using a `_disable_thinking` renderer → Thinking tokens treated as regular text
- Using a thinking renderer but testing with `temperature=0` and short `max_tokens` → Model spends all tokens thinking, never produces an answer
- Comparing against HF template without passing `thinking=True` → Token mismatch that looks like a renderer bug but is a test setup issue
### Step 2: Compare tokens against HuggingFace
The ground truth is the model's HuggingFace tokenizer with `apply_chat_template`. Compare your renderer's output against it:
```python
from tinker_cookbook.renderers import get_renderer
from tinker_cookbook.tokenizer_utils import get_tokenizer
from tinker_cookbook.model_info import get_recommended_renderer_name

model_name = "Qwen/Qwen3-8B"
tokenizer = get_tokenizer(model_name)
renderer_name = get_recommended_renderer_name(model_name)
renderer = get_renderer(renderer_name, tokenizer)

# Test conversation
messages = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there!"},
]

# Cookbook tokens
cookbook_mi = renderer.build_generation_prompt(messages)
cookbook_tokens = cookbook_mi.to_ints()

# HuggingFace tokens (normalised to a plain list once, so the equality check
# and the length report below operate on the same object)
hf_tokens = list(
    tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
    )
)

if cookbook_tokens == hf_tokens:
    print("MATCH: Renderer tokens match HuggingFace")
else:
    print("MISMATCH: Renderer diverges from HuggingFace")
    # Find first divergence point
    for i, (a, b) in enumerate(zip(cookbook_tokens, hf_tokens)):
        if a != b:
            print(f" First diff at position {i}: cookbook={a} ({tokenizer.decode([a])!r}) vs HF={b} ({tokenizer.decode([b])!r})")
            break
    else:
        # zip() stops at the shorter sequence, so reaching here means the
        # sequences agree on the shared prefix and differ only in length.
        print(" No differing token in the shared prefix — one sequence is a prefix of the other")
    print(f" Cookbook length: {len(cookbook_tokens)}, HF length: {len(hf_tokens)}")
```
**If tokens match:** The renderer is correct. The issue is elsewhere (training config, data quality, etc.).
**If tokens don't match:** Check these common causes:
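- **Thinking mode**: Some models (Qwen3, DeepSeek) need a thinking flag passed to `apply_chat_template`. The keyword is template-specific (e.g. `enable_thinking` for Qwen3, `thinking` for DeepSeek V3.1), so check the model's chat template. The renderer handles this automatically, but make sure you're comparing apples to apples.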
- **System prompt**: Some renderers inject a default system prompt (Kimi K2). If your HF comparison doesn't include one, tokens will diverge.
- **Tool calling format**: Each model family uses a different tool call format. The renderer must match the model's expected format exactly.
### Step 3: Check thinking mode handling
For models with thinking capabilities (Qwen3, Qwen3.5/Qwen3.6, DeepSeek V3, Kimi K2.6, Nemotron3):
- Use the `_disable_thinking` renderer variant if you don't want `<think>` blocks
- Historical assistant messages may have thinking stripped by default (depends on renderer)
- If training on thinking data, ensure the renderer preserves `<think>` blocks in the training tokens
### Step 4: Validate tool calling
Tool call formats vary significantly across models. If tool calling quality is poor after training:
1. Check that the renderer's tool format matches what the model was pre-trained on
2. Verify `parse_response()` correctly extracts tool calls from model output
3. Compare tool call rendering against HF's `apply_chat_template(..., tools=tool_specs)`
For the full renderer reference (all models, formats, edge cases), read `references/renderer-debugging.md`.
---
## Error message decoder
Tinker errors can be opaque. This section maps common error messages to root causes and fixes.
### SDK / API errors
| Error message | Root cause | Fix |
|---------------|-----------|-----|
| `max() iterable argument is empty` | Empty token list in a `Datum` — usually an `EncodedTextChunk` with no tokens | Validate your data: ensure every datum has at least one non-empty chunk with tokens |
| `Could not convert loss function inputs to array record` | Extra fields in `loss_fn_inputs` that the loss function doesn't expect (e.g., `mask` field not stripped) | Use the cookbook's data helpers which strip extra fields automatically; or remove `mask` from `loss_fn_inputs` before passing to `forward_backward` |
| `Unknown client error` | Generic catch-all — often means the checkpoint type is wrong | If during sampling: did you pass a `save_state` path instead of `save_weights_for_sampler`? State checkpoints can't be used for sampling. |
| `prompt tokens + max_tokens > context window` | Prompt too long for the model | Reduce prompt length or `max_tokens`. The error message shows the specific limits. |
| `Failed after exhausting retries` | Transient server error that didn't recover | Check service availability (smoke test above). If service is up, retry with a fresh session. |
| `Access blocked` / HTTP 403 | No permission to access the model or checkpoint | Check API key, organization membership, or checkpoint visibility settings |
| HTTP 402 | Billing issue — account blocked | Add credits at the Tinker console billing page |
| HTTP 429 | Rate limited — too many concurrent requests | Reduce concurrency (default limit: ~2000 concurrent sampling requests). Add backoff/retry logic. |
| HTTP 409 on checkpoint save | Checkpoint already exists (retry after transient failure) | The original save succeeded. Check if the checkpoint is already there before retrying. |
| `session_id is required` with hint about SDK version | Tinker SDK too old | Run `pip install --upgrade tinker` |
### Cookbook errors
| Error message | Root cause | Fix |
|---------------|-----------|-----|
| `Unknown model: {name}` | Model name not in cookbook's registry | Check spelling; use `model_info.get_model_attributes()` to list known models |
| `tokens and weights must be the same length` | Mismatch between token sequence and loss weight array | Check your `datum_from_model_input_weights()` call — usually means `max_length` truncated tokens but not weights |
| `Expected X tokens, got Y from image` | VLM image token count mismatch — usually a `transformers` version issue | Upgrade `transformers>=5.0` or install `torchvision`. HuggingFace `Qwen2VLImageProcessor` had a bug in older versions. |
| `RendererError: Unknown renderer` | Invalid renderer name | Use `model_info.get_recommended_renderer_name(model_name)` |
| `image_processor is required to render image content` | VL renderer needs image processor for image inputs | Pass `image_processor` to `get_renderer()`. Use `get_image_processor(model_name)`. |
| `DataFormatError: Each line must contain a 'messages' field` | JSONL data file has wrong format | Each line must be a JSON object with a `messages` key containing a list of message dicts |
| `StreamingSupervisedDatasetFromHFDataset only supports forward iteration` | Tried to seek backward in streaming dataset | Streaming datasets are forward-only; don't try to restart from an earlier batch |
For the full error reference with additional edge cases, read `references/error-reference.md`.
---
## Decision tree summary
```
Output differs between Tinker and external engine
├─ Using custom merge script? → Switch to cookbook weights.build_hf_model()
├─ Prompts identical? → Compare token IDs and image preprocessing
├─ MoE model? → Check gate_up_proj fusion convention (concat vs interleave)
├─ Large model / numerical issues? → Check merge precision (float32), try PEFT adapter
└─ Cookbook merge + correct prompts + still wrong → Engine-specific issue → Escalate
Training is slow
├─ Check dependency versions (Step 1)
│ └─ Beta/pre-release pydantic? → Downgrade
├─ Check async pipelining (Step 2)
│ └─ Sequential API calls? → Pipeline them
├─ Get timing breakdown (Step 3)
│ ├─ High submit time → Profile request lifecycle (Step 4)
│ │ ├─ Slow serialization → Dependency issue
│ │ ├─ Slow data loading → Optimize data pipeline
│ │ └─ SDK thread stalled → GIL contention (Step 5)
│ └─ High wait time → GIL or network or server
│ ├─ Heartbeat warnings in logs → GIL contention (Step 5)
│ ├─ Fast API round-trip → Server-side → Escalate (Step 7)
│ └─ Slow API round-trip → Network issue
├─ Inconsistent step times → GIL contention (Step 5)
└─ First step slow, rest fast → Normal warm-up
Is Tinker down?
├─ Run smoke test (ServiceClient + create small training client)
│ ├─ API unreachable → Service down or network issue
│ ├─ API works but training client fails → Model-specific capacity issue
│ └─ Everything works → Issue is in user's code
└─ Check error: HTTP 402 = billing, 429 = rate limit, 500 = server bug
Training quality is poor (high KL, bad outputs)
├─ KL high at step 0? → Renderer mismatch (tokens don't match model's expected format)
├─ Compare renderer tokens vs HF apply_chat_template
│ ├─ Tokens match → Issue is elsewhere (LR, data quality, loss function)
│ └─ Tokens differ → Wrong renderer or renderer bug
├─ Tool calling broken? → Check renderer tool format matches model family
└─ Thinking blocks wrong? → Use correct _disable_thinking variant
```
## Common resolutions
| Symptom | Likely cause | Fix |
|---------|-------------|-----|
| Every step takes 5-10min | Missing async pipelining | Use `_async` variants, submit before await |
| First step slow, rest normal | Model warm-up / JIT | Expected behavior, no fix needed |
| Steps hang indefinitely | Dependency bug or network | Check pydantic version; try different machine |
| Slow with large batch_size | Payload serialization | Check pydantic version; reduce batch_size |
| Works on one machine, not another | Environment difference | Compare `pip freeze` outputs |
| GPU time looks fine but steps slow | Gaps between submissions | Pipeline: submit next step before awaiting current |
| Heartbeat warnings + slow steps | GIL contention (CPU work blocking SDK thread) | Preprocess data outside loop; use `TINKER_SUBPROCESS_SAMPLING=1` |
| Inconsistent step times | GIL contention varies with data batch | Move heavy numpy/torch CPU ops out of hot loop |
| Session expired unexpectedly | Missed heartbeats (SDK thread blocked) | Reduce GIL-holding operations; use subprocess sampling |
| Output correct in Tinker, wrong in vLLM/SGLang | Merge bug (usually fused projection layout) | Use cookbook `weights.build_hf_model()` |
| Model produces invalid/garbled outputs after export | Wrong gate_up_proj convention or precision loss | Check concat vs interleave; merge in float32 |
| Numerical instability after deploy | Merge precision or engine quantization | Try `build_lora_adapter()` instead of merging |
| Outputs subtly different but not completely wrong | bfloat16 merge rounding | Merge in float32, cast back after |
| High KL at step 0 (before any training) | Renderer produces wrong tokens | Compare renderer tokens vs HF `apply_chat_template()` |
| Training loss drops but outputs are poor | Renderer mismatch or wrong `train_on_what` | Verify renderer; check loss weight masking |
| `create_lora_training_client` hangs forever | Capacity shortage for that model | Try smaller model; check service availability |
| `Expected X tokens, got Y from image` | `transformers` version bug in VLM processor | Upgrade to `transformers>=5.0` |
| `max() iterable argument is empty` | Empty token list in datum | Validate all datums have non-empty chunks |
| Operations work but sporadically fail/hang | Service under load or transient issues | Add retry logic; gather session IDs for reports |
## Code references
**Key imports for debugging:**
```python
from tinker_cookbook import model_info, weights
from tinker_cookbook.renderers import get_renderer, get_registered_renderer_names
from tinker_cookbook.tokenizer_utils import get_tokenizer
from tinker_cookbook.utils import trace, ml_log
import tinker
```
**Reference files in this skill:**
- `references/async-task-dump.md` — Ready-to-paste diagnostic for hung sessions
- `references/serialization-test.md` — Pydantic regression benchmark
- `references/merge-debugging.md` — Weight layout conventions and fusion formats
- `references/renderer-debugging.md` — Renderer validation, token comparison, model-specific quirks
- `references/error-reference.md` — Extended error message decoder with edge cases
## /skills/debug/references/async-task-dump.md
# Async Task Dump Diagnostic
Ready-to-paste diagnostic code for investigating hung or slow training sessions. This instruments both the main asyncio event loop and the Tinker SDK's internal event loop to reveal what's blocking.
## What this does
1. **Periodic task dump** — Every second, prints all active asyncio tasks in both the main event loop and the SDK's internal `InternalClientHolder` thread
2. **Thread stack watchdog** — Every second, prints Python stack traces for the main and SDK threads, so you can see exactly where they're blocked even if it's synchronous code
3. **aiomonitor** — Opens telnet/web ports for interactive async debugging (optional, requires `pip install aiomonitor`)
## When to use
- A training step has been running for much longer than expected
- The training loop appears completely hung (no log output)
- You've confirmed with pyinstrument that time is in `epoll.poll` but need to know what the SDK thread is doing
## Setup
```bash
pip install aiomonitor
```
## The diagnostic code
Paste this at the end of the user's script, replacing their `asyncio.run(main(config))` call:
```python
import asyncio
# ── Configuration ──────────────────────────────────────────────────
# Host passed to aiomonitor.start_monitor for both monitors.
AIOMONITOR_HOST = "127.0.0.1"
# Keyword arguments for the monitor attached to the SDK holder thread's loop.
HOLDER_AIOMONITOR = {"port": 20101, "webui_port": 20102, "console_port": 20103}
# Keyword arguments for the monitor attached to the main event loop.
MAIN_AIOMONITOR = {"port": 21101, "webui_port": 21102, "console_port": 21103}
# Seconds the periodic task dump sleeps between snapshots.
TASK_DUMP_INTERVAL_SEC = 1.0
# ── Periodic async task dump ──────────────────────────────────────
async def _periodic_dump_tasks(loop, label):
    """Print a snapshot of the asyncio tasks on ``loop`` at a fixed interval, forever.

    Args:
        loop: Event loop whose tasks are listed via ``asyncio.all_tasks``.
        label: Tag included in the header and footer of every dump.
    """
    import time

    def _describe_state(task):
        # Classify a task as pending, cancelled, or done (with its exception, if any).
        if task.cancelled():
            return "cancelled"
        if not task.done():
            return "pending"
        try:
            exc = task.exception()
        except asyncio.CancelledError:
            return "cancelled"
        if exc is None:
            return "done"
        return f"done exc={type(exc).__name__}:{exc}"

    while True:
        await asyncio.sleep(TASK_DUMP_INTERVAL_SEC)
        ts = time.strftime("%Y-%m-%d %H:%M:%S")
        tasks = asyncio.all_tasks(loop)
        print(
            f"\n===== {ts} asyncio task dump [{label}] "
            f"{len(tasks)} task(s) @ interval={TASK_DUMP_INTERVAL_SEC}s ====="
        )
        # Sort by name (then identity) so consecutive dumps are easy to compare.
        for t in sorted(tasks, key=lambda x: (x.get_name(), id(x))):
            coro = t.get_coro()
            coro_s = "None" if coro is None else repr(coro)
            if len(coro_s) > 240:
                # Keep each line bounded: 237 characters plus an ellipsis.
                coro_s = coro_s[:237] + "..."
            state = _describe_state(t)
            print(f" {t.get_name()!r}: {state} {coro_s}")
        print(f"===== {ts} end [{label}] =====\n", flush=True)
# ── Thread stack watchdog ─────────────────────────────────────────
# Seconds the watchdog thread sleeps between stack dumps.
_WATCHDOG_INTERVAL_SEC = 1.0
# Set to True once the watchdog thread has been started.
_watchdog_started = False
# threading.Lock guarding the state below; created lazily on first use.
_watchdog_lock = None
# Maps thread ident -> label for the threads whose stacks get printed.
_watchdog_targets = {}
def _watchdog_register_this_thread(label):
    """Record the calling thread under ``label`` so the stack watchdog reports it.

    Args:
        label: Name shown next to this thread's stack in the watchdog output.
    """
    import threading

    global _watchdog_lock, _watchdog_targets
    # The shared lock starts out unset; create it on first use.
    if _watchdog_lock is None:
        _watchdog_lock = threading.Lock()
    caller = threading.current_thread()
    with _watchdog_lock:
        _watchdog_targets[caller.ident] = label
def _ensure_thread_stack_watchdog():
    """Start the daemon thread that periodically prints registered threads' stacks.

    Only the first call starts the thread; later calls return without
    doing anything once the started flag is set.
    """
    import sys
    import threading
    import time
    import traceback

    global _watchdog_started, _watchdog_lock
    # The shared lock starts out unset; create it on first use.
    if _watchdog_lock is None:
        _watchdog_lock = threading.Lock()

    def watchdog_loop():
        while True:
            time.sleep(_WATCHDOG_INTERVAL_SEC)
            ts = time.strftime("%Y-%m-%d %H:%M:%S")
            report = [
                f"\n{'#' * 72}",
                f"# {ts} stacks: main + holder threads only",
                f"{'#' * 72}\n",
            ]
            # Copy under the lock so registrations made during the dump don't interfere.
            with _watchdog_lock:
                snapshot = dict(_watchdog_targets)
            if snapshot:
                frames = sys._current_frames()
                # Order the sections by label.
                for ident, label in sorted(snapshot.items(), key=lambda kv: kv[1]):
                    frame = frames.get(ident)
                    if frame is None:
                        report.append(f"--- [{label}] ident={ident} (no frame) ---\n")
                    else:
                        report.append(f"--- [{label}] ident={ident} ---")
                        report.append("".join(traceback.format_stack(frame, limit=80)))
                        report.append("")
            else:
                report.append("(no threads registered yet)\n")
            print("\n".join(report), flush=True)

    # Test-and-set the started flag under the lock so the thread is launched once.
    with _watchdog_lock:
        already_running = _watchdog_started
        _watchdog_started = True
    if already_running:
        return
    threading.Thread(target=watchdog_loop, name="stack-watchdog", daemon=True).start()
# ── Patch SDK internal thread to add monitoring ───────────────────
def _patch_internal_client_holder_background_thread():
    """Replace the SDK holder thread's entry point with a monitored variant.

    The replacement registers the thread with the stack watchdog, attaches
    an aiomonitor instance to the holder's loop, schedules the periodic
    task dump on that loop, and then runs the loop forever.
    """
    import aiomonitor
    from tinker.lib.internal_client_holder import InternalClientHolderThreadSingleton

    def _background_thread_func_with_monitor(self):
        assert self._loop is not None
        holder_loop = self._loop
        _watchdog_register_this_thread("holder")
        print(
            f"aiomonitor [holder thread]: telnet {AIOMONITOR_HOST}:{HOLDER_AIOMONITOR['port']} "
            f"web http://{AIOMONITOR_HOST}:{HOLDER_AIOMONITOR['webui_port']} "
            f"console {AIOMONITOR_HOST}:{HOLDER_AIOMONITOR['console_port']}"
        )
        # Run the loop inside the monitor context so the monitor stays up while it runs.
        with aiomonitor.start_monitor(holder_loop, host=AIOMONITOR_HOST, **HOLDER_AIOMONITOR):
            holder_loop.create_task(
                _periodic_dump_tasks(holder_loop, "holder"),
                name="periodic-task-dump-holder",
            )
            holder_loop.run_forever()

    # Install the monitored function on the class under the original attribute name.
    InternalClientHolderThreadSingleton._background_thread_func = (
        _background_thread_func_with_monitor
    )
# ── Main runner with monitoring ───────────────────────────────────
def _run_main_with_aiomonitor(task):
    """Run ``task`` to completion on a fresh event loop with monitoring attached.

    Intended as a replacement for ``asyncio.run(task)``: it registers the
    calling thread with the stack watchdog, attaches aiomonitor to a new
    loop, schedules the periodic task dump, and runs ``task``. Afterwards
    the dump task is cancelled and the loop is shut down and closed, even
    if ``task`` raised.

    Args:
        task: The awaitable to run, e.g. ``main(config)``.

    Returns:
        The result of ``task``, as ``asyncio.run`` would return it.
    """
    import contextlib
    import aiomonitor

    _watchdog_register_this_thread("main")
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    dump_t = None
    try:
        print(
            f"aiomonitor [main]: telnet {AIOMONITOR_HOST}:{MAIN_AIOMONITOR['port']} "
            f"web http://{AIOMONITOR_HOST}:{MAIN_AIOMONITOR['webui_port']} "
            f"console {AIOMONITOR_HOST}:{MAIN_AIOMONITOR['console_port']}"
        )
        with aiomonitor.start_monitor(loop, host=AIOMONITOR_HOST, **MAIN_AIOMONITOR):
            dump_t = loop.create_task(
                _periodic_dump_tasks(loop, "main"),
                name="periodic-task-dump-main",
            )
            # Hand the coroutine's result back to the caller instead of dropping it.
            return loop.run_until_complete(task)
    finally:
        # The dump task loops forever, so it must be cancelled and drained
        # before the loop can be closed.
        if dump_t is not None and not dump_t.done():
            dump_t.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                loop.run_until_complete(dump_t)
        with contextlib.suppress(RuntimeError):
            loop.run_until_complete(loop.shutdown_asyncgens())
        loop.close()
# ── Activate everything ───────────────────────────────────────────
_ensure_thread_stack_watchdog()
_patch_internal_client_holder_background_thread()
# Replace: asyncio.run(main(config))
# With:
_run_main_with_aiomonitor(main(config))
```
## Interpreting the output
### Healthy training (no stalls)
You should see both `[main]` and `[holder]` task dumps every second, and the holder thread should cycle through `_forward_backward_async` / `_optim_step_async` / `_result_async` tasks as work is submitted and completed.
### Client-side stall
If the `[main]` thread stops logging for several seconds, but `[holder]` keeps logging, the main thread is blocked on synchronous work. Check the stack watchdog output — it will show exactly where the main thread is stuck (data loading, tokenizer import, etc.).
### SDK serialization stall
If the `[holder]` thread stops logging, the SDK's internal event loop is blocked by synchronous code. The stack watchdog will reveal the exact call — commonly pydantic `model_dump()` or `__repr__` calls during payload serialization.
Example stack trace from a pydantic serialization stall:
```
File ".../tinker/_compat.py", line 143, in model_dump
return model.model_dump(
File ".../pydantic/main.py", line 479, in model_dump
return self.__pydantic_serializer__.to_python(
File ".../pydantic/_internal/_serializers.py", line 44, in serialize_sequence_via_list
v = handler(item, index)
```
This indicates a pydantic version issue — see Step 1 in the main skill.
### Server-side wait
If both threads are logging normally and the holder shows pending `_result_async` tasks, the SDK has submitted work and is waiting for the server. Share the session ID and these logs with the Tinker team.
## Simplified version (no aiomonitor)
If the user can't install aiomonitor, use just the stack watchdog:
```python
import asyncio, sys, threading, time, traceback
_watchdog_targets = {}
_lock = threading.Lock()
def register_thread(label):
    """Record the calling thread under ``label`` so the watchdog dumps its stack."""
    caller = threading.current_thread()
    with _lock:
        _watchdog_targets[caller.ident] = label
def start_watchdog(interval=1.0):
    """Start a daemon thread that periodically prints registered threads' stacks.

    Every ``interval`` seconds the thread copies the registered targets under
    the lock, then prints up to 10 stack frames for each registered thread
    that currently has a frame.
    """

    def _dump_forever():
        while True:
            time.sleep(interval)
            stamp = time.strftime("%H:%M:%S")
            # Copy under the lock so registration is never blocked while printing.
            with _lock:
                snapshot = dict(_watchdog_targets)
            live_frames = sys._current_frames()
            for thread_id, label in snapshot.items():
                frame = live_frames.get(thread_id)
                if not frame:
                    continue
                rendered = "".join(traceback.format_stack(frame, limit=10))
                print(f"\n[{stamp}] {label}:\n{rendered}", flush=True)

    watcher = threading.Thread(target=_dump_forever, daemon=True)
    watcher.start()
# Add before any tinker imports:
register_thread("main")
start_watchdog()
# Then monkey-patch the holder thread to register itself:
from tinker.lib.internal_client_holder import InternalClientHolderThreadSingleton
_orig = InternalClientHolderThreadSingleton._background_thread_func
def _patched(self):
register_thread("holder")
_orig(self)
InternalClientHolderThreadSingleton._background_thread_func = _patched
# ... rest of script ...
```
## /skills/debug/references/error-reference.md
# Error Reference
Extended decoder for Tinker SDK and cookbook error messages. Organized by where the error originates.
## Tinker SDK errors (from the service)
### HTTP 400 — Bad Request
These are validation errors. The request was malformed or had invalid parameters.
| Error detail | Cause | Fix |
|-------------|-------|-----|
| `base_model is required` | Missing model name in `create_lora_training_client` | Pass `base_model="org/model-name"` |
| `lora_config.rank must be positive` | LoRA rank <= 0 | Use a positive rank (typically 32) |
| `lora_config.rank must be a power of 2` | Rank like 48 or 100 | Use 16, 32, 64, etc. |
| `At least one of train_unembed, train_mlp, or train_attn must be True` | All training flags disabled | Enable at least one (defaults are all True) |
| `Prompt length plus max_tokens exceeds the model's context window: {N} + {M} > {limit}` | Input too long | Reduce prompt length or max_tokens. Error shows exact numbers. |
| `session_id is required. The version of the Tinker SDK you are using is no longer supported.` | SDK too old | `pip install --upgrade tinker` |
### HTTP 402 — Payment Required
Billing account is blocked. Top up at the Tinker console billing page.
### HTTP 403 — Forbidden
| Error detail | Cause | Fix |
|-------------|-------|-----|
| `You do not have access to this model` | No permission for the model or checkpoint | Check API key; request access to the model/org |
| `Invalid session_id` | Session doesn't exist or belongs to another user | Check session ID; create a new session |
| Generic 403 on checkpoint | Checkpoint is private and user isn't the owner | Request that the checkpoint owner make it public, or get project/org access |
### HTTP 404 — Not Found
Model, session, or checkpoint doesn't exist. Double-check:
- Model name spelling (e.g., `Qwen/Qwen3-8B` not `qwen3-8b`)
- Session ID (use `tc.get_info()` to verify)
- Checkpoint path format (`tinker://session-id/sampler_weights/name`)
- That you used `save_weights_for_sampler` (not `save_state`) for sampling/download
### HTTP 409 — Conflict
Resource already exists. Most commonly:
- **Checkpoint save retry**: The first save actually succeeded (network hiccup made it look like it failed). Check if the checkpoint exists before retrying.
- **Session conflict**: Session already exists with different metadata. Use a different session ID.
### HTTP 429 — Rate Limited
Too many concurrent requests. Default limits:
- ~2000 concurrent sampling requests per user
- Tightens under high service load
Fix: Add backoff/retry logic. Reduce concurrency (smaller `num_samples`, fewer parallel `asyncio.gather` calls).
### HTTP 500 — Internal Server Error
Unhandled server-side error. Gather:
1. Session ID (`tc.get_info()`)
2. What operation failed
3. Timestamp
4. Tinker SDK version
Report to the Tinker team.
### HTTP 504 — Gateway Timeout
`Timeout while publishing request` — the server's internal queue is full or slow. Usually transient. Retry after a few seconds.
## Tinker SDK Python exceptions
| Exception | HTTP code | Retryable? | Notes |
|-----------|----------|-----------|-------|
| `tinker.AuthenticationError` | 401/403 | No | Check API key |
| `tinker.BadRequestError` | 400 | No | Fix the request |
| `tinker.RateLimitError` | 429 | Yes | Back off and retry |
| `tinker.APITimeoutError` | timeout | Yes | Increase timeout or retry |
| `tinker.APIConnectionError` | network | Yes | Check connectivity |
| `tinker.TinkerError` | varies | Maybe | Catch-all base class |
### Request-level error codes
These appear in `ForwardBackwardResult` or sampling responses:
| Code | Meaning | Action |
|------|---------|--------|
| `NonFiniteInTensor` | NaN or Inf in input data | Check your data for invalid values; ensure loss weights don't produce Inf |
| `PromptTooLong` | Prompt exceeds context window | Reduce input length |
| `ResourceAlreadyExists` | Duplicate resource | Handle idempotently |
## Cookbook exceptions
### Configuration errors
| Exception | Common triggers |
|-----------|----------------|
| `ConfigurationError("Unknown model: {name}")` | Model name not in cookbook registry. Check spelling; use `model_info.get_model_attributes()`. |
| `ConfigurationError("Log directory already exists")` | Resume from different path, or delete the old log directory |
### Data errors
| Exception | Common triggers |
|-----------|----------------|
| `DataFormatError("Each line must contain a 'messages' field")` | JSONL file lines missing `messages` key |
| `DataValidationError("Cannot seek backward")` | Streaming dataset used with random access |
| `ValueError("tokens and weights must be the same length")` | `max_length` truncated tokens but not weights. Use `datum_from_model_input_weights()` which handles this. |
### Renderer errors
| Exception | Common triggers |
|-----------|----------------|
| `RendererError("Unknown renderer")` | Invalid renderer name. Use `get_recommended_renderer_name()`. |
| `RendererError("requires an image_processor")` | VL renderer created without image processor |
| `RendererError("Expected text content, got multimodal content")` | Passed image content to a text-only renderer |
### Weight errors
| Exception | Common triggers |
|-----------|----------------|
| `WeightsDownloadError` | Invalid tinker:// path, or checkpoint doesn't exist. Verify with `tinker checkpoint list`. |
| `WeightsMergeError` | Adapter incompatible with base model. Check model name matches exactly. |
| `WeightsAdapterError` | Can't convert to PEFT format. Check for empty expert tensors (known issue with some models). |
### Training errors
| Exception | Common triggers |
|-----------|----------------|
| `CheckpointError` | Checkpoint save/load failed. Check path format and permissions. |
| `AllTrajectoriesFailedError` | Every trajectory in a rollout group failed. Check environment code; look at rollout logs. |
## Upstream library errors
| Error | Library | Cause | Fix |
|-------|---------|-------|-----|
| `Expected N tokens, got M from image` | `transformers` < 5.0 | Bug in `Qwen2VLImageProcessor` | Upgrade: `pip install 'transformers>=5.0'` |
| DeepSeek tokenizer loading fails | `transformers` == 5.3.0 | Incorrect `tokenizer_class` on hub (huggingface/transformers#44801) | Upgrade to `transformers>=5.3.1` |
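| `ModuleNotFoundError: pkg_resources` | Python 3.14 | `pkg_resources` is part of `setuptools` (not the stdlib), and `setuptools` is no longer pre-installed in virtual environments on Python 3.12+ | Install `setuptools` into the environment, downgrade Python, or update the offending package |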
| `401 Unauthorized` on tokenizer download | HuggingFace | Gated model (Llama) needs auth | Set `HF_TOKEN` environment variable |
| Corrupted tokenizer cache | HuggingFace | Cache corruption | Delete `~/.cache/huggingface/hub/models--{org}--{model}/` and retry |
## /skills/debug/references/merge-debugging.md
# Merge Debugging Reference
Technical reference for debugging weight merge issues between Tinker adapters and HuggingFace models. Read this when the user has output mismatches between Tinker sampling and external inference engines.
## Tinker adapter weight naming
Tinker adapters store LoRA A/B matrices with these naming conventions:
```
base_model.model.layers.{N}.mlp.gate_proj.lora_A.weight # w1 = gate
base_model.model.layers.{N}.mlp.down_proj.lora_A.weight # w2 = down
base_model.model.layers.{N}.mlp.up_proj.lora_A.weight # w3 = up
base_model.model.layers.{N}.self_attn.q_proj.lora_A.weight
base_model.model.layers.{N}.self_attn.k_proj.lora_A.weight
base_model.model.layers.{N}.self_attn.v_proj.lora_A.weight
base_model.model.layers.{N}.self_attn.o_proj.lora_A.weight
```
For MoE models, expert weights are 3D tensors: `(num_experts, rank, dim)`.
## MoE expert weight layouts
Different model families fuse gate and up projections differently in HuggingFace format. Getting this wrong is the #1 cause of merge-related output mismatches.
### Separate layout (Qwen3 MoE, DeepSeek, Kimi)
Each expert has individual weight files:
```
model.layers.{N}.mlp.experts.{E}.gate_proj.weight # (out_dim, in_dim)
model.layers.{N}.mlp.experts.{E}.down_proj.weight
model.layers.{N}.mlp.experts.{E}.up_proj.weight
```
Merge is straightforward — apply LoRA delta to each expert independently.
### Fused concatenated layout (Qwen3.5, Qwen3-VL)
Gate and up projections are concatenated into a single tensor:
```
model.layers.{N}.mlp.experts.gate_up_proj # (num_experts, out_dim*2, in_dim) or (num_experts, in_dim, out_dim*2)
```
The first half (along the fused dimension) is `gate`, the second half is `up`:
```python
# Splitting:
gate = gate_up_proj[..., :out_dim]
up = gate_up_proj[..., out_dim:]
# Merging LoRA delta for gate (w1):
gate_up_proj[..., :out_dim] += lora_B_gate @ lora_A_gate
# Merging LoRA delta for up (w3):
gate_up_proj[..., out_dim:] += lora_B_up @ lora_A_up
```
**Important:** The fused dimension can be dim 1 or dim 2 depending on the model:
- Qwen3-VL: `(num_experts, in_dim, out_dim*2)` — fused on last dim
- Qwen3.5: `(num_experts, out_dim*2, in_dim)` — fused on middle dim (delta must be transposed)
The cookbook's `_detect_fused_axis()` handles this automatically.
### Fused interleaved layout (GPT-OSS)
Gate and up elements alternate:
```
model.layers.{N}.mlp.experts.gate_up_proj
# Layout: [g0, u0, g1, u1, g2, u2, ...]
```
```python
# Splitting:
gate = gate_up_proj[..., 0::2] # even indices
up = gate_up_proj[..., 1::2] # odd indices
# Merging LoRA delta for gate:
gate_up_proj[..., 0::2] += delta_gate
# Merging LoRA delta for up:
gate_up_proj[..., 1::2] += delta_up
```
**Using concatenation when the model expects interleaving (or vice versa) silently corrupts the weights.** The model will still generate text, but outputs will be degraded or invalid.
## Vision-language model considerations
VL models add a `model.language_model.*` prefix to language model weights:
```
# Standard model:
model.layers.0.mlp.gate_proj.weight
# VL model:
model.language_model.layers.0.mlp.gate_proj.weight
```
The adapter weights don't have this prefix, so the merge code must add it. The cookbook handles this via `has_language_model_prefix` in the merge profile.
## Split QKV projections (Qwen3.5)
Qwen3.5 models with hybrid attention (some layers use linear attention) fuse Q/K/V into a single `in_proj_qkv`:
```
model.layers.{N}.self_attn.in_proj_qkv.weight # (q_dim + k_dim + v_dim, hidden)
```
Tinker trains separate Q/K/V adapters. During merge, the adapter deltas must be written to the correct row range:
```python
q_rows = lora_B_q.shape[0] # e.g., 4096
k_rows = lora_B_k.shape[0] # e.g., 512
v_rows = lora_B_v.shape[0] # e.g., 512
# Q delta goes to rows [0 : q_rows]
# K delta goes to rows [q_rows : q_rows + k_rows]
# V delta goes to rows [q_rows + k_rows : q_rows + k_rows + v_rows]
```
## Diagnostic: comparing merge outputs
To verify a merge is correct, compare against Tinker's sampling output on a known input:
```python
import torch
from safetensors.torch import load_file
# Load a few expert weights from both merges
cookbook = load_file("cookbook_merge/model-00003-of-00010.safetensors")
custom = load_file("custom_merge/model-00003-of-00010.safetensors")
# Focus on MLP expert weights (where fusion bugs manifest)
for key in sorted(cookbook.keys()):
if "experts" in key and "mlp" in key:
if key in custom:
diff = (cookbook[key].float() - custom[key].float()).abs()
if diff.max() > 1e-4:
print(f"MISMATCH {key}: max={diff.max():.6f} mean={diff.mean():.6f}")
else:
print(f"OK {key}: max={diff.max():.6f}")
```
If mismatches cluster in `gate_up_proj` weights, the fusion convention is wrong.
## Diagnostic: end-to-end output comparison
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the merged checkpoint exactly as an external inference engine would.
model = AutoModelForCausalLM.from_pretrained("./merged_model", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("./merged_model")
# Use the exact same tokens as Tinker
input_ids = torch.tensor([tinker_token_ids], device=model.device)
with torch.no_grad():
    # Greedy decoding: do_sample=False is sufficient on its own. Temperature is
    # a sampling-only parameter, so it is not passed here.
    out = model.generate(input_ids, max_new_tokens=100, do_sample=False)
print(tokenizer.decode(out[0]))
# Compare with Tinker sampling output
```
## Using PEFT adapter as a workaround
If merge issues persist and time is critical, serve the unmerged adapter via PEFT:
```python
from tinker_cookbook import weights
weights.build_lora_adapter(
base_model="Qwen/Qwen3-8B",
adapter_path="./adapter",
output_path="./peft_adapter",
)
```
Then serve with vLLM:
```bash
vllm serve Qwen/Qwen3-8B --lora-modules my_adapter=./peft_adapter
```
This avoids merge entirely — the engine applies the LoRA at inference time. It's slightly slower but eliminates merge-related bugs.
## /skills/debug/references/renderer-debugging.md
# Renderer Debugging Reference
How to diagnose and fix renderer issues that cause silent training quality degradation.
## How renderers work
A renderer converts a list of chat messages into model-specific token sequences for training and sampling. Each model family (Llama3, Qwen3, DeepSeek, etc.) has its own token format with specific role delimiters, special tokens, and conventions.
Key renderer methods:
- `build_generation_prompt(messages)` → `ModelInput` (tokens for sampling)
- `build_supervised_example(messages)` → `(ModelInput, weights)` (tokens + loss mask for training)
- `parse_response(token_ids)` → `(Message, success)` (decode sampled tokens back to a message)
## Check your transformers version first
Different `transformers` versions have known bugs that affect specific models:
- `transformers == 5.3.0`: Incorrect `tokenizer_class` for DeepSeek V2/V3 on the hub (huggingface/transformers#44801, fixed in 5.3.1). Causes tokenizer loading to fail or use the wrong tokenizer class.
- `transformers < 5.0`: Bug in `Qwen2VLImageProcessor` — miscounts image tokens for VL models
- Always print `transformers.__version__` in your debug output
```python
import transformers
print(f"transformers: {transformers.__version__}")
```
## Full token comparison recipe
This script compares the cookbook renderer against HuggingFace's `apply_chat_template` for a given model and conversation:
```python
"""
Renderer parity check — verifies cookbook renderer matches HuggingFace tokenizer.
Usage: python renderer_check.py
"""
from tinker_cookbook.renderers import get_renderer
from tinker_cookbook.tokenizer_utils import get_tokenizer
from tinker_cookbook.model_info import get_recommended_renderer_name
MODEL_NAME = "Qwen/Qwen3-8B" # Change this
# Setup
tokenizer = get_tokenizer(MODEL_NAME)
renderer_name = get_recommended_renderer_name(MODEL_NAME)
renderer = get_renderer(renderer_name, tokenizer)
# Test conversations — try several patterns
test_cases = [
# Basic conversation
[
{"role": "user", "content": "What is 2+2?"},
{"role": "assistant", "content": "4"},
],
# With system message
[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello"},
],
# Multi-turn
[
{"role": "user", "content": "Hi"},
{"role": "assistant", "content": "Hello!"},
{"role": "user", "content": "How are you?"},
],
]
for i, messages in enumerate(test_cases):
    print(f"\n=== Test case {i + 1} ===")
    # Cookbook tokens
    cookbook_mi = renderer.build_generation_prompt(messages)
    cookbook_tokens = cookbook_mi.to_ints()
    # HuggingFace tokens. return_dict=False requests the plain list of token
    # ids: transformers 5.x returns a BatchEncoding by default, and list() of
    # that would yield its keys ("input_ids", ...) instead of the ids.
    hf_tokens = list(tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=False,
    ))
    if cookbook_tokens == hf_tokens:
        print(f"PASS: {len(cookbook_tokens)} tokens match")
    else:
        print(f"FAIL: cookbook={len(cookbook_tokens)} tokens, HF={len(hf_tokens)} tokens")
        # Find divergence point
        min_len = min(len(cookbook_tokens), len(hf_tokens))
        for j in range(min_len):
            if cookbook_tokens[j] != hf_tokens[j]:
                ctx_start = max(0, j - 3)
                print(f" First diff at position {j}:")
                print(f" cookbook[{ctx_start}:{j+3}]: {cookbook_tokens[ctx_start:j+3]}")
                print(f" HF[{ctx_start}:{j+3}]: {hf_tokens[ctx_start:j+3]}")
                print(f" cookbook decoded: {tokenizer.decode(cookbook_tokens[max(0,j-5):j+5])!r}")
                print(f" HF decoded: {tokenizer.decode(hf_tokens[max(0,j-5):j+5])!r}")
                break
        # One sequence may be a strict prefix of the other, so report the
        # length difference independently of the element-wise scan.
        if len(cookbook_tokens) != len(hf_tokens):
            print(f" Length diff: cookbook has {len(cookbook_tokens) - len(hf_tokens):+d} tokens")
```
## Thinking mode
Models with thinking capabilities (Qwen3, Qwen3.5/Qwen3.6, DeepSeek V3, Kimi K2.6, Nemotron3) have two renderer variants:
- **With thinking** (`qwen3`, `qwen3_5`, `deepseekv3_thinking`): Model produces `<think>...</think>` blocks before responding
- **Without thinking** (`qwen3_disable_thinking`, `qwen3_5_disable_thinking`, `deepseekv3`): Thinking is suppressed
(Qwen3.5 and Qwen3.6 share the same renderers: `qwen3_5` and `qwen3_5_disable_thinking`.)
Common issues:
- Training on data with `<think>` blocks using a non-thinking renderer → Thinking tokens get wrong loss weights
- Using a thinking renderer but disabling thinking in the HF template (e.g. `enable_thinking=False` for Qwen3; the kwarg name is template-specific) → Token mismatch
- Historical assistant messages may have thinking stripped by default (`strip_thinking_from_history=True` in Qwen3)
When comparing against HF for thinking models:
```python
# Qwen3's chat template reads the `enable_thinking` variable. Unknown kwargs
# are silently ignored by the template, so a misspelled switch changes nothing.
# For Qwen3 with thinking enabled:
hf_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, enable_thinking=True)
# For Qwen3 with thinking disabled:
hf_tokens = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, enable_thinking=False)
```
## Tool calling
Each model family has a different tool call format:
- **Qwen3**: `<tool_call>\n{json}\n</tool_call>` tags
- **DeepSeek V3**: Special tokens like `<tool_calls_begin>`, `<tool_calls_end>`
- **Kimi K2**: `## FunctionCall {json}` format
- **Llama3**: Bare JSON (unreliable parsing — tool calling not well-supported)
When comparing tool-calling conversations against HF:
```python
# Must pass tools parameter to HF
hf_tokens = tokenizer.apply_chat_template(
messages,
tools=tool_specs, # List of tool schema dicts
add_generation_prompt=True,
tokenize=True,
)
```
Not all renderers have tool formats that match HF — some intentionally diverge. Check the model's renderer implementation if tool calling quality is poor.
## KL divergence at step 0
If KL divergence is high at the very first training step (before any gradient updates), the renderer is likely producing tokens the model doesn't expect. The model's log-probabilities on the "correct" tokens are low because those tokens don't match its learned distribution.
Diagnosis:
1. Run the token comparison above
2. If tokens match HF, the renderer is correct — high step-0 KL may indicate the data distribution is far from the model's pre-training distribution (especially common with gpt-oss models on specialized data)
3. If tokens don't match, fix the renderer first
## Available renderers
Use `get_recommended_renderer_name()` — never hardcode:
- **Llama 3.x**: `llama3`
- **Qwen3**: `qwen3`, `qwen3_disable_thinking`, `qwen3_instruct`, `qwen3_vl`, `qwen3_vl_instruct`
- **Qwen3.5**: `qwen3_5`, `qwen3_5_disable_thinking`
- **DeepSeek V3**: `deepseekv3` (no thinking), `deepseekv3_thinking`
- **Kimi K2.6**: `kimi_k26`, `kimi_k26_disable_thinking`
- **Nemotron3**: `nemotron3`, `nemotron3_disable_thinking`
- **GPT-OSS**: `gpt_oss_no_sysprompt`, `gpt_oss_low_reasoning`, `gpt_oss_medium_reasoning`, `gpt_oss_high_reasoning`
- **Generic fallback**: `role_colon`
## Vision model considerations
Vision-language (VL) models add image tokens alongside text tokens, creating additional points of failure.
### Setup
VL renderers require an image processor:
```python
from tinker_cookbook.image_processing_utils import get_image_processor
from tinker_cookbook.renderers import get_renderer
from tinker_cookbook.tokenizer_utils import get_tokenizer
model_name = "Qwen/Qwen3.6-35B-A3B"
tokenizer = get_tokenizer(model_name)
image_processor = get_image_processor(model_name)
renderer = get_renderer(
"qwen3_5",
tokenizer,
image_processor=image_processor, # Required when messages contain images
)
```
Forgetting `image_processor` raises `AssertionError("image_processor is required to render image content")` as soon as you render image content.
### VL renderers by model
| Model | Renderer | Notes |
|-------|----------|-------|
| Qwen3.5/Qwen3.6 (VL variants) | `qwen3_5` | Same renderer handles VL when image_processor provided |
| Kimi K2.6 | `kimi_k26` | Supports vision natively |
### Common VL issues
**`Expected X tokens, got Y from image`**
This is the most frequently reported VL bug. It's caused by a bug in HuggingFace's `Qwen2VLImageProcessor` in `transformers < 5.0` that miscounts image tokens.
Fix: `pip install 'transformers>=5.0'` (or install `torchvision` as an alternative workaround).
**Image token count mismatch between training and serving**
Different image resolutions produce different numbers of image tokens. The image processor determines this. Ensure:
- Same `transformers` version during training and serving
- Same image processor configuration (max resolution, etc.)
- Images are not re-encoded between training data preparation and actual training
**Token comparison for VL**
When comparing VL renderer output against HuggingFace, you need to include images in the comparison:
```python
from PIL import Image
messages = [
{"role": "user", "content": [
{"type": "image", "image": Image.open("test.png")},
{"type": "text", "text": "What's in this image?"},
]},
]
# Cookbook
cookbook_mi = renderer.build_generation_prompt(messages)
cookbook_tokens = cookbook_mi.to_ints()
# HuggingFace — use the full processor, not just tokenizer
hf_inputs = processor.apply_chat_template(
messages,
add_generation_prompt=True,
tokenize=True,
return_tensors="pt",
)
# Compare input_ids
```
Note: VL token comparison is more complex because image tokens may be represented differently. Focus on verifying that text tokens surrounding images match and that the total image token count is correct.
**Weight export for VL models**
VL models add a `model.language_model.*` prefix to language model weights. The cookbook's `weights.build_hf_model()` handles this automatically. Custom merge scripts commonly miss this prefix, causing the LoRA adapter to silently not be applied (adapter weight names don't match model weight names).
## /skills/debug/references/serialization-test.md
# Serialization Regression Test
A standalone script to test whether the user's pydantic version causes slow serialization of Tinker SDK payloads. This is the most common "invisible" performance regression — the user's training script works but each step takes minutes instead of seconds.
## When to use
- Step 3 timing shows high submit time (time to submit `forward_backward` call)
- Step 4 profiling shows time in `pydantic.model_dump()` or `__repr__`
- User has a non-standard pydantic version (beta, pre-release, or very new)
## The test script
```python
"""
Tinker payload serialization benchmark.
Tests whether pydantic model_dump() is fast on typical training payloads.
Expected: < 0.1s for BS=4, SEQ_LEN=8192.
If > 1s, the pydantic version likely has a serialization regression.
Usage:
python serialization_test.py
"""
import sys
import time
import pydantic
import torch
from tinker._compat import model_dump
from tinker._version import __version__ as tinker_version
from tinker.types import (
Datum,
EncodedTextChunk,
ForwardBackwardInput,
ForwardBackwardRequest,
ModelInput,
)
# ── Print versions ────────────────────────────────────────────────
print(f"Python: {sys.version.split()[0]}")
print(f"pydantic: {pydantic.__version__}")
print(f"tinker: {tinker_version}")
print(f"torch: {torch.__version__}")
try:
import numpy; print(f"numpy: {numpy.__version__}")
except ImportError:
print("numpy: (not installed)")
print()
# ── Build a realistic payload ─────────────────────────────────────
BS = 4
SEQ_LEN = 8192
VOCAB = 32000
data = []
for _ in range(BS):
tokens = torch.randint(0, VOCAB, (SEQ_LEN,)).tolist()
model_input = ModelInput(chunks=[EncodedTextChunk(tokens=tokens)])
weights = torch.ones(SEQ_LEN)
target_tokens = torch.randint(0, VOCAB, (SEQ_LEN,)).tolist()
data.append(
Datum(
model_input=model_input,
loss_fn_inputs={
"target_tokens": target_tokens,
"weights": weights,
},
)
)
request = ForwardBackwardRequest(
forward_backward_input=ForwardBackwardInput(
data=data,
loss_fn="cross_entropy",
loss_fn_config=None,
),
model_id="tinker://dummy/example",
seq_id=1,
)
# ── Benchmark serialization ──────────────────────────────────────
t0 = time.perf_counter()
body = model_dump(request, exclude_unset=True, mode="json")
dt = time.perf_counter() - t0
print(f"Payload: batch_size={BS}, seq_len={SEQ_LEN}, total_tokens={BS * SEQ_LEN:,}")
print(f"model_dump() took {dt:.3f}s")
print()
if dt < 0.1:
print("PASS: Serialization is fast.")
elif dt < 1.0:
print("WARNING: Serialization is slower than expected. May cause minor slowdowns.")
else:
print("FAIL: Serialization is very slow. This will cause major training slowdowns.")
print(f" Your pydantic version ({pydantic.__version__}) likely has a regression.")
print(" Fix: pip install 'pydantic<2.13' (or use the latest stable release)")
```
## Expected output (healthy)
```
Python: 3.12.9
pydantic: 2.12.5
tinker: 0.16.1
torch: 2.8.0
numpy: 2.2.5
Payload: batch_size=4, seq_len=8192, total_tokens=32,768
model_dump() took 0.007s
PASS: Serialization is fast.
```
## Example output (regression)
```
Python: 3.12.12
pydantic: 2.13.0b2
torch: 2.10.0
tinker: 0.16.1
Payload: batch_size=4, seq_len=8192, total_tokens=32,768
model_dump() took 147.231s
FAIL: Serialization is very slow. This will cause major training slowdowns.
Your pydantic version (2.13.0b2) likely has a regression.
Fix: pip install 'pydantic<2.13' (or use the latest stable release)
```
## /skills/research/SKILL.md
---
name: research
description: Conduct post-training research for LLMs using the Tinker API — replicate paper results, explore new training ideas, run and monitor experiments, and document findings. Use this skill whenever the user wants to do research, replicate experiments from a paper or repo, investigate training hypotheses, run experiment sweeps, explore post-training techniques (SFT, RL, DPO, distillation, etc.), set up training, write training code, choose a model, tune hyperparameters, manage checkpoints, export weights, or analyze training logs — even if they just say "try this idea" or "let's see what happens if...".
---
# Tinker Research
You are a researcher. This is not a tool you invoke and forget — it is a mindset that shapes everything you do in this conversation. You think carefully, you stay curious, you question your assumptions, and you never stop paying attention to what's happening.
**What this means in practice:**
- **You are always monitoring.** When an experiment is running, you don't say "I'll check back later" — you actively watch `metrics.jsonl`, check for anomalies, look at rollout transcripts, verify the process is alive. If something looks off, you investigate immediately.
- **You are always curious.** Before diving into implementation, ask: what is the state of the art here? What have others tried? What papers are relevant? Use WebSearch to find recent work. A researcher who doesn't read the literature wastes time rediscovering known results.
- **You are always skeptical.** A single good result doesn't mean you're done. A single bad result doesn't mean the approach is wrong. Look for patterns across runs. Check whether your eval actually measures what you think it measures. Question surprising results in both directions.
- **You own the full loop.** Planning, implementation, execution, monitoring, analysis, iteration — these are all your responsibility. Don't hand off any step. Don't assume the next run will work. Don't assume the config is correct because it looks right.
If you are running in a **git worktree**, stay inside it — do not `cd` to the original repo root.
---
## Research methodology
Every research task follows this arc. The methodology matters as much as the code.
### 1. Understand the problem
Before writing any code, get crystal clear on what you're investigating.
- **If replicating a paper/repo:** Read the paper carefully (use WebFetch for arXiv/PDFs). Extract the exact experimental setup: model, dataset, hyperparameters, evaluation metrics, baselines. Don't approximate — if the paper says "lr=3e-5 with cosine schedule over 3 epochs," that's what you use. Cross-reference with any released code.
- **If exploring a new idea:** Clarify the hypothesis with the user. What do we expect to happen and why? What's the simplest experiment that would give us signal?
- **Search for prior work:** Use WebSearch to find related papers, blog posts, or implementations. Someone may have already tried this. What is the current state of the art on this task? What approaches have been tried and what results did they get? What are the open questions? A 30-minute literature search can save days of wasted experiments.
**Don't make judgments too quickly.** Before running any experiment, read the actual source code — the training loop, renderer, environment, dataset builder, loss function. Don't just understand the concept; read the code and trace how data flows through each component. When replicating a paper, diff their implementation against ours. Surprises in training almost always trace back to a misunderstanding of the setup.
**Check existing recipes first.** Before writing training code from scratch, look at `tinker_cookbook/recipes/` — there are complete examples for SFT, RL, DPO, distillation, code RL, tool use, and multi-agent training. Start from the closest existing recipe and modify it rather than building from zero.
### 2. Know your models
Model type fundamentally affects how you train and what to expect.
| Type | What it is | Training implications | Examples |
|------|-----------|----------------------|----------|
| **Base** | Pre-trained only, no instruction tuning | Full post-training pipeline (SFT then RL). Has a renderer but no instruction-following behavior out of the box. | `Qwen3.5-9B-Base`, `Qwen3.5-35B-A3B-Base` |
| **Reasoning** | Trained for chain-of-thought with `<think>` blocks | Produces long reasoning traces. Need higher `max_tokens`. Training data should include thinking. | `DeepSeek-R1-Distill-Qwen-7B` |
| **Hybrid** | Supports both thinking and non-thinking modes | **Tricky:** Must use correct renderer variant. `_disable_thinking` for direct answers, default for reasoning. Wrong choice silently corrupts training. | `Qwen3-8B`, `Kimi-K2.6` |
| **Vision** | Multimodal (text + images) | Needs VL-capable renderer + image_processor. Image token count must match. | `Qwen3.6-35B-A3B`, `Qwen3.5-397B-A17B` |
**Always resolve the renderer automatically:**
```python
from tinker_cookbook import model_info
renderer_name = model_info.get_recommended_renderer_name(model_name)
```
**Cost tip:** Prefer MoE models — cost scales with active parameters. `Qwen3.6-35B-A3B` (3B active) is cheaper than `Qwen3.6-27B` (27B active) at similar quality.
For the full model lineup, read `references/models.md`. For the latest supported models (the reference file may be outdated), check https://tinker-docs.thinkingmachines.ai/tinker/models/.
### 3. Set up evaluation FIRST
Don't start training without knowing how you'll measure success. Good eval is the foundation of good research. The cookbook has a standardized benchmark framework — use it instead of writing ad-hoc eval scripts.
#### Run existing benchmarks
Check what's already available in `tinker_cookbook/eval/benchmarks/` before writing anything new:
| Benchmark | Type | Notes |
|-----------|------|-------|
| `gsm8k` | Math | Numeric extraction, float-tolerant |
| `math500` | Math | `\boxed{}` extraction |
| `aime_2025`, `aime_2026` | Math | Competition-level |
| `mmlu_pro`, `mmlu_redux` | MCQ | Multiple choice (`mmlu_redux`: A-D; `mmlu_pro`: up to ten options, A-J) |
| `gpqa` | MCQ | Gated — needs `HF_TOKEN` |
| `ifeval` | Instruction following | Constraint verification |
| `mbpp` | Code | Requires sandbox |
| `ceval`, `supergpqa`, `ifbench` | Various | See eval README |
```python
from tinker_cookbook.eval.benchmarks import run_benchmarks, BenchmarkConfig
# Run baseline eval BEFORE any training
# sampling_client from tc.save_weights_and_get_sampling_client()
results = await run_benchmarks(
["gsm8k", "mmlu_pro", "ifeval"],
sampling_client, renderer,
BenchmarkConfig(save_dir="evals/baseline"),
)
for name, result in results.items():
print(f"{name}: {result.score:.1%} ({result.num_correct}/{result.num_examples})")
```
Use `BenchmarkConfig.for_model(model_name)` to get recommended defaults (max_tokens, temperature) for your model family.
#### Evaluate during training with BenchmarkEvaluator
Don't wait until training is done — run eval inline at regular intervals:
```python
from tinker_cookbook.eval import BenchmarkEvaluator
# These run automatically every eval_every steps during training
evaluator_builders = [
lambda: BenchmarkEvaluator("gsm8k", renderer, max_examples=100),
lambda: BenchmarkEvaluator("ifeval", renderer, max_examples=50),
]
# Pass to training config:
# config = train.Config(..., evaluator_builders=evaluator_builders, eval_every=20)
# Metrics logged as: eval/gsm8k/score, eval/ifeval/score, etc.
```
#### Analyze eval results
Don't just look at aggregate scores — read the actual failures:
```python
from tinker_cookbook.eval.benchmarks import load_trajectories, print_trajectory
# Load incorrect examples to understand failure modes
wrong = load_trajectories("evals/step500", "gsm8k", incorrect_only=True)
for traj in wrong[:5]:
print(f"Expected: {traj.logs['expected']}, Got: {traj.logs['extracted']}")
print_trajectory(traj) # Full conversation with model response
```
Results are saved to `save_dir` as `trajectories.jsonl` + `result.json` per benchmark. Runs are resumable — if interrupted, completed examples are skipped on re-run (matched by content hash, not index).
#### Create new benchmarks
If your task needs a custom eval, **add it to the benchmark framework** rather than writing a standalone script. This standardizes the process and makes results comparable across experiments.
```python
# tinker_cookbook/eval/benchmarks/my_benchmark.py
from tinker_cookbook.eval.benchmarks._types import BenchmarkBuilder, BenchmarkConfig, BenchmarkResult
from tinker_cookbook.eval.benchmarks._common import build_messages, make_example_id, limit_dataset, load_benchmark_dataset
from tinker_cookbook.eval.benchmarks import register
from tinker_cookbook.rl.message_env import MessageEnv, MessageStepResult, EnvFromMessageEnv
from tinker_cookbook.renderers import get_text_content
class MyEnv(MessageEnv):
def __init__(self, question: str, expected: str, example_id: str = ""):
self.question, self.expected, self.example_id = question, expected, example_id
async def initial_observation(self):
return build_messages(self.question)
async def step(self, message):
response = get_text_content(message)
correct = self.expected.lower() in response.lower()
return MessageStepResult(
reward=1.0 if correct else 0.0, episode_done=True, next_messages=[],
metrics={"correct": float(correct)},
logs={"expected": self.expected, "output": response[:500]},
)
class MyBenchmarkBuilder(BenchmarkBuilder):
name = "my_benchmark"
recommended_system_prompt = "Answer concisely."
def make_envs(self, renderer, config):
ds = load_benchmark_dataset("my/hf_dataset", split="test")
ds = limit_dataset(ds, config.max_examples)
return [
EnvFromMessageEnv(
renderer=renderer,
message_env=MyEnv(row["q"], row["a"], make_example_id("my_benchmark", row["q"])),
failed_parse_reward=0.0, context_overflow_reward=0.0,
)
for row in ds
]
register(MyBenchmarkBuilder())
```
Key design: benchmarks reuse the same `Env` protocol as RL training — `MessageEnv` + `EnvFromMessageEnv`. Thinking token stripping, context overflow, and concurrency are handled automatically.
For the full eval API (pass@k, sandbox benchmarks, custom aggregation, EvalStore), read `references/ops.md`.
### 4. Prepare your dataset
Data formatting is the #1 source of silent bugs. Before running any real experiment, verify:
1. **Decode and inspect:** Take 3-5 training examples, decode them back to text, and read them. Do they look right?
2. **Check special tokens:** Are BOS/EOS tokens present where expected? Are role markers correct?
3. **Check training masks:** For SFT, are you training on the right tokens? (`TrainOnWhat`)
4. **Compare with reference:** If replicating, compare your formatted data against the reference implementation.
```python
# Quick data inspection
from tinker_cookbook.renderers import get_renderer
from tinker_cookbook.tokenizer_utils import get_tokenizer
tokenizer = get_tokenizer(model_name)
renderer = get_renderer(renderer_name, tokenizer)
model_input, weights = renderer.build_supervised_example(messages)
tokens = model_input.to_ints()
print(tokenizer.decode(tokens))
print("Weights:", weights[:50], "...") # Check training mask
```
### 5. Plan experiments
Write a research plan in `notes/plan.md` before running anything:
- **Research question:** What are we trying to learn?
- **Hypothesis:** What do we expect and why?
- **Experiment design:** What experiments, in what order?
- **Controls and baselines:** What do we compare against?
- **Success criteria:** How do we know if it worked?
Start with the **simplest possible experiment** — small model, small dataset, few steps — to confirm the pipeline works end to end. This catches data formatting bugs, renderer mismatches, and config errors before you waste compute.
**Verify config before launching.** After writing your training config, print/log the resolved values and confirm they match your plan — model name, renderer, learning rate, batch size, log path. A misconfigured run wastes hours silently.
### 6. Run and monitor experiments
**Commit before every experiment.** Record the commit hash so results are traceable to code.
```bash
git add -A && git commit -m "experiment: <description>"
git rev-parse HEAD # Record this in your notes
```
**Start small, scale up:**
1. First run: tiny model, few steps — verify pipeline works
2. Second run: right model, few steps — verify metrics look reasonable
3. Third run: full scale
**Run independent experiments in parallel.** Use background agents or background bash commands. This is your biggest lever for iteration speed.
**Never kill a running experiment** for code changes, rebases, or cleanups — the running process already has its code loaded in memory and will complete on its own. Only kill a run if it is clearly broken (NaN loss, wrong config).
**Active monitoring is non-negotiable.** You do not launch an experiment and wait passively. You watch it like a hawk, especially in the first few steps. This is one of the most important parts of being a researcher.
- **Immediately after launch:** Confirm the process started. Check that the first log lines appear. Verify no import errors or config errors.
- **First 5-10 steps:** Read `metrics.jsonl`. Is loss decreasing (or at least not NaN/exploding)? Do batch sizes, LR, and other logged values match your config? If anything is off, investigate now — don't wait for 100 steps.
- **Every ~10-20 steps:** Tail metrics. Look for loss spikes, gradient norm anomalies, plateaus, or unexpected KL divergence. Compare against your expectations from the plan.
- **For RL:** Read `*_rollout_summaries.jsonl` and `*_logtree.json`. Are the model's responses sensible? Is it gaming the reward? Are rewards trending up? Read actual model outputs — numbers alone don't tell you if the model is learning the right thing.
- **Eval scores during training:** If you set up `BenchmarkEvaluator` (you should), check `eval/*/score` in `metrics.jsonl`. Are scores improving? Stagnating? Degrading? Eval scores are the ground truth — loss going down doesn't mean the model is getting better at the actual task.
- **System health:** Is the process still alive? Is disk filling up (especially ramdisk)? Are there heartbeat warnings in the logs?
- **Between checks:** While waiting for results, use the time productively — read related papers, analyze previous experiment results, plan the next experiment, or review the code for potential issues.
```python
import pandas as pd
df = pd.read_json("path/to/metrics.jsonl", lines=True)
df.plot(x="progress/batch", y="env/all/reward/total")
```
If you're running multiple experiments in parallel, monitor all of them — don't let one run go unwatched while you focus on another.
### 7. Document and iterate
After each experiment, record in `notes/experiments/`:
- Experiment name, commit hash, config, key metrics
- Comparison against baselines and prior results
- Interpretation: what did we learn?
- Next steps: what changes and why
Research is iterative. Each iteration should have clear reasoning for changes. **Don't judge too quickly** — a single run is not enough to draw conclusions. Look for consistent patterns across multiple runs before making strong claims.
**Stay curious between experiments.** When results surprise you, dig into why. When results match expectations, ask what could falsify your hypothesis. When stuck, go back to the literature — someone else may have hit the same wall and found a way through.
---
## Quick start
### Environment setup
```bash
export TINKER_API_KEY=<your-key>
pip install tinker # SDK + CLI
git clone https://github.com/thinking-machines-lab/tinker-cookbook.git
cd tinker-cookbook && pip install -e . # Cookbook
```
### Verify
```python
import tinker
svc = tinker.ServiceClient()
tc = svc.create_lora_training_client(base_model="Qwen/Qwen3.5-9B-Base", rank=32)
print(tc.get_info())
```
| Variable | Purpose |
|----------|---------|
| `TINKER_API_KEY` | Required — authenticates with Tinker service |
| `HF_TOKEN` | Optional — access gated HuggingFace models (Llama, etc.) |
| `WANDB_API_KEY` | Optional — log to Weights & Biases |
---
## Training approaches
### Supervised fine-tuning (SFT)
Use SFT for instruction tuning, chat fine-tuning, or training on curated datasets.
The cookbook uses `chz` blueprints for config — they give you CLI overridability (`python script.py learning_rate=1e-4`) and serializable configs for reproducibility.
```python
import asyncio
import chz
from tinker_cookbook import cli_utils, model_info
from tinker_cookbook.recipes.chat_sl import chat_datasets
from tinker_cookbook.renderers import TrainOnWhat
from tinker_cookbook.supervised import train
from tinker_cookbook.supervised.types import ChatDatasetBuilderCommonConfig
model_name = "Qwen/Qwen3.5-9B-Base"
renderer_name = model_info.get_recommended_renderer_name(model_name)
common_config = ChatDatasetBuilderCommonConfig(
model_name_for_tokenizer=model_name,
renderer_name=renderer_name,
max_length=32768, batch_size=128,
train_on_what=TrainOnWhat.ALL_ASSISTANT_MESSAGES,
)
dataset = chat_datasets.NoRobotsBuilder(common_config=common_config)
blueprint = chz.Blueprint(train.Config).apply({
"log_path": "/tmp/tinker-examples/sft",
"model_name": model_name, "renderer_name": renderer_name,
"dataset_builder": dataset,
"learning_rate": 2e-4, "lr_schedule": "linear", "num_epochs": 1,
})
config = blueprint.make()
cli_utils.check_log_dir(config.log_path, behavior_if_exists="ask")
asyncio.run(train.main(config))
```
**Existing recipes:** `tinker_cookbook/recipes/chat_sl/` (Tulu3, NoRobots)
**Key choices:**
- `TrainOnWhat.ALL_ASSISTANT_MESSAGES` — most common for chat SFT
- `TrainOnWhat.LAST_ASSISTANT_MESSAGE` — train only on final response
- Built-in datasets: `NoRobotsBuilder`, `Tulu3Builder`
- Custom data: `FromConversationFileBuilder(file_path="data.jsonl")` — JSONL with `{"messages": [...]}`
For renderers, datasets, completers, and custom data loading, read `references/sft.md`.
### Reinforcement learning (GRPO)
Use RL for tasks with verifiable rewards: math, code, tool use, games.
```python
import asyncio
import chz
from tinker_cookbook import cli_utils, model_info
from tinker_cookbook.recipes.math_rl.math_env import Gsm8kDatasetBuilder
from tinker_cookbook.rl import train
model_name = "Qwen/Qwen3.5-9B-Base"
renderer_name = model_info.get_recommended_renderer_name(model_name)
builder = Gsm8kDatasetBuilder(
batch_size=128, group_size=16,
renderer_name=renderer_name,
model_name_for_tokenizer=model_name,
)
blueprint = chz.Blueprint(train.Config).apply({
"model_name": model_name, "renderer_name": renderer_name,
"log_path": "/tmp/tinker-examples/rl",
"dataset_builder": builder,
"learning_rate": 4e-5, "max_tokens": 256,
})
config = blueprint.make()
cli_utils.check_log_dir(config.log_path, behavior_if_exists="ask")
asyncio.run(train.main(config))
```
**Existing recipes:** `tinker_cookbook/recipes/math_rl/` (GSM8K, MATH), `tinker_cookbook/recipes/code_rl/` (DeepCoder), `tinker_cookbook/recipes/search_tool/` (Search-R1), `tinker_cookbook/recipes/harbor_rl/` (terminal tasks), `tinker_cookbook/recipes/multiplayer_rl/` (multi-agent self-play)
**How GRPO works:** For each problem, the model generates `group_size` responses. Rewards are computed, advantages are centered within each group, and the policy is updated.
**Custom environments:**
- `ProblemEnv` — single-turn answer verification (implement `get_question`, `check_answer`, `check_format`, `get_reference_answer`)
- `MessageEnv` — multi-turn interactive (implement `initial_observation`, `step`)
**Built-in environments:** `Gsm8kDatasetBuilder`, `ArithmeticDatasetBuilder`, `DeepcoderDatasetBuilder`
For the full environment protocol, multi-turn examples, and async patterns, read `references/rl.md`.
### DPO (Direct Preference Optimization)
Use DPO for aligning models with preference data (chosen/rejected pairs).
```python
from tinker_cookbook import model_info
from tinker_cookbook.preference import train_dpo
from tinker_cookbook.preference.dpo_datasets import DPODatasetBuilderFromComparisons
from tinker_cookbook.recipes.preference.datasets import HHHComparisonBuilder
from tinker_cookbook.supervised.types import ChatDatasetBuilderCommonConfig
model_name = "Qwen/Qwen3.5-9B-Base"
renderer_name = model_info.get_recommended_renderer_name(model_name)
common_config = ChatDatasetBuilderCommonConfig(
model_name_for_tokenizer=model_name,
renderer_name=renderer_name,
max_length=8192, batch_size=256,
)
# Key settings: dpo_beta=0.1, learning_rate=1e-5 (lower than SFT)
config = train_dpo.Config(
model_name=model_name, renderer_name=renderer_name,
dataset_builder=DPODatasetBuilderFromComparisons(
common_config=common_config,
comparison_builder=HHHComparisonBuilder(),
),
learning_rate=1e-5, dpo_beta=0.1,
log_path="/tmp/tinker-examples/dpo",
)
train_dpo.main(config)
```
**Existing recipes:** `tinker_cookbook/recipes/preference/` (DPO, RLHF 3-stage pipeline)
**Built-in datasets:** `HHHComparisonBuilder`, `HelpSteer3ComparisonBuilder`, `UltraFeedbackComparisonBuilder`
**RLHF pipeline** (3 stages: SFT -> Reward Model -> RL) is also supported — see `references/preferences.md` for the full pipeline.
### Knowledge distillation
Use distillation to transfer knowledge from a stronger teacher to a smaller student.
```python
import asyncio
from tinker_cookbook import model_info
from tinker_cookbook.distillation import train_on_policy
from tinker_cookbook.distillation.datasets import (
DistillationDatasetConfig, PromptOnlyDatasetBuilder, TeacherConfig,
)
student_model = "Qwen/Qwen3.5-9B-Base"
teacher_model = "Qwen/Qwen3-8B"
renderer_name = model_info.get_recommended_renderer_name(student_model)
teacher_config = TeacherConfig(base_model=teacher_model)
dataset_builder = PromptOnlyDatasetBuilder(
dataset_name="deepmath", groups_per_batch=1024, group_size=4,
model_name_for_tokenizer=student_model, renderer_name=renderer_name,
)
config = train_on_policy.Config(
dataset_configs=[DistillationDatasetConfig(
dataset_builder=dataset_builder, teacher_config=teacher_config,
groups_per_batch=1024,
)],
model_name=student_model,
renderer_name=renderer_name,
learning_rate=1e-4, lora_rank=128,
kl_penalty_coef=1.0, kl_discount_factor=0.0,
log_path="/tmp/tinker-examples/distillation",
)
asyncio.run(train_on_policy.main(config))
```
**Existing recipes:** `tinker_cookbook/recipes/distillation/` (on-policy, off-policy, multi-teacher)
**Multi-teacher:** Pass multiple `DistillationDatasetConfig` objects.
**Off-policy:** Use standard SFT on teacher-generated traces.
For the full distillation guide, read `references/distillation.md`.
---
## SDK essentials
### Async pattern (critical for throughput)
The single most important performance pattern — **submit async calls back-to-back before awaiting**:
```python
# CORRECT: overlap GPU work with CPU data prep
fb_future = tc.forward_backward_async(data=batch, loss_fn="cross_entropy")
optim_future = tc.optim_step_async(adam_params=adam_params)
next_batch = dataset.get_batch(i + 1) # Prepare while GPU works
fb_result = fb_future.result()
optim_result = optim_future.result()
# WRONG: sequential = GPU idle between calls
result = tc.forward_backward(data=batch, loss_fn="cross_entropy")
tc.optim_step(adam_params=adam_params)
```
Same for evaluation — always concurrent:
```python
import asyncio
results = await asyncio.gather(*[evaluate_one(sc, p) for p in test_problems])
# NOT: sequential for loop
```
### Core types
```
Datum
+-- model_input: ModelInput (list of token/image chunks)
+-- loss_fn_inputs: dict[str, TensorData]
```
Use helpers, not manual construction:
- `conversation_to_datum(messages, renderer, max_length, train_on_what)` — full pipeline
- `renderer.build_supervised_example(messages)` — returns (ModelInput, weights)
- `datum_from_model_input_weights(model_input, weights, max_length)` — from components
For the complete SDK API reference, read `references/sdk.md`.
### Hyperparameters
```python
from tinker_cookbook.hyperparam_utils import get_lr
lr = get_lr(model_name, is_lora=True)
```
| Training type | Typical LR | LoRA rank | Notes |
|---------------|-----------|-----------|-------|
| SFT | 1e-4 to 5e-4 | 32 | `batch_size` in tokens |
| RL | 1e-5 to 4e-5 | 32 | `group_size` 4-16 |
| DPO | ~1e-5 | 32 | Start with `dpo_beta=0.1` |
| Distillation | ~1e-4 | 128 | Higher rank helps |
For the full hyperparameter guide, read `references/hyperparams.md`.
---
## Operations
### Checkpointing
```python
from tinker_cookbook import checkpoint_utils
# Save (two types: state for resume, sampler for inference/export)
paths = await checkpoint_utils.save_checkpoint_async(
training_client=tc, name="step_100", log_path=log_path,
loop_state={"batch": 100}, kind="both",
)
# Resume
record = checkpoint_utils.get_last_checkpoint(log_path, required_key="state_path")
tc.load_state_with_optimizer(path=record.state_path)
```
### Weight export
```python
from tinker_cookbook import weights
adapter_dir = weights.download(tinker_path="tinker://run-id/sampler_weights/final", output_dir="./adapter")
weights.build_hf_model(base_model="Qwen/Qwen3-8B", adapter_path=adapter_dir,
output_path="./model", dtype="bfloat16")
weights.publish_to_hf_hub(model_path="./model", repo_id="user/my-model", private=True)
```
### CLI
```bash
tinker run list # List training runs
tinker checkpoint list --run-id <RUN_ID> # List checkpoints
tinker checkpoint download <TINKER_PATH> -o ./adapter # Download weights
tinker checkpoint push-hf <PATH> --repo user/model # Push to HuggingFace
```
For the full operations reference (checkpoints, weights, logging, evaluation), read `references/ops.md`.
For the CLI reference, read `references/cli.md`.
---
## Common pitfalls
- **Sequential API calls**: The #1 performance mistake. Always use `_async` variants and submit back-to-back before awaiting.
- **Sampler desync**: After saving weights, create a **new** SamplingClient. A stale client silently samples from old weights.
- **Renderer mismatch**: Always use `model_info.get_recommended_renderer_name()` — never hardcode.
- **LoRA LR**: LoRA needs ~10x higher LR than full fine-tuning — use `get_lr()`.
- **forward() vs forward_backward()**: `forward()` computes loss without gradients — use for eval only, never in training loops.
- **Env objects are single-use**: Always create fresh envs via builder.
- **DPO works best from SFT checkpoint**, not raw base model.
- **batch_size is in tokens**, not examples.
---
## Code references
- **SFT:** `tinker_cookbook/supervised/train.py`, `tinker_cookbook/recipes/chat_sl/`
- **RL:** `tinker_cookbook/rl/train.py`, `tinker_cookbook/recipes/math_rl/`
- **DPO:** `tinker_cookbook/preference/train_dpo.py`, `tinker_cookbook/recipes/preference/`
- **Distillation:** `tinker_cookbook/distillation/`, `tinker_cookbook/recipes/distillation/`
- **Renderers:** `tinker_cookbook/renderers/`
- **Completers:** `tinker_cookbook/completers.py`
- **Checkpoints:** `tinker_cookbook/checkpoint_utils.py`
- **Weights:** `tinker_cookbook/weights/`
- **Eval:** `tinker_cookbook/eval/`
- **Logging:** `tinker_cookbook/utils/ml_log.py`, `tinker_cookbook/utils/logtree.py`
## Reference files
- `references/sdk.md` — Complete SDK API and types
- `references/models.md` — Full model lineup
- `references/hyperparams.md` — Hyperparameter formulas and recommendations
- `references/cli.md` — CLI command reference
- `references/sft.md` — Renderers, datasets, completers
- `references/rl.md` — Environments, multi-turn, GRPO details
- `references/preferences.md` — DPO and RLHF pipelines
- `references/distillation.md` — Knowledge distillation
- `references/ops.md` — Checkpoints, weights, logging, evaluation
- `references/dev.md` — Contributing: tests, CI, new recipes
## /skills/research/references/cli.md
# Tinker CLI
The `tinker` CLI is installed with the Tinker Python SDK. Requires `TINKER_API_KEY`.
## Global options
```bash
tinker --format table # Rich table output (default)
tinker --format json # JSON output (for scripting)
```
## Training runs
```bash
tinker run list
tinker run list --limit 50
tinker run info <RUN_ID>
tinker run list --columns id,model,lora,updated,status,checkpoint
```
Available columns: `id`, `model`, `owner`, `lora`, `updated`, `status`, `checkpoint`, `checkpoint_time`.
## Checkpoints
### List and inspect
```bash
tinker checkpoint list --run-id <RUN_ID>
tinker checkpoint list # All your checkpoints
tinker checkpoint info <TINKER_PATH>
```
### Download
```bash
tinker checkpoint download <TINKER_PATH>
tinker checkpoint download <TINKER_PATH> --output ./my-adapter
tinker checkpoint download <TINKER_PATH> --force
```
### Visibility
```bash
tinker checkpoint publish <TINKER_PATH>
tinker checkpoint unpublish <TINKER_PATH>
```
### TTL (expiration)
```bash
tinker checkpoint set-ttl <TINKER_PATH> --ttl 86400
tinker checkpoint set-ttl <TINKER_PATH> --remove
```
### Delete
```bash
tinker checkpoint delete <TINKER_PATH>
tinker checkpoint delete <TINKER_PATH> -y # No confirmation
tinker checkpoint delete <PATH1> <PATH2> <PATH3> # Multiple
```
### Upload to HuggingFace Hub
```bash
tinker checkpoint push-hf <TINKER_PATH> --repo user/my-model
tinker checkpoint push-hf <TINKER_PATH> --repo user/my-model --public
tinker checkpoint push-hf <TINKER_PATH> \
--repo user/my-model --revision main \
--commit-message "Upload fine-tuned model" --create-pr --no-model-card
```
## Version
```bash
tinker version
```
## Common patterns
### Script-friendly output
```bash
tinker checkpoint list --format json | jq '.[].tinker_path'
tinker run list --format json | jq '.[].id'
```
### Typical workflow
```bash
tinker run list
tinker checkpoint list --run-id <RUN_ID>
tinker checkpoint download tinker://<RUN_ID>/sampler_weights/final -o ./adapter
tinker checkpoint push-hf tinker://<RUN_ID>/sampler_weights/final --repo user/my-model
```
## Pitfalls
- `push-hf` uploads the raw checkpoint — to publish a merged HF model, run `weights.build_hf_model()` in Python first
- `delete` is permanent and irreversible
- Checkpoint paths: `tinker://<run-id>/<type>/<checkpoint-id>`
## /skills/research/references/dev.md
# Development Reference — Contributing, Tests, CI, New Recipes
Consolidated reference for contributing to tinker-cookbook: development setup, code style, testing, CI pipelines, and new recipe scaffolding.
---
## Development setup
```bash
git clone https://github.com/thinking-machines-lab/tinker-cookbook.git
cd tinker-cookbook
uv sync --extra dev
pre-commit install
```
---
## Code style
- **Formatter/Linter:** ruff (line length: 100)
- **Type checker:** pyright
- **Pre-commit hooks** run automatically
```bash
uv run ruff check tinker_cookbook/
uv run ruff format tinker_cookbook/
uv run pyright tinker_cookbook/
pre-commit run --all-files
```
### Rules
- Explicit typing everywhere — avoid `Any` and `type: ignore`
- Builder pattern: config objects (`@chz.chz`) build runtime objects
- Config/runtime separation: configs are serializable, runtime objects are heavyweight
- Env objects are single-use (no reset)
- Dimension notation: `_P` (problems), `_G` (groups), `_T` (tokens), `_D` (datums)
- Use `safezip`, `timed`, `scope` helpers
- Use `ml_log.log_metrics` for metrics, `logtree` for transcripts
---
## PR process
1. Create a feature branch from `main`
2. Make changes with tests
3. Run `pre-commit run --all-files`
4. Open PR with clear description
CI runs pre-commit, pyright, and pytest on every PR.
---
## Testing
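There are two layers of tests: unit tests (`*_test.py`) colocated with the source code, and integration/smoke tests (`test_recipe_*.py`) under `tests/recipes/`.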
### Reference files
- `tests/helpers.py` — `run_recipe()` helper
- `tests/conftest.py` — Pytest configuration and API key handling
- `.github/workflows/pytest.yaml` — Unit test CI
- `.github/workflows/smoke-test-recipes.yaml` — Smoke test CI
- `pyproject.toml` — Pytest configuration
### Test structure
```
tinker-cookbook/
├── tinker_cookbook/
│ ├── renderers/parsing_test.py # Unit tests: *_test.py next to source
│ ├── recipes/math_rl/math_env_test.py
│ └── ...
└── tests/
├── conftest.py # Skips integration tests without API key
├── helpers.py # run_recipe() helper
└── recipes/
├── test_recipe_chat_sl.py # Integration: test_recipe_*.py
└── ...
```
### Unit tests (`*_test.py`)
Colocated with source code. No API key needed.
```bash
uv run pytest tinker_cookbook/
```
Conventions:
- File naming: `<module>_test.py` next to the code
- No network calls, no `TINKER_API_KEY`
- Fast (< 1s per test)
- Test picklability for distributed components
### Integration / smoke tests (`test_recipe_*.py`)
Live in `tests/recipes/`. Require `TINKER_API_KEY`.
```bash
uv run pytest tests/ -v -x -s
uv run pytest tests/recipes/test_recipe_chat_sl.py -v -x -s
```
Conventions:
- File naming: `tests/recipes/test_recipe_<name>.py`
- Mark with `@pytest.mark.integration`
- `run_recipe()` passes `max_steps=2` by default
- Always pass `behavior_if_log_dir_exists=delete`
- Override batch sizes to small values
### How `run_recipe()` works
1. Launches `uv run python -m <module> <args> max_steps=2`
2. Streams stdout in real time
3. Waits for clean exit within timeout (default: 1800s)
4. Fails on non-zero exit or timeout
### Pytest markers
- `@pytest.mark.integration` — Requires API key, skipped locally without it
- `@pytest.mark.slow` — Long-running tests
`tests/conftest.py` auto-skips integration tests when `TINKER_API_KEY` is not set.
---
## CI workflows
### `pytest.yaml` — Unit tests (every PR/push)
- Trigger: push to main, pull requests
- Runs: `uv run pytest tinker_cookbook/`
- Requires: `HF_TOKEN`
### `smoke-test-recipes.yaml` — Integration (daily + manual)
- Trigger: daily at 6am UTC, manual dispatch
- Runs: Each `test_recipe_*.py` in parallel (matrix strategy)
- Requires: `TINKER_API_KEY`, `HF_TOKEN`
- Timeout: 20 min per recipe
- Concurrency: 1
Adding `tests/recipes/test_recipe_<name>.py` is all that's needed — CI auto-discovers it.
---
## Creating a new recipe
### Step 1: Understand the request
Determine:
- **Recipe name**: Directory/file name under `recipes/`
- **Training type**: SL, RL, DPO, distillation, or hybrid
- **Key details**: Model, dataset, environment, reward signal
### Step 2: Read existing recipes
Before writing code, read the most relevant recipe:
- **SL**: `tinker_cookbook/recipes/sl_basic.py` and `tinker_cookbook/recipes/chat_sl/train.py`
- **RL**: `tinker_cookbook/recipes/rl_basic.py` and `tinker_cookbook/recipes/math_rl/train.py`
- **DPO**: `tinker_cookbook/recipes/preference/dpo/train.py`
- **Distillation**: `tinker_cookbook/recipes/distillation/on_policy_distillation.py`
- **Multi-turn RL**: `tinker_cookbook/recipes/harbor_rl/train.py`
### Step 3: Follow conventions
#### File structure
```
tinker_cookbook/recipes/<recipe_name>/
├── __init__.py
├── train.py # CLIConfig + cli_main
└── <env_or_data>.py # Dataset/environment definitions
```
#### Required elements
1. `@chz.chz` config class with sensible defaults
2. `model_info.get_recommended_renderer_name()` — never hardcode
3. `cli_utils.check_log_dir()` before training
4. `checkpoint_utils.resolve_renderer_name_from_checkpoint_or_default_async()` if loading checkpoints
5. Explicit typing — no `Any` or `type: ignore`
6. Auto-generated log paths
#### CLI pattern
```python
@chz.chz
class CLIConfig:
model_name: str = "Qwen/Qwen3.5-9B-Base"
learning_rate: float = 1e-4
async def cli_main(cli_config: CLIConfig):
# Build full config, call training main
if __name__ == "__main__":
cli_config = chz.entrypoint(CLIConfig)
asyncio.run(cli_main(cli_config))
```
Entry point: `python -m tinker_cookbook.recipes.<name>.train [chz overrides]`
#### Naming conventions
- Subscript suffixes: `_P` (problems), `_G` (groups), `_T` (tokens), `_D` (datums)
- Use `safezip`, `timed`, `scope` helpers
- Use `ml_log.log_metrics` for metrics, `logtree` for transcripts
### Step 4: Add tests
#### Smoke test (required)
Create `tests/recipes/test_recipe_<name>.py`:
```python
import pytest
from tests.helpers import run_recipe
@pytest.mark.integration
def test_<recipe_name>():
run_recipe(
"tinker_cookbook.recipes.<recipe_name>.train",
[
"behavior_if_log_dir_exists=delete",
# Override params for fast execution:
# "groups_per_batch=4", "group_size=2",
],
)
```
#### Unit tests (for testable components)
Place next to code: `tinker_cookbook/recipes/<name>/<component>_test.py`
### Step 5: Verify
```bash
python -c "from tinker_cookbook.recipes.<name> import train"
python -m tinker_cookbook.recipes.<name>.train --help
uv run pytest tests/recipes/test_recipe_<name>.py -v -x -s
```
## /skills/research/references/distillation.md
# Knowledge Distillation
Complete reference for on-policy, off-policy, and multi-teacher distillation.
## Key concepts
**Distillation types:**
- **On-policy** (recommended): Student generates, teacher scores via KL divergence
- **Off-policy reasoning**: SFT on teacher-generated traces (e.g., OpenThoughts3)
- **Multi-teacher**: Different teachers for different datasets
**Core abstractions:**
- `TeacherConfig(base_model, load_checkpoint_path)` — identifies the teacher model
- `PromptOnlyDatasetBuilder(dataset_name, ...)` — loads prompts (built-in: `"deepmath"`, `"tulu3"`)
- `DistillationDatasetConfig(dataset_builder, teacher_config, groups_per_batch)` — binds dataset to teacher
**Key parameters:**
- `kl_penalty_coef`: Weight of KL penalty (default 1.0). The only supervision signal.
- `kl_discount_factor`: Discount for future KL (0.0 = no discount). Increase for longer sequences.
- `group_size`: Rollouts per prompt (default 4)
- `groups_per_batch`: Prompts per batch (default 1024)
## Complete on-policy example
```python
import asyncio
import chz
from tinker_cookbook import checkpoint_utils, cli_utils
from tinker_cookbook.distillation import train_on_policy
from tinker_cookbook.distillation.datasets import (
DistillationDatasetConfig, PromptOnlyDatasetBuilder, TeacherConfig,
)
@chz.chz
class CLIConfig:
    """Command-line options for the on-policy distillation example."""

    # Model being trained; also used to pick the tokenizer for the prompt dataset.
    model_name: str = "Qwen/Qwen3.5-9B-Base"
    # Teacher model; passed to TeacherConfig(base_model=...).
    teacher_model: str = "Qwen/Qwen3-8B"
    # Name of the prompt dataset handed to PromptOnlyDatasetBuilder.
    dataset: str = "deepmath"
    # Rollouts per prompt.
    group_size: int = 4
    # Prompts per batch.
    groups_per_batch: int = 1024
    learning_rate: float = 1e-4
    max_tokens: int = 4096
    # Weight of the KL penalty.
    kl_penalty_coef: float = 1.0
    # Discount for future KL.
    kl_discount_factor: float = 0.0
    lora_rank: int = 128
    # Explicit renderer; None lets cli_main resolve it from the checkpoint or the model default.
    renderer_name: str | None = None
    # Checkpoint consulted when resolving the renderer name.
    load_checkpoint_path: str | None = None
    # Log directory; None falls back to /tmp/tinker-examples/distillation/<dataset>.
    log_path: str | None = None
    # What check_log_dir should do when the log directory already exists.
    behavior_if_log_dir_exists: cli_utils.LogdirBehavior = "ask"
    max_steps: int | None = None
async def cli_main(cli_config: CLIConfig):
    """Translate the CLI options into a distillation config and run training."""
    # An explicit renderer wins; otherwise it is resolved from the checkpoint or model default.
    resolved_renderer = (
        await checkpoint_utils.resolve_renderer_name_from_checkpoint_or_default_async(
            model_name=cli_config.model_name,
            explicit_renderer_name=cli_config.renderer_name,
            load_checkpoint_path=cli_config.load_checkpoint_path,
        )
    )

    # Bind the prompt dataset to the single teacher that scores it.
    prompts = PromptOnlyDatasetBuilder(
        dataset_name=cli_config.dataset,
        groups_per_batch=cli_config.groups_per_batch,
        group_size=cli_config.group_size,
        model_name_for_tokenizer=cli_config.model_name,
        renderer_name=resolved_renderer,
    )
    teacher = TeacherConfig(base_model=cli_config.teacher_model)
    distillation_dataset = DistillationDatasetConfig(
        dataset_builder=prompts,
        teacher_config=teacher,
        groups_per_batch=cli_config.groups_per_batch,
    )

    # Default to a per-dataset directory when no log path was supplied.
    run_dir = cli_config.log_path or f"/tmp/tinker-examples/distillation/{cli_config.dataset}"
    cli_utils.check_log_dir(run_dir, behavior_if_exists=cli_config.behavior_if_log_dir_exists)

    training_config = train_on_policy.Config(
        dataset_configs=[distillation_dataset],
        model_name=cli_config.model_name,
        renderer_name=resolved_renderer,
        learning_rate=cli_config.learning_rate,
        lora_rank=cli_config.lora_rank,
        max_tokens=cli_config.max_tokens,
        kl_penalty_coef=cli_config.kl_penalty_coef,
        kl_discount_factor=cli_config.kl_discount_factor,
        log_path=run_dir,
        max_steps=cli_config.max_steps,
    )
    await train_on_policy.main(training_config)
if __name__ == "__main__":
cli_config = chz.entrypoint(CLIConfig)
asyncio.run(cli_main(cli_config))
```
## Multi-teacher
Pass multiple `DistillationDatasetConfig` objects:
```python
config = train_on_policy.Config(
dataset_configs=[math_dataset_config, chat_dataset_config],
...
)
```
See `tinker_cookbook/recipes/distillation/on_policy_multi_teacher.py`.
## Off-policy reasoning
Use standard SFT on teacher-generated traces. See `tinker_cookbook/recipes/distillation/off_policy_reasoning.py`.
## Code references
- `tinker_cookbook/distillation/train_on_policy.py` — On-policy distillation training
- `tinker_cookbook/distillation/datasets.py` — Distillation datasets
- `tinker_cookbook/recipes/distillation/` — All distillation recipes
## /skills/research/references/hyperparams.md
# Hyperparameter Selection
Detailed guide for choosing training hyperparameters across SL, RL, DPO, and distillation.
## Reference
- `tinker_cookbook/hyperparam_utils.py` — LR formulas and calculations
## Learning rate formula
The recommended LR for LoRA:
```
LR(m) = lr_base * M_LoRA * (2000 / H_m) ^ P_m
```
Where:
- `lr_base = 5e-5`
- `M_LoRA = 10` (1 for full fine-tuning)
- `H_m` = hidden size of the model
- `P_m` = model-specific exponent (0.0775 for Qwen, 0.781 for Llama)
This formula gives <0.5% regret vs exhaustive sweeps across diverse SFT experiments.
```python
from tinker_cookbook.hyperparam_utils import get_lr
lr = get_lr("Qwen/Qwen3.5-9B-Base", is_lora=True)
```
## LoRA rank
- **Default**: 32 for most tasks
- **Higher rank** (64-128): More capacity for complex tasks or large models
- **Lower rank** (8-16): Faster, sufficient for simple adaptations
- LR is independent of LoRA rank (validated empirically)
```python
from tinker_cookbook.hyperparam_utils import get_lora_param_count
params = get_lora_param_count("Qwen/Qwen3.5-9B-Base", lora_rank=32)
```
## Batch size
### SL batch size
- Measured in **tokens**, not examples
- Start with 128
- Scale LR with the square root of the batch size: `LR ∝ sqrt(batch_size)`
- Aim for at least 100 training steps (best results with 1000+)
### RL batch size and group size
- `batch_size` / `groups_per_batch`: Number of unique problems per batch
- `group_size`: Rollouts per problem (advantages centered within group)
- `total_rollouts = batch_size * group_size`
- Start small for debugging: `groups_per_batch=4, group_size=2`
## Learning rate schedule
- `"linear"` — Linear decay to 0 (most common)
- `"cosine"` — Cosine annealing
- `"constant"` — No decay
## `num_substeps` (RL)
- `num_substeps=1` (default): One update per batch
- `num_substeps>1`: Splits batch into mini-batches. Requires PPO objective.
- Start with 2-4 if experimenting; decrease LR with higher values
## DPO-specific
- `dpo_beta=0.1` — Well-tested default
- Lower beta = more aggressive; higher beta = closer to reference
## Distillation-specific
- `kl_penalty_coef=1.0` — Weight of KL penalty from teacher
- `kl_discount_factor=0.0` — No discounting (increase for long sequences)
## Quick-start recommendations
| Scenario | Model | LR | Batch | LoRA Rank |
|----------|-------|-----|-------|-----------|
| SFT on chat data | Qwen3.5-9B-Base | `get_lr(model)` | 128 | 32 |
| Math GRPO | Qwen3.5-9B (thinking) | 4e-5 | 128x16 | 32 |
| DPO | Qwen3.5-9B-Base | 1e-5 | 256 | 32 |
| Distillation | Qwen3.5-9B-Base | 1e-4 | 1024x4 | 128 |
| Multi-turn RL | Kimi-K2.6 | 1e-5 | 8x4 | 32 |
## Pitfalls
- `get_lr()` currently only supports Llama and Qwen families
- DPO LR should be much lower than SFT (1e-5 vs 2e-4)
- RL LR should be lower than SFT — overly aggressive updates destabilize the policy
- Monitor KL divergence in RL — training is stable when KL < 0.01
## /skills/research/references/models.md
# Model Lineup
Full listing of available models with types, architecture, and sizes.
## Qwen family
| Model | Type | Arch | Size |
|-------|------|------|------|
| `Qwen/Qwen3.6-35B-A3B` | Hybrid + Vision | MoE | Medium |
| `Qwen/Qwen3.6-27B` | Hybrid + Vision | Dense | Medium |
| `Qwen/Qwen3.5-397B-A17B` | Hybrid + Vision | MoE | Large |
| `Qwen/Qwen3.5-35B-A3B` | Hybrid + Vision | MoE | Medium |
| `Qwen/Qwen3.5-35B-A3B-Base` | Base | MoE | Medium |
| `Qwen/Qwen3.5-27B` | Hybrid + Vision | Dense | Medium |
| `Qwen/Qwen3.5-9B` | Hybrid + Vision | Dense | Small |
| `Qwen/Qwen3.5-9B-Base` | Base | Dense | Small |
| `Qwen/Qwen3.5-4B` | Hybrid + Vision | Dense | Compact |
| `Qwen/Qwen3-235B-A22B-Instruct-2507` | Instruction | MoE | Large |
| `Qwen/Qwen3-30B-A3B-Instruct-2507` | Instruction | MoE | Medium |
| `Qwen/Qwen3-30B-A3B` | Hybrid | MoE | Medium |
| `Qwen/Qwen3-30B-A3B-Base` | Base | MoE | Medium |
| `Qwen/Qwen3-32B` | Hybrid | Dense | Medium |
| `Qwen/Qwen3-8B` | Hybrid | Dense | Small |
| `Qwen/Qwen3-8B-Base` | Base | Dense | Small |
| `Qwen/Qwen3-4B-Instruct-2507` | Instruction | Dense | Compact |
| `Qwen/Qwen3-VL-235B-A22B-Instruct` | Vision | MoE | Large |
| `Qwen/Qwen3-VL-30B-A3B-Instruct` | Vision | MoE | Medium |
Use the `_disable_thinking` renderer variant when you want direct instruction-following behavior from a hybrid Qwen model.
## Llama family
| Model | Type | Arch | Size |
|-------|------|------|------|
| `meta-llama/Llama-3.3-70B-Instruct` | Instruction | Dense | Large |
| `meta-llama/Llama-3.1-70B` | Base | Dense | Large |
| `meta-llama/Llama-3.1-8B` | Base | Dense | Small |
| `meta-llama/Llama-3.1-8B-Instruct` | Instruction | Dense | Small |
| `meta-llama/Llama-3.2-3B` | Base | Dense | Compact |
| `meta-llama/Llama-3.2-1B` | Base | Dense | Compact |
## Nemotron family
| Model | Type | Arch | Size |
|-------|------|------|------|
| `nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16` | Hybrid | MoE | Large |
| `nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16` | Hybrid | MoE | Medium |
## Other families
| Model | Type | Arch | Size |
|-------|------|------|------|
| `openai/gpt-oss-120b` | Reasoning | MoE | Medium |
| `openai/gpt-oss-20b` | Reasoning | MoE | Small |
| `deepseek-ai/DeepSeek-V3.1` | Hybrid | MoE | Large |
| `deepseek-ai/DeepSeek-V3.1-Base` | Base | MoE | Large |
| `moonshotai/Kimi-K2-Thinking` | Reasoning | MoE | Large |
| `moonshotai/Kimi-K2.5` | Reasoning + Vision | MoE | Large |
| `moonshotai/Kimi-K2.6` | Reasoning + Vision | MoE | Large |
## Model types explained
- **Base**: Pre-trained on raw text. For research or full post-training pipelines.
- **Instruction**: Fine-tuned for instruction following. Fast inference, no chain-of-thought.
- **Reasoning**: Always uses chain-of-thought before visible output.
- **Hybrid**: Can operate in both thinking and non-thinking modes.
- **Vision**: Processes images alongside text.
## Size categories
- **Compact**: 1B-4B parameters
- **Small**: 8B parameters
- **Medium**: 27B-32B parameters
- **Large**: 70B+ parameters
## Renderer matching
Every model needs a matching renderer. Always use automatic lookup:
```python
from tinker_cookbook import model_info
renderer_name = model_info.get_recommended_renderer_name(model_name)
```
The mapping is maintained in `tinker_cookbook/model_info.py`. Never hardcode renderer names.
## Reference
- `tinker_cookbook/model_info.py` — Model metadata and renderer mapping
## /skills/research/references/ops.md
# Operations Reference — Checkpoints, Weights, Logging, Evaluation
Consolidated reference for training lifecycle operations: checkpoint management, weight export, logging/metrics analysis, and evaluation.
---
## Checkpoints
### Reference
- `tinker_cookbook/checkpoint_utils.py` — CheckpointRecord, save/load helpers
### Two checkpoint types
| Type | Method | Purpose | Contains |
|------|--------|---------|----------|
| **State** | `save_state()` | Resume training | Weights + optimizer state |
| **Sampler** | `save_weights_for_sampler()` | Sampling / export | Weights only |
```python
tc.save_state(name="step_100", ttl_seconds=None)
tc.save_weights_for_sampler(name="step_100_sampler", ttl_seconds=None)
sc = tc.save_weights_and_get_sampling_client() # Ephemeral, not persistently saved
```
### CheckpointRecord
```python
from tinker_cookbook.checkpoint_utils import CheckpointRecord
record = CheckpointRecord(
name="step_100", batch=100, epoch=1, final=False,
state_path="tinker://...", sampler_path="tinker://...",
extra={"eval_loss": 0.5},
)
d = record.to_dict()
record = CheckpointRecord.from_dict(d)
record.has("state_path") # True
```
### Save/load helpers
```python
from tinker_cookbook import checkpoint_utils
# Save (async)
paths = await checkpoint_utils.save_checkpoint_async(
training_client=tc, name="step_100", log_path="/tmp/my_run",
loop_state={"batch": 100, "epoch": 1},
kind="both", # "state", "sampler", or "both"
ttl_seconds=None,
)
# Load checkpoint list
records = checkpoint_utils.load_checkpoints_file("/tmp/my_run")
# Get last checkpoint
record = checkpoint_utils.get_last_checkpoint("/tmp/my_run", required_key="state_path")
```
### Resuming training
```python
behavior_if_log_dir_exists: cli_utils.LogdirBehavior = "ask" # "ask", "delete", "resume"
if config.load_checkpoint_path:
tc.load_state_with_optimizer(config.load_checkpoint_path)
```
### REST API / CLI management
```python
rest = ServiceClient().create_rest_client()
checkpoints = rest.list_user_checkpoints(limit=100)
rest.publish_checkpoint_from_tinker_path("tinker://...")
rest.set_checkpoint_ttl_from_tinker_path("tinker://...", ttl_seconds=86400)
rest.delete_checkpoint_from_tinker_path("tinker://...")
```
```bash
tinker checkpoint list
tinker checkpoint publish <TINKER_PATH>
tinker checkpoint set-ttl <TINKER_PATH> --ttl 86400
tinker checkpoint delete <TINKER_PATH>
```
---
## Weights
### Reference
- `tinker_cookbook/weights/__init__.py` — API overview
- `tinker_cookbook/weights/_download.py` — Download implementation
- `tinker_cookbook/weights/_export/` — LoRA merge (full, quantized, sharded)
- `tinker_cookbook/weights/_publish.py` — HuggingFace Hub publish
### Download
```python
from tinker_cookbook import weights
adapter_dir = weights.download(
tinker_path="tinker://run-id/sampler_weights/final",
output_dir="./adapter",
base_url=None,
)
```
### Merge LoRA into base model
```python
weights.build_hf_model(
base_model="Qwen/Qwen3-8B",
adapter_path="./adapter",
output_path="./model",
dtype="bfloat16",
trust_remote_code=None,
)
```
### PEFT adapter (no merge)
Convert to PEFT format for vLLM/SGLang serving:
```python
weights.build_lora_adapter(
base_model="Qwen/Qwen3-8B",
adapter_path="./adapter",
output_path="./peft_adapter",
trust_remote_code=None,
)
```
### Publish to HuggingFace Hub
```python
url = weights.publish_to_hf_hub(
model_path="./model",
repo_id="user/my-finetuned-model",
private=True,
token=None, # Uses HF_TOKEN env var
)
```
### Full workflow
```python
from tinker_cookbook import weights
# Step 1: Download adapter
adapter_dir = weights.download(
tinker_path="tinker://run-id/sampler_weights/final",
output_dir="./adapter",
)
# Step 2: Merge LoRA into base model
weights.build_hf_model(
base_model="Qwen/Qwen3.5-35B-A3B",
adapter_path=adapter_dir,
output_path="./model",
dtype="bfloat16",
)
# Step 3: Publish to HuggingFace Hub
url = weights.publish_to_hf_hub(
model_path="./model",
repo_id="user/my-finetuned-model",
private=True,
)
```
### Pitfalls
- `download()` expects `tinker://` path from `save_weights_for_sampler`, not `save_state`
- `build_hf_model()` requires the base model to be downloadable from HuggingFace
- Set `HF_TOKEN` for private models and publishing
- `dtype="bfloat16"` is recommended for most models
---
## Logging
### Reference
- `tinker_cookbook/utils/ml_log.py` — Metrics logging API
- `tinker_cookbook/utils/logtree.py` — Logtree structured transcripts
- `tinker_cookbook/utils/trace.py` — Tracing/profiling
### Output files
**Top-level:**
| File | Format | Contents |
|------|--------|----------|
| `metrics.jsonl` | JSONL | Scalar metrics per iteration |
| `config.json` | JSON | Full training config |
| `checkpoints.jsonl` | JSONL | Checkpoint metadata |
| `code.diff` | text | Git diff at training start |
| `timing_spans.jsonl` | JSONL | Per-iteration span timing |
| `trace_events.jsonl` | JSONL | Perfetto/Chrome Trace events |
**Per-iteration** (inside `iteration_NNNNNN/`):
| File | Format | Contents |
|------|--------|----------|
| `train.html` | HTML | Human-readable logtree report |
| `train_logtree.json` | JSON | Machine-readable rollout transcripts |
| `train_rollout_summaries.jsonl` | JSONL | Per-trajectory rewards |
| `eval_<name>.html` | HTML | Eval rollout report |
| `eval_<name>_logtree.json` | JSON | Eval rollout transcripts |
| `timing_gantt.html` | HTML | Plotly Gantt chart |
### Common metric keys
**Progress:** `progress/batch`, `progress/done_frac`
**RL rewards:** `env/all/reward/total`, `env/all/<metric>`
**Health:** `entropy`, `kl_sample_train_v1` (< 0.01), `optim/lr`
**Timing:** `time/total`, `time/<name>`, `time/<name>:total`, `time/<name>:mean`
### Rollout analysis
#### Rollout summaries
```python
import json
with open("iteration_000010/train_rollout_summaries.jsonl") as f:
trajectories = [json.loads(line) for line in f]
for traj in trajectories:
print(f"reward={traj['total_reward']:.2f}, metrics={traj['trajectory_metrics']}")
```
#### Logtree JSON (full transcripts)
```python
import json
with open("iteration_000060/train_logtree.json") as f:
data = json.load(f)
groups = [c for c in data["root"]["children"]
if isinstance(c, dict) and c.get("tag") == "section"]
```
#### Extracting conversations
```python
def find_conversations(node):
    """Collect every ``data`` payload of type ``"conversation"`` under ``node``.

    The tree is walked depth-first through ``children``; anything that is not
    a dict (for example a plain string child) contributes nothing.
    """
    if not isinstance(node, dict):
        return []

    found = []
    payload = node.get("data", {})
    if isinstance(payload, dict) and payload.get("type") == "conversation":
        found.append(payload)
    for child in node.get("children", []):
        found += find_conversations(child)
    return found
```
#### Extracting tables
```python
def find_tables(node):
    """Return every dict node tagged ``"table"`` under ``node``, parents before children."""
    if not isinstance(node, dict):
        return []

    tables = [node] if node.get("tag") == "table" else []
    for child in node.get("children", []):
        tables += find_tables(child)
    return tables
def parse_table_rows(table_node):
    """Return the table's rows as lists of stripped cell text.

    Only ``tr`` rows that sit inside a ``thead`` or ``tbody`` child are read,
    and only their ``td``/``th`` cells contribute text.
    """
    parsed_rows = []
    for section in table_node.get("children", []):
        if not (isinstance(section, dict) and section.get("tag") in ("tbody", "thead")):
            continue
        for tr in section.get("children", []):
            if not isinstance(tr, dict) or tr.get("tag") != "tr":
                continue
            parsed_rows.append(
                [
                    get_text(td).strip()
                    for td in tr.get("children", [])
                    if isinstance(td, dict) and td.get("tag") in ("td", "th")
                ]
            )
    return parsed_rows
def get_text(node):
    """Concatenate all string leaves under ``node`` in document order."""
    if isinstance(node, str):
        return node
    pieces = [get_text(child) for child in node.get("children", [])]
    return "".join(pieces)
```
### Custom logging
```python
from tinker_cookbook.utils import ml_log
ml_logger = ml_log.setup_logging(log_path="/tmp/my_run", wandb_project=None, wandb_name=None)
ml_logger.log_metrics({"train/loss": 0.5, "eval/accuracy": 0.85}, step=100)
```
### Tracing & profiling
```python
from tinker_cookbook.utils import trace
trace.trace_init()
for i_batch in range(n_batches):
with trace.trace_iteration(step=i_batch) as window:
await gather_rollouts(...)
await train_step(...)
metrics.update(window.get_timing_metrics())
window.write_spans_jsonl(log_path / "timing_spans.jsonl", step=i_batch)
```
#### Instrumenting code
```python
@trace.scope
async def my_training_step(tc, batch):
result = await tc.forward_backward_async(data=batch, loss_fn="cross_entropy")
return result
async with trace.scope_span("data_prep"):
batch = prepare_next_batch(...)
```
#### Viewing Perfetto traces
```bash
uv run python -m tinker_cookbook.utils.trace trace_events.jsonl trace.json
# Open in chrome://tracing or https://ui.perfetto.dev/
```
### Weights & Biases
```python
config = train.Config(wandb_project="my-project", wandb_name="my-experiment", ...)
```
---
## Evaluation
### Reference
- `tinker_cookbook/eval/evaluators.py` — Evaluator types
- `tinker_cookbook/eval/inspect_evaluators.py` — Inspect-based evaluators
- `tinker_cookbook/eval/custom_evaluators.py` — Custom evaluator implementations
- `tinker_cookbook/supervised/nll_evaluator.py` — NLL evaluator
- `tinker_cookbook/supervised/train.py` — SL evaluator integration
- `tinker_cookbook/rl/train.py` — RL evaluator integration
### SL evaluators
Two tiers:
```python
config = supervised_train.Config(
evaluator_builders=[...], # Every eval_every steps
infrequent_evaluator_builders=[...], # Every infrequent_eval_every steps
eval_every=8,
infrequent_eval_every=50,
)
```
### RL evaluators
Uses `SamplingClientEvaluator`:
```python
async def my_evaluator(sampling_client: SamplingClient) -> dict[str, float]:
return {"accuracy": 0.85, "avg_length": 150}
config = rl_train.Config(evaluator_builders=[my_evaluator], eval_every=20)
```
### Test set evaluator
Built into `rl/train.py` via the test dataset from `RLDatasetBuilder.__call__()`:
```python
# RLDatasetBuilder.__call__() returns (train_dataset, test_dataset)
```
### Inspect AI integration
```python
from tinker_cookbook.eval.inspect_utils import InspectAPIFromTinkerSampling
evaluator = InspectAPIFromTinkerSampling(
task="gsm8k", renderer_name=renderer_name,
model_name=model_name, include_reasoning=True,
)
```
See `tinker_cookbook/recipes/chat_sl/train.py` for a working example with GSM8K and IFEval.
### Custom evaluators
#### Sampling-based
```python
async def eval_math(sampling_client: SamplingClient) -> dict[str, float]:
    """Sample one answer per test problem and report the fraction answered correctly."""

    async def is_correct(problem):
        # temperature=0.0 with a single sample per problem.
        response = await sampling_client.sample_async(
            prompt=problem.prompt,
            num_samples=1,
            sampling_params=SamplingParams(max_tokens=256, temperature=0.0),
        )
        predicted = parse_answer(response.sequences[0].tokens)
        return predicted == problem.expected

    # Evaluate all problems concurrently — sequential loops waste throughput
    outcomes = await asyncio.gather(*[is_correct(problem) for problem in test_problems])
    accuracy = sum(outcomes) / len(outcomes)
    return {"math_accuracy": accuracy}
```
#### NLL-based
Compute NLL on a held-out dataset without generating text. See the built-in evaluator in `tinker_cookbook/supervised/train.py`.
### Metrics logging
```python
from tinker_cookbook.utils.ml_log import log_metrics
log_metrics({"train/loss": 0.5, "eval/accuracy": 0.85}, step=100)
```
## /tests/__init__.py
```py path="/tests/__init__.py"
```
## /tests/compare_sampling_training_logprobs.py
```py path="/tests/compare_sampling_training_logprobs.py"
import asyncio
import logging
import time
from functools import cache
import chz
import httpx
import pandas as pd
import tinker
import torch
from tinker import AdamParams, ModelInput
from tinker_cookbook.supervised.common import datum_from_model_input_weights
@cache
def get_reference_document():
    """Return the text of PyTorch's forward_ad.py at a pinned commit.

    The result is memoized by ``functools.cache``, so the download happens at
    most once per process. Raises if the HTTP response is an error status.
    """
    pinned_url = (
        "https://raw.githubusercontent.com/pytorch/pytorch/"
        "a10b765bf159a86fb2a0ad693c6b72e0c691e60b/torch/autograd/forward_ad.py"
    )
    reply = httpx.get(pinned_url)
    reply.raise_for_status()
    return reply.text
async def get_row(
    model_name: str,
    service_client: tinker.ServiceClient,
    timeout_sec: float,
    saved_path_for_trainer: str | None = None,
    saved_path_for_sampler: str | None = None,
    ttl_seconds: int | None = 604800,
) -> dict:
    """Compare training-side and sampling-side logprobs for one model.

    Args:
        model_name: Base model used to create the LoRA training client.
        service_client: Client used to create the training client.
        timeout_sec: Upper bound, in seconds, for the whole comparison.
        saved_path_for_trainer: Optional state to load into the training
            client; when given, the warm-up training steps are skipped.
        saved_path_for_sampler: Optional weights path for the sampling client;
            when omitted, fresh state and sampler weights are saved instead.
        ttl_seconds: TTL passed to ``save_state_async`` for the saved state.

    Returns:
        ``{"model_name", "mse[sample, train]", "time"}`` on success, or
        ``{"model_name", "error"}`` if the comparison timed out.
    """

    async def _measure() -> dict:
        started = time.time()
        print(f"========== Testing {model_name} ==========")
        trainer = await service_client.create_lora_training_client_async(
            base_model=model_name
        )
        resumed = saved_path_for_trainer is not None
        if resumed:
            await trainer.load_state_async(saved_path_for_trainer)

        # Build one datum over the whole reference document; position 0 gets zero weight.
        token_ids = trainer.get_tokenizer().encode(get_reference_document())
        model_input = ModelInput.from_ints(token_ids)
        loss_weights = torch.ones(len(token_ids), dtype=torch.float32)
        loss_weights[0] = 0.0
        datum = datum_from_model_input_weights(model_input, loss_weights)

        # Take a few optimizer steps on a fresh model; a loaded state is used as-is.
        warmup_steps = 0 if resumed else 3
        for _ in range(warmup_steps):
            # Submit both requests before waiting on either result.
            pending_fwd_bwd = await trainer.forward_backward_async(
                [datum], loss_fn="cross_entropy"
            )
            pending_optim = await trainer.optim_step_async(
                adam_params=AdamParams(learning_rate=1e-3)
            )
            await pending_fwd_bwd.result_async()
            await pending_optim.result_async()

        # Logprobs as seen by the training path.
        pending_fwd = await trainer.forward_async([datum], loss_fn="cross_entropy")
        fwd_output = await pending_fwd.result_async()
        train_logprobs = fwd_output.loss_fn_outputs[0]["logprobs"].to_torch()

        if saved_path_for_sampler is not None:
            sampler = trainer.create_sampling_client(model_path=saved_path_for_sampler)
        else:
            pending_state = await trainer.save_state_async(
                name="tmp-checkpoint", ttl_seconds=ttl_seconds
            )
            saved_state = await pending_state.result_async()
            print(f"Saved state for trainer: {saved_state.path}")
            sampler = await trainer.save_weights_and_get_sampling_client_async(
                name="tmp-checkpoint"
            )

        # Logprobs as seen by the sampling path; the first entry is dropped
        # (presumably to align with the training logprobs — confirm).
        sampled = await sampler.compute_logprobs_async(model_input)
        sample_logprobs = torch.tensor(sampled[1:])
        squared_error = (sample_logprobs - train_logprobs) ** 2
        mse = squared_error.mean()

        elapsed = time.time() - started
        print(f"Time taken: {elapsed:.1f} seconds")
        row = {
            "model_name": model_name,
            "mse[sample, train]": mse.item(),
            "time": elapsed,
        }
        print(row)
        return row

    try:
        return await asyncio.wait_for(_measure(), timeout=timeout_sec)
    except TimeoutError:
        print(f"ERROR: Timeout after {timeout_sec} seconds for model {model_name}")
        return {"model_name": model_name, "error": "TimeoutError"}
@chz.chz
class Config:
    """Options for the sampling-vs-training logprob comparison script."""

    # Base URL passed to tinker.ServiceClient.
    base_url: str | None = None
    # When True (and model_names is None), print the server's supported models and exit.
    print_models: bool = False
    # Explicit models to test; None means use the models the server reports as supported.
    model_names: list[str] | None = None
    # Exclusion filter: a model is skipped if its name contains any of these substrings.
    model_name_filter: list[str] | None = chz.field(default_factory=lambda: ["loadtest"])
    # Saved state loaded into the training client; when set, warm-up training steps are skipped.
    state_for_trainer: str | None = None
    # Saved weights used to create the sampling client instead of saving fresh ones.
    state_for_sampler: str | None = None
    # TTL forwarded to get_row for the state it saves.
    ttl_seconds: int | None = 604800  # 7 days
async def main(config: Config):
    """Run the logprob comparison for every selected model and report the results.

    Results are written to ``/tmp/sampling_training_logprobs.csv`` and printed
    as a markdown table. With ``print_models`` set (and no explicit
    ``model_names``), the supported models are listed and nothing else runs.
    """
    logging.basicConfig(level=logging.INFO)
    service_client = tinker.ServiceClient(base_url=config.base_url)

    if config.model_names is not None:
        candidates = list(config.model_names)
    else:
        capabilities = await service_client.get_server_capabilities_async()
        candidates = [
            supported.model_name
            for supported in capabilities.supported_models
            if supported.model_name is not None
        ]
        if config.print_models:
            print("Available models:")
            for name in candidates:
                print(f"- {name}")
            return

    # model_name_filter is an exclusion list: drop any model whose name
    # contains one of its substrings. An empty or missing filter keeps everything.
    excluded_substrings = config.model_name_filter or []
    model_names = [
        name
        for name in sorted(candidates)
        if not any(fragment in name for fragment in excluded_substrings)
    ]
    print(f"Model names: {model_names}")

    timeout_sec = 300.0
    pending_rows = [
        get_row(
            name,
            service_client,
            timeout_sec,
            config.state_for_trainer,
            config.state_for_sampler,
            config.ttl_seconds,
        )
        for name in model_names
    ]
    rows = await asyncio.gather(*pending_rows)

    df = pd.DataFrame(rows)
    # Rows carry either the metric columns or an error column; fill in whichever
    # columns are absent so the report always has the same shape.
    required_columns = ["model_name", "mse[sample, train]", "time", "error"]
    absent = [column for column in required_columns if column not in df.columns]
    for column in absent:
        df[column] = pd.NA
    df = df[required_columns]

    df.to_csv("/tmp/sampling_training_logprobs.csv", index=False)
    print(df.to_markdown())
# Script entry point: run main() under asyncio. chz.nested_entrypoint(main)
# supplies its Config argument (presumably parsed from the command line — confirm).
if __name__ == "__main__":
    asyncio.run(chz.nested_entrypoint(main))
```
## /tests/conftest.py
```py path="/tests/conftest.py"
"""Pytest configuration for integration tests.
Recipes NOT yet covered by integration tests:
- code_rl: requires external sandbox service (SandboxFusion)
- search_tool: requires running Chroma vector DB + embedding API
- verifiers_rl: requires verifiers framework environment
- if_rl: requires if_verifiable library + IFBench data
- rubric: needs generated JSONL data (has generate_data.py script)
- rl_basic, sl_basic, rl_loop, sl_loop: standalone tutorial scripts (not full recipes)
- prompt_distillation: needs a local JSONL data file
- harbor_rl: needs Modal + downloaded Harbor tasks
"""
import os
import pytest
def pytest_collection_modifyitems(config, items):
    """Skip API-dependent tests locally when TINKER_API_KEY is unset; fail on CI.

    Tests whose path contains ``downstream_compat`` do not need an API key and
    are never skipped. With the key set, collection is left untouched.
    """
    if os.environ.get("TINKER_API_KEY"):
        return

    # Everything outside downstream_compat is treated as a smoke test that needs the key.
    needs_api_key = [
        collected for collected in items if "downstream_compat" not in str(collected.fspath)
    ]
    if not needs_api_key:
        return

    running_on_ci = os.environ.get("CI")
    if running_on_ci:
        pytest.fail("TINKER_API_KEY is not set but CI=true — smoke tests require an API key")

    skip_marker = pytest.mark.skip(
        reason="TINKER_API_KEY not set (set it or run pytest tinker_cookbook/ for unit tests)"
    )
    for collected in needs_api_key:
        collected.add_marker(skip_marker)
```
The content has been capped at 50000 tokens. The user could consider applying other filters to refine the result. The better and more specific the context, the better the LLM can follow instructions. If the context seems verbose, the user can refine the filter using uithub. Thank you for using https://uithub.com - Perfect LLM context for any GitHub repo.