Twinkle Kernel Module

The Twinkle Kernel Module provides two kernel replacement paths for accelerating models during training and inference:

Layer-level kernelize: Replace entire nn.Module implementations with optimized kernels.
Function-level kernelize: Monkey-patch specific functions inside a Python module.

These two approaches can be used independently or together via a unified registration and application entry point.

Overview: Two Kernelization Paths

Path	Granularity	Typical Use Cases
Layer-level	Whole `nn.Module`	Linear / Conv / MLP / Attention
Function-level	Individual functions	Hot paths, math ops, activations

Layer-Level Kernel Replacement

When to Use

You have a complete kernel implementation for a layer
You want model-wide replacement of specific nn.Module types
Suitable for both training and inference

Example 1: Local Kernel Repo

Use this when:

Kernel implementations live in a local repository
You want to replace layers in HuggingFace or custom models

from twinkle.kernel import (
    kernelize_model,
    register_layer_kernel,
    register_external_layer,
)
from transformers import Qwen2Config, Qwen2ForCausalLM
from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP

# 1) Register the layer kernel from a local repo
#    package_name / layer_name name the Qwen2MLPTrainingKernel class that lives in
#    my_kernels/layers.py (see "Local Kernel Repo (Minimal)").
register_layer_kernel(
    kernel_name="MyAwesomeMLP",
    repo_path="/path/to/local/repo",
    package_name="my_kernels",
    layer_name="Qwen2MLPTrainingKernel",
    device="cuda",
    mode="train",
)

# 2) Bind external layer to kernel name
#    The second argument is the kernel_name registered in step 1.
register_external_layer(Qwen2MLP, "MyAwesomeMLP")

# 3) Build the model and apply kernelization
#    NOTE(review): the dimensions below look intentionally small so the example is
#    cheap to instantiate — adjust for a real model.
config = Qwen2Config(
    hidden_size=128,
    num_hidden_layers=1,
    num_attention_heads=4,
    num_key_value_heads=4,
    intermediate_size=256,
    use_cache=False,
)
model = Qwen2ForCausalLM(config)
# mode/device repeat the values used at registration in step 1; the return value
# is assigned back to `model`.
# NOTE(review): use_fallback=True presumably keeps the original layer when no
# registered kernel applies — confirm against the kernelize_model API.
model = kernelize_model(model, mode="train", device="cuda", use_fallback=True)

Example 2: Hub Kernel Repo

Use this when:

The kernel is hosted in a Hub repository and referenced by its repo_id

import torch
import torch.nn as nn
from twinkle.kernel import (
    kernelize_model,
    register_layer_kernel,
    register_external_layer,
)

# 1) Define the custom layer
class SiluAndMul(nn.Module):
    """Gated activation: SiLU of the first chunk multiplied by the second chunk."""

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Split ``x`` into two chunks along the last dim and return ``silu(a) * b``."""
        gate, value = torch.chunk(x, 2, dim=-1)
        activated = nn.functional.silu(gate)
        return activated * value

# 2) Register the Hub kernel and bind the layer
#    Unlike the local-repo example (repo_path + package_name), a Hub-hosted
#    kernel is identified by repo_id.
register_layer_kernel(
    kernel_name="SiluAndMulKernel",
    repo_id="kernels-community/activation",
    layer_name="SiluAndMul",
    device="cuda",
    mode="train",
)
# Bind the SiluAndMul class defined in step 1 to the kernel_name registered above.
register_external_layer(SiluAndMul, "SiluAndMulKernel")

# 3) Apply to a model
class SimpleModel(nn.Module):
    """Minimal model whose only submodule is a ``SiluAndMul`` activation."""

    def __init__(self):
        super().__init__()
        # Stored under the attribute name ``activation``.
        self.activation = SiluAndMul()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the result of applying ``self.activation`` to ``x``."""
        activated = self.activation(x)
        return activated

# Instantiate the model, then apply the registered kernels; the value returned
# by kernelize_model is assigned back to `model`.
model = SimpleModel()
model = kernelize_model(model, mode="train", device="cuda", use_fallback=True)

Local Kernel Repo (Minimal)

A local kernel repository is a regular Python package. At minimum, it only needs a layers.py file for layer-level kernels.

# Repo layout:
my_kernels/                  # Local kernel repository (Python package)
├── __init__.py              # Package entry
└── layers.py                # Layer-level kernel implementations

# my_kernels/__init__.py
from . import layers
__all__ = ["layers"]

# my_kernels/layers.py
import torch
import torch.nn as nn

class Qwen2MLPTrainingKernel(nn.Module):
    """Example layer kernel: a gated MLP forward pass.

    Only ``forward`` is defined. ``gate_proj``, ``up_proj``, ``down_proj`` and
    ``act_fn`` are not created here, so they must already exist on the instance.
    NOTE(review): presumably they come from the Qwen2MLP module this kernel
    replaces — confirm against the kernelization mechanism.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return ``down_proj(act_fn(gate_proj(x)) * up_proj(x))``."""
        gate_out, up_out = self.gate_proj(x), self.up_proj(x)
        hidden = self.act_fn(gate_out) * up_out
        return self.down_proj(hidden)

Function-Level Kernel Replacement

When to Use

You only need to accelerate a small number of hot functions
Replacing the entire layer is unnecessary or impractical
Common for math ops, activations, or utility functions

Example 1: Batch Registration (Simple Case)

from twinkle.kernel import register_kernels, kernelize_model

# 1) Register function kernels
#    Entries live under the "functions" key of the config dict.
#    NOTE(review): the entry key ("add") is presumably the name of the function
#    to patch inside target_module — confirm against register_kernels.
config = {
    "functions": {
        "add": {
            "target_module": "my_pkg.math_ops",
            # Replacement callable supplied inline.
            "func_impl": lambda x, y: x + y + 1,
            "device": "cuda",
            "mode": "inference",
        },
    },
}
register_kernels(config)

# 2) Apply (model can be None when only functions are used)
#    mode/device repeat the values given in the registration entry above.
kernelize_model(model=None, mode="inference", device="cuda", use_fallback=True)

Example 2: Advanced Function Sources (Full Control)

Use this when:

Different functions come from different sources (impl / repo / hub) or need compile/backward flags.

from twinkle.kernel.function import (
    register_function_kernel,
    apply_function_kernel,
)
import torch.nn as nn
from twinkle.kernel import kernelize_model

TARGET_MODULE = "my_pkg.math_ops"

# 1) Direct implementation
def fast_add(x, y):
    """Return ``x + y + 1``.

    NOTE(review): the extra ``+ 1`` presumably makes the patched function
    distinguishable from a plain add in this example — confirm intent.
    """
    total = x + y
    return total + 1

# Register fast_add as the replacement for "add" in TARGET_MODULE; func_impl
# passes the Python callable directly (the "impl" source).
register_function_kernel(
    func_name="add",
    target_module=TARGET_MODULE,
    func_impl=fast_add,
    device="cuda",
    mode="inference",
)

# 2) Repo object (FuncRepositoryProtocol)
class MyFuncRepo:
    """Example repo object whose ``load`` returns the ``MyKernelFunc`` class."""

    def load(self):
        """Return the ``MyKernelFunc`` class itself (not an instance)."""
        # Resolved at call time, so MyKernelFunc may be defined after this class.
        kernel_cls = MyKernelFunc
        return kernel_cls

class MyKernelFunc(nn.Module):
    """Example kernel module that multiplies its two inputs."""

    def forward(self, x, y):
        """Return ``x * y``."""
        product = x * y
        return product

# Register "mul" from a repo object: an instance is passed via repo=, and its
# load() returns the MyKernelFunc class.
# NOTE(review): this entry uses mode="compile" while step 4 applies with
# mode="inference" — confirm whether "mul" is expected to be patched there.
register_function_kernel(
    func_name="mul",
    target_module=TARGET_MODULE,
    repo=MyFuncRepo(),
    device="cuda",
    mode="compile",
)

# 3) Hub repo
#    NOTE(review): target_module here is "my_pkg.activations", not TARGET_MODULE,
#    so the step-4 call below (which targets TARGET_MODULE) presumably does not
#    patch this function — confirm.
register_function_kernel(
    func_name="silu_and_mul",
    target_module="my_pkg.activations",
    repo_id="kernels-community/activation",
    revision="main",  # or version="0.1.0"
    device="cuda",
    mode="inference",
)

# 4) Apply function kernels
#    NOTE(review): strict=False presumably tolerates functions that cannot be
#    patched instead of raising — confirm against apply_function_kernel.
applied = apply_function_kernel(
    target_module=TARGET_MODULE,
    device="cuda",
    mode="inference",
    strict=False,
)
# The return value is printed as the "patched" result.
print("patched:", applied)

# 5) Optional: unified entry via kernelize_model
model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
kernelize_model(model=model, mode="inference", device="cuda", use_fallback=True)

Unified Layer + Function Batch Registration

When to Use

Framework-level integration
A single configuration entry point is preferred
Managing both layer and function kernels together

from twinkle.kernel import register_kernels, kernelize_model
import torch.nn as nn

# 1) Register layer + function kernels
#    One dict carries both kinds: layer kernels under "layers", function
#    kernels under "functions".
config = {
    "layers": {
        # Hub-hosted layer kernel (repo_id + version).
        "linear": {
            "repo_id": "kernels-community/linear",
            "layer_name": "Linear",
            "version": "0.1.0",
            "device": "cuda",
            "mode": "train",
        },
        # Local-repo layer kernel (repo_path + package_name).
        # NOTE(review): no "mode" is given here — confirm the default.
        "conv2d": {
            "repo_path": "/path/to/local/repo",
            "package_name": "my_kernels",
            "layer_name": "Conv2d",
            "device": "cuda",
        },
    },
    "functions": {
        # Inline implementation.
        "add": {
            "target_module": "my_pkg.math_ops",
            "func_impl": lambda x, y: x + y + 1,
            "device": "cuda",
            "mode": "inference",
        },
        # Hub-hosted function kernel pinned by revision.
        # NOTE(review): no "mode" is given here either — confirm the default.
        "relu": {
            "target_module": "my_pkg.activations",
            "repo_id": "kernels-community/activation",
            "revision": "main",
            "device": "cuda",
        },
    },
}
register_kernels(config)

# 2) Apply via kernelize_model
#    NOTE(review): this call uses mode="train", while the "add" entry above is
#    registered with mode="inference" — confirm which entries are applied.
model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
kernelize_model(model=model, mode="train", device="cuda", use_fallback=True)