# Twinkle Kernel Module

The Twinkle Kernel Module provides two kernel replacement paths for accelerating models during training and inference:

* **Layer-level kernelize**
  Replace entire `nn.Module` implementations with optimized kernels.
* **Function-level kernelize**
  Monkey-patch specific functions inside a Python module.

These two approaches can be used independently, or together through a unified entry point: register kernels with `register_kernels` and apply them with `kernelize_model`.

---

## Overview: Two Kernelization Paths

| Path           | Granularity          | Typical Use Cases                |
| -------------- | -------------------- | -------------------------------- |
| Layer-level    | Whole `nn.Module`    | Linear / Conv / MLP / Attention  |
| Function-level | Individual functions | Hot paths, math ops, activations |

---

## Layer-Level Kernel Replacement

### When to Use

* You have a complete kernel implementation for a layer
* You want model-wide replacement of specific `nn.Module` types
* Suitable for both training and inference

---

### Example 1: Local Kernel Repo

Use this when:

* Kernel implementations live in a local repository
* You want to replace layers in Hugging Face or custom models

```python
from twinkle.kernel import (
    kernelize_model,
    register_layer_kernel,
    register_external_layer,
)
from transformers import Qwen2Config, Qwen2ForCausalLM
from transformers.models.qwen2.modeling_qwen2 import Qwen2MLP

# 1) Register the layer kernel from a local repo
register_layer_kernel(
    kernel_name="MyAwesomeMLP",
    repo_path="/path/to/local/repo",
    package_name="my_kernels",
    layer_name="Qwen2MLPTrainingKernel",
    device="cuda",
    mode="train",
)

# 2) Bind external layer to kernel name
register_external_layer(Qwen2MLP, "MyAwesomeMLP")

# 3) Build the model and apply kernelization
config = Qwen2Config(
    hidden_size=128,
    num_hidden_layers=1,
    num_attention_heads=4,
    num_key_value_heads=4,
    intermediate_size=256,
    use_cache=False,
)
model = Qwen2ForCausalLM(config)
model = kernelize_model(model, mode="train", device="cuda", use_fallback=True)
```

---

### Example 2: Hub Kernel Repo

Use this when:

* The kernel is hosted in a Hub repository (referenced by `repo_id`)

```python
import torch
import torch.nn as nn
from twinkle.kernel import (
    kernelize_model,
    register_layer_kernel,
    register_external_layer,
)

# 1) Define the custom layer
class SiluAndMul(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x1, x2 = x.chunk(2, dim=-1)
        return nn.functional.silu(x1) * x2

# 2) Register the Hub kernel and bind the layer
register_layer_kernel(
    kernel_name="SiluAndMulKernel",
    repo_id="kernels-community/activation",
    layer_name="SiluAndMul",
    device="cuda",
    mode="train",
)
register_external_layer(SiluAndMul, "SiluAndMulKernel")

# 3) Apply to a model
class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.activation = SiluAndMul()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.activation(x)

model = SimpleModel()
model = kernelize_model(model, mode="train", device="cuda", use_fallback=True)
```

---

## Local Kernel Repo (Minimal)

A local kernel repository is a regular Python package.
Beyond the package's `__init__.py`, it only needs a `layers.py` file for layer-level kernels.

```text
# Repo layout:
my_kernels/                  # Local kernel repository (Python package)
├── __init__.py              # Package entry
└── layers.py                # Layer-level kernel implementations

```

```python
# my_kernels/__init__.py
from . import layers
__all__ = ["layers"]

# my_kernels/layers.py
import torch
import torch.nn as nn

class Qwen2MLPTrainingKernel(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        gate = self.gate_proj(x)
        up = self.up_proj(x)
        return self.down_proj(self.act_fn(gate) * up)
```

---

## Function-Level Kernel Replacement

### When to Use

* You only need to accelerate a small number of hot functions
* Replacing the entire layer is unnecessary or impractical
* Common for math ops, activations, or utility functions

---

### Example 1: Batch Registration (Simple Case)

```python
from twinkle.kernel import register_kernels, kernelize_model

# 1) Register function kernels
config = {
    "functions": {
        "add": {
            "target_module": "my_pkg.math_ops",
            "func_impl": lambda x, y: x + y + 1,
            "device": "cuda",
            "mode": "inference",
        },
    },
}
register_kernels(config)

# 2) Apply (model can be None when only functions are used)
kernelize_model(model=None, mode="inference", device="cuda", use_fallback=True)
```

---

### Example 2: Advanced Function Sources (Full Control)

Use this when:

* Different functions come from different sources (impl / repo / hub) or need compile/backward flags.

```python
from twinkle.kernel.function import (
    register_function_kernel,
    apply_function_kernel,
)
import torch.nn as nn
from twinkle.kernel import kernelize_model

TARGET_MODULE = "my_pkg.math_ops"

# 1) Direct implementation
def fast_add(x, y):
    return x + y + 1

register_function_kernel(
    func_name="add",
    target_module=TARGET_MODULE,
    func_impl=fast_add,
    device="cuda",
    mode="inference",
)

# 2) Repo object (FuncRepositoryProtocol)
class MyFuncRepo:
    def load(self):
        return MyKernelFunc

class MyKernelFunc(nn.Module):
    def forward(self, x, y):
        return x * y

register_function_kernel(
    func_name="mul",
    target_module=TARGET_MODULE,
    repo=MyFuncRepo(),
    device="cuda",
    mode="compile",
)

# 3) Hub repo
register_function_kernel(
    func_name="silu_and_mul",
    target_module="my_pkg.activations",
    repo_id="kernels-community/activation",
    revision="main",  # or version="0.1.0"
    device="cuda",
    mode="inference",
)

# 4) Apply function kernels
applied = apply_function_kernel(
    target_module=TARGET_MODULE,
    device="cuda",
    mode="inference",
    strict=False,
)
print("patched:", applied)

# 5) Optional: unified entry via kernelize_model
model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
kernelize_model(model=model, mode="inference", device="cuda", use_fallback=True)
```

---

## Unified Layer + Function Batch Registration

### When to Use

* Framework-level integration
* A single configuration entry point is preferred
* Managing both layer and function kernels together

```python
from twinkle.kernel import register_kernels, kernelize_model
import torch.nn as nn

# 1) Register layer + function kernels
config = {
    "layers": {
        "linear": {
            "repo_id": "kernels-community/linear",
            "layer_name": "Linear",
            "version": "0.1.0",
            "device": "cuda",
            "mode": "train",
        },
        "conv2d": {
            "repo_path": "/path/to/local/repo",
            "package_name": "my_kernels",
            "layer_name": "Conv2d",
            "device": "cuda",
        },
    },
    "functions": {
        "add": {
            "target_module": "my_pkg.math_ops",
            "func_impl": lambda x, y: x + y + 1,
            "device": "cuda",
            "mode": "inference",
        },
        "relu": {
            "target_module": "my_pkg.activations",
            "repo_id": "kernels-community/activation",
            "revision": "main",
            "device": "cuda",
        },
    },
}
register_kernels(config)

# 2) Apply via kernelize_model
model = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
kernelize_model(model=model, mode="train", device="cuda", use_fallback=True)
```