About

The d9d.module.block.ffn package implements standard dense Feed-Forward networks used in Transformer blocks.

Features

SwiGLU

SwiGLU is a gated Feed-Forward network layer of the kind used in LLaMA-style Transformer blocks.

It uses the efficient fused SiLU-Mul kernel from the Liger Kernel project.
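A minimal usage sketch (the import path and tensor sizes here are illustrative, and since the Liger kernel is Triton-based it generally needs to run on a GPU):

import torch

from d9d.module.block.ffn.swiglu import SwiGLU

# Illustrative sizes; real configurations set their own dimensions.
ffn = SwiGLU(hidden_size=512, intermediate_size=2048).cuda()

x = torch.randn(2, 16, 512, device="cuda")  # (batch_size, seq_len, hidden_dim)
y = ffn(x)                                   # output shape matches the input: (2, 16, 512)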

d9d.module.block.ffn

SwiGLU

Bases: Module, ModuleLateInit

Implements the SwiGLU Feed-Forward Network (FFN).

This module applies the gated activation function: down(SiLU(gate(x)) * up(x)). It corresponds to the standard MLP block used in architectures like LLaMA.
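For reference, the same computation can be written in plain PyTorch. This is an unfused sketch of the formula above, not the fused Liger kernel the module actually calls:

import torch
import torch.nn.functional as F


def swiglu_reference(
        x: torch.Tensor,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear
) -> torch.Tensor:
    # down(SiLU(gate(x)) * up(x)), expressed with separate elementwise ops
    # instead of the fused SiLU-Mul kernel.
    return down_proj(F.silu(gate_proj(x)) * up_proj(x))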

Source code in d9d/module/block/ffn/swiglu.py
class SwiGLU(nn.Module, ModuleLateInit):
    """
    Implements the SwiGLU Feed-Forward Network (FFN).

    This module applies the gated activation function: `down(SiLU(gate(x)) * up(x))`.
    It corresponds to the standard MLP block used in architectures like LLaMA.
    """

    def __init__(
            self,
            hidden_size: int,
            intermediate_size: int
    ):
        """
        Constructs a SwiGLU object.

        Args:
            hidden_size: The hidden dim size.
            intermediate_size: The intermediate dim size of the FFN.
        """

        super().__init__()
        self.gate_proj = nn.Linear(hidden_size, intermediate_size)
        self.up_proj = nn.Linear(hidden_size, intermediate_size)
        self.down_proj = nn.Linear(intermediate_size, hidden_size)

    def forward(
            self,
            x: torch.Tensor
    ) -> torch.Tensor:
        """
        Applies the SwiGLU FFN to the input.

        Args:
            x: Input tensor. Shape: `(batch_size, seq_len, hidden_dim)`.

        Returns:
            Output tensor. Shape: `(batch_size, seq_len, hidden_dim)`.
        """

        return self.down_proj(
            LigerSiLUMulFunction.apply(
                self.gate_proj(x),
                self.up_proj(x)
            )
        )

    def reset_parameters(self):
        """Resets module parameters."""

        self.gate_proj.reset_parameters()
        self.up_proj.reset_parameters()
        self.down_proj.reset_parameters()

__init__(hidden_size, intermediate_size)

Constructs a SwiGLU object.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| hidden_size | int | The hidden dim size. | required |
| intermediate_size | int | The intermediate dim size of the FFN. | required |

Source code in d9d/module/block/ffn/swiglu.py
def __init__(
        self,
        hidden_size: int,
        intermediate_size: int
):
    """
    Constructs a SwiGLU object.

    Args:
        hidden_size: The hidden dim size.
        intermediate_size: The intermediate dim size of the FFN.
    """

    super().__init__()
    self.gate_proj = nn.Linear(hidden_size, intermediate_size)
    self.up_proj = nn.Linear(hidden_size, intermediate_size)
    self.down_proj = nn.Linear(intermediate_size, hidden_size)
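For intuition, the three projections created here have the following weight shapes (nn.Linear stores its weight as (out_features, in_features); the concrete sizes below are just an example):

from d9d.module.block.ffn.swiglu import SwiGLU

ffn = SwiGLU(hidden_size=512, intermediate_size=2048)

print(ffn.gate_proj.weight.shape)  # torch.Size([2048, 512])
print(ffn.up_proj.weight.shape)    # torch.Size([2048, 512])
print(ffn.down_proj.weight.shape)  # torch.Size([512, 2048])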

forward(x)

Applies the SwiGLU FFN to the input.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| x | Tensor | Input tensor. Shape: `(batch_size, seq_len, hidden_dim)`. | required |

Returns:

| Type | Description |
| --- | --- |
| Tensor | Output tensor. Shape: `(batch_size, seq_len, hidden_dim)`. |

Source code in d9d/module/block/ffn/swiglu.py
def forward(
        self,
        x: torch.Tensor
) -> torch.Tensor:
    """
    Applies the SwiGLU FFN to the input.

    Args:
        x: Input tensor. Shape: `(batch_size, seq_len, hidden_dim)`.

    Returns:
        Output tensor. Shape: `(batch_size, seq_len, hidden_dim)`.
    """

    return self.down_proj(
        LigerSiLUMulFunction.apply(
            self.gate_proj(x),
            self.up_proj(x)
        )
    )

reset_parameters()

Resets module parameters.

Source code in d9d/module/block/ffn/swiglu.py
def reset_parameters(self):
    """Resets module parameters."""

    self.gate_proj.reset_parameters()
    self.up_proj.reset_parameters()
    self.down_proj.reset_parameters()