File: lfm2.cpp

package info (click to toggle)
whisper.cpp 1.8.3%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 32,228 kB
  • sloc: cpp: 188,765; ansic: 121,729; lisp: 10,221; sh: 4,272; objc: 2,159; ruby: 1,682; python: 1,177; javascript: 594; makefile: 144
file content (175 lines) | stat: -rw-r--r-- 7,834 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#include "models.h"

#include "../llama-memory-hybrid.h"


// Builds the complete LFM2 forward graph into `gf`.
//
// Pipeline, as constructed below:
//   token embeddings
//   -> for every layer: RMS norm -> short-conv block or attention block
//      (selected by hparams.is_recurrent(il)) -> residual add
//      -> RMS norm -> dense or MoE feed-forward -> residual add
//   -> final RMS norm (stored in res->t_embd)
//   -> output projection (stored in res->t_logits)
//
// model  : weights and hyper-parameters used to build the graph
// params : forwarded unchanged to the llm_graph_context base class
llm_build_lfm2::llm_build_lfm2(const llama_model & model, const llm_graph_params & params) :
    llm_graph_context(params),
    model(model) {
    ggml_tensor * cur = build_inp_embd(model.tok_embd);
    cb(cur, "model.embed_tokens", -1);

    ggml_build_forward_expand(gf, cur);

    ggml_tensor * inp_pos     = build_inp_pos();
    auto *        inp_hybrid  = build_inp_mem_hybrid();
    ggml_tensor * inp_out_ids = build_inp_out_ids();

    for (int il = 0; il < n_layer; ++il) {
        // layers with index >= n_layer_dense_lead use the MoE feed-forward, earlier ones the dense one
        const bool is_moe_layer = il >= static_cast<int>(hparams.n_layer_dense_lead);

        // keep the un-normalized input for the residual connection
        auto * prev_cur = cur;
        cur             = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
        cb(cur, "model.layers.{}.operator_norm", il);

        cur = hparams.is_recurrent(il) ? build_shortconv_block(cur, inp_hybrid->get_recr(), il) :
                                         build_attn_block(cur, inp_pos, inp_hybrid->get_attn(), il);

        // on the last layer, when output ids are provided, gather only those rows
        // from both the operator output and the residual before adding them
        if (il == n_layer - 1 && inp_out_ids) {
            cur      = ggml_get_rows(ctx0, cur, inp_out_ids);
            prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids);
        }

        cur = ggml_add(ctx0, prev_cur, cur);

        auto * ffn_norm_out = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il);
        cb(ffn_norm_out, "model.layers.{}.ffn_norm", il);

        ggml_tensor * ffn_out =
            is_moe_layer ? build_moe_feed_forward(ffn_norm_out, il) : build_dense_feed_forward(ffn_norm_out, il);
        // label the feed-forward result itself, not its (already labelled) normalized input
        cb(ffn_out, "model.layers.{}.ffn_out", il);

        cur = ggml_add(ctx0, cur, ffn_out);
    }

    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
    cb(cur, "result_norm", -1);
    res->t_embd = cur;

    cur = build_lora_mm(model.output, cur);
    cb(cur, "result_output", -1);

    res->t_logits = cur;

    ggml_build_forward_expand(gf, cur);
}

// Feed-forward for a mixture-of-experts layer.
//
// Hands layer `il`'s ffn_gate_inp, ffn_{up,gate,down}_exps and ffn_exp_probs_b
// tensors to build_moe_ffn, using the SiLU activation and the gating function
// stored in hparams.expert_gating_func.
//
// cur : input tensor of the feed-forward
// il  : layer index
// returns the tensor produced by build_moe_ffn
ggml_tensor * llm_build_lfm2::build_moe_feed_forward(ggml_tensor * cur, int il) const {
    const auto & layer       = model.layers[il];
    const auto   gating_func = static_cast<llama_expert_gating_func_type>(hparams.expert_gating_func);

    ggml_tensor * moe_out = build_moe_ffn(cur,
                                          layer.ffn_gate_inp,
                                          layer.ffn_up_exps,
                                          layer.ffn_gate_exps,
                                          layer.ffn_down_exps,
                                          layer.ffn_exp_probs_b,
                                          n_expert, n_expert_used,
                                          LLM_FFN_SILU, true, false, 0.0,
                                          gating_func, il);
    return moe_out;
}

// Feed-forward for a dense (non-MoE) layer.
//
// Calls build_ffn with layer `il`'s up/gate/down weights, LLM_FFN_SILU and
// LLM_FFN_PAR. No bias tensors are passed, so the layer must not define any.
//
// cur : input tensor of the feed-forward
// il  : layer index
// returns the tensor produced by build_ffn
ggml_tensor * llm_build_lfm2::build_dense_feed_forward(ggml_tensor * cur, int il) const {
    // this path passes no bias tensors to build_ffn, so reject layers that carry them
    GGML_ASSERT(!model.layers[il].ffn_up_b);
    GGML_ASSERT(!model.layers[il].ffn_gate_b);
    GGML_ASSERT(!model.layers[il].ffn_down_b);

    const auto & layer = model.layers[il];

    ggml_tensor * ffn = build_ffn(cur,
                                  layer.ffn_up,   NULL, NULL,
                                  layer.ffn_gate, NULL, NULL,
                                  layer.ffn_down, NULL, NULL,
                                  NULL,
                                  LLM_FFN_SILU, LLM_FFN_PAR, il);
    return ffn;
}

// Attention operator for layer `il`.
//
// Projects `cur` to Q/K/V, reshapes them per head, applies RMS norm to Q and K,
// applies RoPE to Q and K using `inp_pos`, then calls build_attn with the
// layer's output projection `wo` and a scale of 1/sqrt(head size).
//
// cur      : block input
// inp_pos  : positions tensor passed to ggml_rope_ext
// inp_attn : attention input passed through to build_attn
// il       : layer index
// returns the tensor produced by build_attn
ggml_tensor * llm_build_lfm2::build_attn_block(ggml_tensor *             cur,
                                               ggml_tensor *             inp_pos,
                                               llm_graph_input_attn_kv * inp_attn,
                                               int                       il) const {
    GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il));

    const auto & layer    = model.layers[il];
    const auto   head_dim = hparams.n_embd_head_v;
    const auto   kv_heads = hparams.n_head_kv(il);

    // linear projections of the block input
    ggml_tensor * q_cur = build_lora_mm(layer.wq, cur);
    cb(q_cur, "model.layers.{}.self_attn.q_proj", il);

    ggml_tensor * k_cur = build_lora_mm(layer.wk, cur);
    cb(k_cur, "model.layers.{}.self_attn.k_proj", il);

    ggml_tensor * v_cur = build_lora_mm(layer.wv, cur);
    cb(v_cur, "model.layers.{}.self_attn.v_proj", il);

    // split the projection dimension into {head_dim, heads, n_tokens}
    q_cur = ggml_reshape_3d(ctx0, q_cur, head_dim, n_head, n_tokens);
    k_cur = ggml_reshape_3d(ctx0, k_cur, head_dim, kv_heads, n_tokens);
    v_cur = ggml_reshape_3d(ctx0, v_cur, head_dim, kv_heads, n_tokens);

    // RMS norm on queries and keys
    q_cur = build_norm(q_cur, layer.attn_q_norm, NULL, LLM_NORM_RMS, il);
    cb(q_cur, "model.layers.{}.self_attn.q_layernorm", il);

    k_cur = build_norm(k_cur, layer.attn_k_norm, NULL, LLM_NORM_RMS, il);
    cb(k_cur, "model.layers.{}.self_attn.k_layernorm", il);

    // rotary position embedding on queries and keys
    q_cur = ggml_rope_ext(ctx0, q_cur, inp_pos, nullptr,
                          n_rot, rope_type, n_ctx_orig,
                          freq_base, freq_scale, ext_factor,
                          attn_factor, beta_fast, beta_slow);
    k_cur = ggml_rope_ext(ctx0, k_cur, inp_pos, nullptr,
                          n_rot, rope_type, n_ctx_orig,
                          freq_base, freq_scale, ext_factor,
                          attn_factor, beta_fast, beta_slow);

    const float kq_scale = 1.0f / sqrtf(float(head_dim));

    ggml_tensor * attn_out = build_attn(inp_attn,
                                        layer.wo, NULL,
                                        q_cur, k_cur, v_cur,
                                        nullptr, nullptr, nullptr,
                                        kq_scale, il);
    cb(attn_out, "model.layers.{}.self_attn.out_proj", il);

    return attn_out;
}

// Short-convolution operator for layer `il`.
//
// Steps, as built below:
//   1. project `cur` with shortconv.in_proj and split the result into three
//      equal chunks b, c, x along dim 0
//   2. form bx = transpose(b * x) and prepend the stored conv state for the
//      current sequences along dim 0
//   3. write the last d_conv entries (along dim 0) of the concatenated tensor
//      back into the layer's state tensor
//   4. run ggml_ssm_conv with shortconv.conv, multiply by c, apply
//      shortconv.out_proj and reshape back to 2D
//
// cur      : block input; reshaped below to {ne0, n_seq_tokens, n_seqs}
// inp_recr : input object handed to build_rs for reading the conv state
// il       : layer index
// returns a 2D tensor {ne0, n_seq_tokens * n_seqs}
//
// Requires a ubatch with equal-length sequences (asserted below).
ggml_tensor * llm_build_lfm2::build_shortconv_block(ggml_tensor * cur, llm_graph_input_rs * inp_recr, int il) {
    // mctx is treated as a hybrid memory context; use its get_recr() sub-context for the conv state
    const auto *   mctx_cur     = static_cast<const llama_memory_hybrid_context *>(mctx)->get_recr();
    const uint32_t kv_head      = mctx_cur->get_head();
    const int64_t  n_seq_tokens = ubatch.n_seq_tokens;
    const int64_t  n_seqs       = ubatch.n_seqs;
    GGML_ASSERT(n_seqs != 0);
    GGML_ASSERT(ubatch.equal_seqs());
    GGML_ASSERT(ubatch.n_tokens == n_seq_tokens * n_seqs);

    // the stored state holds n_shortconv_l_cache - 1 entries per channel, so the cache length must exceed 1
    GGML_ASSERT(hparams.n_shortconv_l_cache > 1);
    const uint32_t d_conv = hparams.n_shortconv_l_cache - 1;

    // {n_embd, n_tokens} => {n_embd, n_seq_tokens, n_seqs}
    cur = ggml_reshape_3d(ctx0, cur, cur->ne[0], n_seq_tokens, n_seqs);

    auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur);
    cb(bcx, "model.layers.{}.conv.in_proj", il);

    // split dim 0 of the projection into three equal views: b (chunk 0), c (chunk 1), x (chunk 2)
    constexpr auto n_chunks = 3;
    GGML_ASSERT(bcx->ne[0] % n_chunks == 0);
    const auto chunk_size = bcx->ne[0] / n_chunks;
    auto *     b          = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
                                         0 * chunk_size * ggml_element_size(bcx));
    auto *     c          = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
                                         1 * chunk_size * ggml_element_size(bcx));
    auto *     x          = ggml_view_3d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->ne[2], bcx->nb[1], bcx->nb[2],
                                         2 * chunk_size * ggml_element_size(bcx));

    // element-wise product of b and x, transposed so it can be concatenated with the state along dim 0
    auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x));

    // read conv state
    auto * conv_state = mctx_cur->get_r_l(il);
    auto * conv_rs    = build_rs(inp_recr, conv_state, hparams.n_embd_r(), n_seqs);
    auto * conv       = ggml_reshape_3d(ctx0, conv_rs, d_conv, hparams.n_embd, n_seqs);

    // prepend the stored state to the new values along dim 0
    bx = ggml_concat(ctx0, conv, bx, 0);
    GGML_ASSERT(bx->ne[0] > conv->ne[0]);

    // the last d_conv entries along dim 0 form the new conv state
    auto * new_conv = ggml_view_3d(ctx0, bx, conv->ne[0], bx->ne[1], bx->ne[2], bx->nb[1], bx->nb[2],
                                   (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx));
    GGML_ASSERT(ggml_are_same_shape(conv, new_conv));

    // write the new conv state back into the state tensor, starting at slot kv_head
    // (byte offset = kv_head * d_conv * n_embd elements)
    ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv,
                                           ggml_view_1d(ctx0, conv_state, ggml_nelements(new_conv),
                                                        kv_head * d_conv * n_embd * ggml_element_size(new_conv))));

    // convolve the concatenated (state + new) sequence with the layer's conv kernel
    auto * conv_kernel = model.layers[il].shortconv.conv;
    auto * conv_out    = ggml_ssm_conv(ctx0, bx, conv_kernel);
    cb(conv_out, "model.layers.{}.conv.conv", il);

    // multiply the conv output by the c chunk, then apply the output projection
    auto * y = ggml_mul(ctx0, c, conv_out);
    y        = build_lora_mm(model.layers[il].shortconv.out_proj, y);
    cb(y, "model.layers.{}.conv.out_proj", il);
    // {n_embd, n_seq_tokens, n_seqs} => {n_embd, n_tokens}
    y = ggml_reshape_2d(ctx0, y, y->ne[0], n_seq_tokens * n_seqs);

    return y;
}