File: lua_cta.lua

package info (click to toggle)
rspamd 3.13.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 34,056 kB
  • sloc: ansic: 243,746; cpp: 105,657; javascript: 29,539; asm: 2,512; perl: 2,440; pascal: 1,625; python: 1,274; sql: 313; sh: 281; makefile: 140; xml: 74
file content (216 lines) | stat: -rw-r--r-- 8,622 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
--[[
CTA and link affiliation analysis

Purpose:
- Given a capped list of candidate links extracted in C during HTML parsing,
  compute simple affiliation scores between those links and the sender’s
  first-party domain, and pick a likely CTA (call-to-action) link.

How it is called:
- C code (message processing after HTML parsing) loads this function via
  `rspamd_lua_require_function(L, "lua_cta", "process_html_links")` and calls
  `process_html_links(task, part, ctx)` once per HTML text part.

Inputs (ctx table):
- links_total: total number of links in the part (summary; may be omitted)
- domains_total: number of distinct link domains (summary)
- max_links_single_domain: maximum links seen for a single domain (summary)
- candidates: array (capped in C, default 24) of small objects with fields:
  - host: link host (string)
  - idn, numeric, has_port, has_query, display_mismatch: booleans
  - order, part_order: integers (ordering hints)
  - weight: optional number used to rank CTA candidates (treated as 0 if
    missing or not numeric)
  - etld1: optional eTLD+1 (if not set, this module approximates from host)

Outputs (returned table):
- cta_affiliated: boolean – whether the selected CTA appears affiliated
- cta_weight: number – simple weight hint (e.g. 1.0 if display mismatch)
- affiliated_ratio: number – fraction of candidates considered affiliated
- trackerish_ratio: number – fraction of candidates that look trackerish

Configuration (rspamd.conf):
- Use the `link_affiliation { ... }` section.
- Options:
  - stopwords: map (set/regexp/glob) used to strip common tracking tokens from
               domains when computing token overlap
  - whitelist / blacklist: optional maps (set) to tweak affiliation
  - min_similarity: number (default 0.5) – Jaccard threshold for affiliation
  - max_candidates: number (default 24) – extra Lua-side cap (C caps as well)

This module keeps all heavy config logic in Lua using lua_maps and only relies
on C to provide a bounded set of safe, pre-filtered candidates.
]]
local M = {}

local lua_util = require "lua_util"
local lua_maps = require "lua_maps"
local rspamd_util = require "rspamd_util"

-- Reasonable defaults (can be overridden in rspamd.conf: link_affiliation { ... })
-- Built-in stopword list, used when `link_affiliation { stopwords = ... }` is
-- not configured. Tokens listed here are dropped from domains before the token
-- overlap between a link domain and the sender domain is computed.
-- Every entry appears exactly once (the list is loaded as a 'set' map).
local DEFAULT_STOPWORDS = {
  -- Common TLD tokens and ccTLDs
  "com", "net", "org", "info", "biz", "co", "io", "me", "us", "uk", "ru", "de", "fr", "au", "ca", "cn", "jp", "kr", "in",
  "eu",
  "es", "it", "nl", "pl", "se", "no", "fi", "dk", "cz", "sk", "pt", "tr", "gr", "hu", "ro", "bg", "ua", "by", "lt", "lv",
  "ee",
  "br", "mx", "ch", "be", "at", "ar", "cl", "pe", "tw", "th", "ph", "vn", "id", "hk", "sg", "nz",
  "za",
  "il", "ie", "is", "lu", "si", "hr", "rs", "gl", "ly",
  -- Generic / infrastructural
  "www", "web", "site", "app", "apps", "cloud", "cdn", "edge", "fastly", "akamai", "akamaihd", "edgesuite", "cloudfront",
  -- Tracking/redirect/marketing tokens
  "mail", "email", "news", "newsletter", "click", "link", "links", "go", "redir", "redirect", "rdir", "safe", "safelinks",
  "trk", "track", "tracking", "ref", "mkt", "mktg", "campaign", "promo", "offer", "offers",
  -- ESPs and bulk mailers (tokens found in their eTLD+1)
  "mailchimp", "mandrill", "sendgrid", "sparkpost", "sparkpostmail", "amazonses", "ses", "postmark", "postmarkapp",
  "mailgun",
  "sendinblue", "constantcontact", "list", "manage", "rs6", "aweber", "hubspot", "campaignmonitor", "cmail", "klaviyo",
  "sailthru",
  "drip", "convertkit", "getresponse", "mautic", "braze", "acoustic", "responsys", "eloqua", "iterable", "sendy",
  "emarsys", "mailjet",
  "mailerlite", "mailerq", "mailrelay", "mailup", "omnisend", "clickdimensions", "dotdigital", "pepipost"
}

-- Built-in whitelist, used when `link_affiliation { whitelist = ... }` is not
-- configured. Ships empty on purpose: trusted eTLD+1 domains are expected to
-- be supplied by the administrator.
local DEFAULT_WHITELIST = {}

-- Built-in blacklist, used when `link_affiliation { blacklist = ... }` is not
-- configured. Entries are eTLD+1 domains.
local DEFAULT_BLACKLIST = {
  -- Popular shorteners / redirection eTLD+1
  "t.co", "bit.ly", "goo.gl", "tinyurl.com", "lnkd.in", "buff.ly",
  "ow.ly", "rebrand.ly", "bitly.com", "is.gd", "v.gd", "t.ly",
  "cutt.ly", "shorturl.at", "reurl.cc", "rb.gy", "s.id", "trib.al",

  -- Common ESP/tracker link domains (treat as non-affiliated by default)
  "list-manage.com", "mandrillapp.com", "sendgrid.net", "sparkpostmail.com",
  "amazonses.com", "postmarkapp.com", "mailgun.org", "sendinblue.com",
  "constantcontact.com", "campaignmonitor.com", "cmail1.com", "cmail2.com",
  "aweber.com", "hubspot.com", "exacttarget.com", "clickdimensions.com",
  "eloqua.com", "responsys.net", "emarsys.net", "mailjet.com",
  "klaviyo.com", "dripemail2.com", "getresponse.com", "benchmarkemail.com",
  "omnisend.com", "mailerlite.com", "dotdigital.com",
}

-- Module options and their defaults. load_settings() merges the
-- `link_affiliation` configuration section over this table.
local settings = {
  -- Map-backed options: nil means "not configured", in which case
  -- load_settings() builds a map from the matching DEFAULT_* list.
  blacklist = nil,
  whitelist = nil,
  stopwords = nil,

  -- Lua-side cap on the number of candidates examined per HTML part
  max_candidates = 24,
  -- Jaccard similarity at or above which a link counts as affiliated
  min_similarity = 0.5,
}

--- Reads the `link_affiliation` options and prepares the lookup maps.
-- The configured options (if a global `rspamd_config` exists) are merged over
-- the current `settings`; afterwards `stopwords`, `whitelist` and `blacklist`
-- are each turned into a 'set' map via lua_maps, falling back to the built-in
-- DEFAULT_* list when the option is not set.
local function load_settings()
  local cfg = rawget(_G, 'rspamd_config')
  local opts = {}
  if cfg then
    opts = cfg:get_all_opt('link_affiliation') or {}
  end
  settings = lua_util.override_defaults(settings, opts)

  -- Returns the map to store for one option:
  --  * unset option        -> map built from `fallback`
  --  * table with get_key  -> already usable as a map, kept untouched
  --  * anything else       -> treated as a map definition and converted
  local function as_set_map(current, fallback, desc, default_desc)
    if not current then
      return lua_maps.map_add_from_ucl(fallback, 'set', default_desc)
    end
    if type(current) == 'table' and current.get_key then
      return current
    end
    return lua_maps.map_add_from_ucl(current, 'set', desc)
  end

  settings.stopwords = as_set_map(settings.stopwords, DEFAULT_STOPWORDS,
      'link affiliation stopwords',
      'link affiliation stopwords (default)')
  settings.whitelist = as_set_map(settings.whitelist, DEFAULT_WHITELIST,
      'link affiliation whitelist',
      'link affiliation whitelist (default)')
  settings.blacklist = as_set_map(settings.blacklist, DEFAULT_BLACKLIST,
      'link affiliation blacklist',
      'link affiliation blacklist (default)')
end

-- Settings are resolved once, when the module is first loaded.
load_settings()

--- Splits a domain into a set of lowercase alphanumeric tokens.
-- Tokens found in the `settings.stopwords` map (when one is available) are
-- left out.
-- @param dom domain string; nil is treated as an empty string
-- @return table used as a set: token -> true
local function etld1_tokens(dom)
  local stopwords = settings.stopwords
  local lowered = string.lower(dom or '')
  local tokens = {}

  for word in string.gmatch(lowered, "[a-z0-9]+") do
    -- keep the word when there is no stopword map or it is not listed there
    if not stopwords or not stopwords:get_key(word) then
      tokens[word] = true
    end
  end

  return tokens
end

--- Jaccard similarity of two token sets (tables keyed by token).
-- @param a first set
-- @param b second set
-- @return |a ∩ b| / |a ∪ b|, or 0 when both sets are empty
local function jaccard(a, b)
  local size_a, shared, only_in_b = 0, 0, 0

  -- Walk `a`: count its keys and how many of them are also set in `b`
  for token in pairs(a) do
    size_a = size_a + 1
    if b[token] then
      shared = shared + 1
    end
  end

  -- Walk `b`: count the keys that `a` does not have
  for token in pairs(b) do
    if not a[token] then
      only_in_b = only_in_b + 1
    end
  end

  local union = size_a + only_in_b
  if union == 0 then
    return 0
  end
  return shared / union
end

-- Returns the domain a candidate is compared by: the eTLD+1 supplied with the
-- candidate when present, otherwise one derived from its host (falling back
-- to the raw host when no eTLD+1 can be derived).
local function candidate_etld1(cand)
  local host = cand.host or ''
  return cand.etld1 or rspamd_util.get_tld(host) or host
end

-- True when `key` is found in an optional map; a missing map or an empty key
-- never matches.
local function map_contains(map, key)
  if not map or key == '' then
    return false
  end
  return map:get_key(key) and true or false
end

-- Decides whether a link domain is affiliated with the sender.
-- An explicit whitelist entry wins, then a blacklist entry, and only then the
-- token similarity against the first-party tokens is consulted. The whitelist
-- is checked first so that a domain trusted by the administrator is not
-- overruled by the built-in blacklist.
local function is_affiliated(etld1, tokens, fp_tokens)
  local key = string.lower(etld1)
  if map_contains(settings.whitelist, key) then
    return true
  end
  if map_contains(settings.blacklist, key) then
    return false
  end
  -- 0.5 mirrors the module default, used if the configured value is not numeric
  local threshold = tonumber(settings.min_similarity) or 0.5
  return jaccard(fp_tokens, tokens) >= threshold
end

-- Strict "a is a better CTA than b" ordering: higher weight first, then
-- display_mismatch, then lower order, then lower part_order. Missing or
-- non-numeric fields are normalised (weight -> 0, order/part_order -> last)
-- so the comparison never raises and stays a consistent ordering.
local function cta_before(a, b)
  local aw, bw = tonumber(a.weight) or 0, tonumber(b.weight) or 0
  if aw ~= bw then
    return aw > bw
  end

  local am = a.display_mismatch and true or false
  local bm = b.display_mismatch and true or false
  if am ~= bm then
    return am
  end

  local ao = tonumber(a.order) or math.huge
  local bo = tonumber(b.order) or math.huge
  if ao ~= bo then
    return ao < bo
  end

  local ap = tonumber(a.part_order) or math.huge
  local bp = tonumber(b.part_order) or math.huge
  return ap < bp
end

--- Computes link affiliation statistics for one HTML part and picks a CTA.
-- The first-party domain is the domain of the first MIME From address. At
-- most `settings.max_candidates` entries of `ctx.candidates` are examined;
-- the candidate list itself is never modified.
-- @param task task object (must provide `get_from`)
-- @param part HTML text part (currently unused)
-- @param ctx table with a `candidates` array; may be nil
-- @return table with `affiliated_ratio` and `trackerish_ratio` (both 0 when
--   there are no candidates) and, when at least one candidate was examined,
--   `cta_affiliated` (boolean) and `cta_weight` (1.0 if the chosen CTA has a
--   display mismatch, 0.5 otherwise)
M.process_html_links = function(task, part, ctx)
  local first_party
  local from = task:get_from('mime')
  if from and from[1] and from[1].domain then
    first_party = from[1].domain
  end
  local fp_tokens = etld1_tokens(first_party)

  local cands = (ctx and ctx.candidates) or {}
  -- 24 mirrors the module default, used if the configured cap is not numeric
  local limit = math.floor(tonumber(settings.max_candidates) or 24)
  local last = math.max(0, math.min(#cands, limit))

  local examined, affiliated, trackerish = 0, 0, 0
  local cta, cta_is_affiliated

  for i = 1, last do
    local c = cands[i]
    if type(c) == 'table' then
      examined = examined + 1

      local etld1 = candidate_etld1(c)
      local toks = etld1_tokens(etld1)
      local aff = is_affiliated(etld1, toks, fp_tokens)
      if aff then
        affiliated = affiliated + 1
      end

      -- very naive trackerish: nothing is left once stopwords are stripped
      if next(toks) == nil then
        trackerish = trackerish + 1
      end

      -- Track the best CTA in the same pass; on a full tie the earliest
      -- candidate in the input is kept.
      if cta == nil or cta_before(c, cta) then
        cta, cta_is_affiliated = c, aff
      end
    end
  end

  local res = {
    affiliated_ratio = 0,
    trackerish_ratio = 0,
  }

  if examined > 0 then
    res.affiliated_ratio = affiliated / examined
    res.trackerish_ratio = trackerish / examined
    res.cta_affiliated = cta_is_affiliated
    res.cta_weight = cta.display_mismatch and 1.0 or 0.5
  end

  return res
end

return M