File: lua_fuzzy_html.lua

package info (click to toggle)
rspamd 3.14.3-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 35,064 kB
  • sloc: ansic: 247,728; cpp: 107,741; javascript: 31,385; perl: 3,089; asm: 2,512; pascal: 1,625; python: 1,510; sh: 589; sql: 313; makefile: 195; xml: 74
file content (97 lines) | stat: -rw-r--r-- 2,822 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
--[[
HTML Fuzzy Hashing Helper Module

This module provides helper functions for HTML fuzzy hash matching
and phishing detection based on HTML structure vs. content mismatches.

Use case: Detect phishing where HTML structure matches legitimate emails
but CTA (Call-To-Action) domains are different.
]]

local exports = {}
local lua_util = require "lua_util"

-- Highest numeric `score` among the given result entries (0 when there is none).
-- Entries whose `score` is missing or not a number are skipped instead of
-- raising a comparison error.
local function max_numeric_score(results)
  local best = 0
  for _, res in ipairs(results) do
    local score = res.score
    if type(score) == 'number' and score > best then
      best = score
    end
  end
  return best
end

--[[
Compare the text and HTML fuzzy matches of a message and report the case
where only one kind of hash matched.

- Text matched, no HTML match at all, and the best text score is above 0.7:
  returns half of that score together with an explanation
  (possible CTA substitution).
- HTML matched, no text match: nothing is scored; a debug message is emitted
  because this is treated as ordinary template variation.
- Everything else (both kinds matched, nothing matched, weak text match):
  returns 0, nil.

CTA domains are not inspected here: the verdict is derived only from the
`type` and `score` fields of the supplied results.

@param task passed to lua_util.debugm as the logging context
@param fuzzy_results optional array of entries with a `type` field
       ('html' or 'txt'; other types are ignored) and a numeric `score`;
       nil is treated as an empty list
@return phishing_score (number), explanation (string, or nil when score is 0)
]]
exports.check_html_text_mismatch = function(task, fuzzy_results)
  local html_matches = {}
  local text_matches = {}

  -- Separate HTML and text fuzzy matches
  for _, res in ipairs(fuzzy_results or {}) do
    if res.type == 'html' then
      html_matches[#html_matches + 1] = res
    elseif res.type == 'txt' then
      text_matches[#text_matches + 1] = res
    end
  end

  -- Phishing scenario: text is recognised but the HTML part is not
  if #text_matches > 0 and #html_matches == 0 then
    local max_text_score = max_numeric_score(text_matches)

    -- Only a strong text match is considered suspicious
    if max_text_score > 0.7 then
      return max_text_score * 0.5, string.format(
        "Text fuzzy match (%.2f) without HTML match - possible CTA substitution",
        max_text_score)
    end
  end

  -- Inverse scenario: HTML match but no text match
  -- (expected for newsletters/notifications, so it is only logged)
  if #html_matches > 0 and #text_matches == 0 then
    lua_util.debugm('fuzzy_html', task,
      'HTML match (%.2f) without text match - likely template variation',
      max_numeric_score(html_matches))
  end

  return 0, nil
end

--[[
Check if message has suspicious HTML fuzzy pattern:
- Known legitimate HTML structure (HTML score above 0.8)
- But text content is unknown or barely matches (no text result, or text
  score below 0.3)
- Useful for brand protection

Example: Amazon email template with phishing text

A result whose `score` is missing or not a number is never compared:
an unusable HTML score yields 0, nil, and an unusable text score is treated
the same as having no text match.

@param task not used by this function
@param html_fuzzy_result HTML fuzzy result with a numeric `score`, or nil
@param text_fuzzy_result text fuzzy result with a numeric `score`, or nil
@return hijack_score (number), explanation (string, or nil when score is 0)
]]
exports.check_brand_hijack = function(task, html_fuzzy_result, text_fuzzy_result)
  if not html_fuzzy_result then
    return 0, nil
  end

  -- Without a strong, numeric HTML match there is no known template to hijack
  local html_score = html_fuzzy_result.score
  if type(html_score) ~= 'number' or not (html_score > 0.8) then
    return 0, nil
  end

  -- A sufficiently familiar text means the message looks like the real thing
  local text_score = text_fuzzy_result and text_fuzzy_result.score
  if type(text_score) == 'number' and not (text_score < 0.3) then
    return 0, nil
  end

  return html_score * 0.6,
    string.format("Known HTML template (%.2f) with unfamiliar text - possible brand hijacking",
      html_score)
end

-- Hand the table of helper functions back to whoever loads this module
return exports