File: heuristics.rb

package info (click to toggle)
ruby-github-linguist 7.27.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 14,204 kB
  • sloc: ruby: 1,872; lex: 173; ansic: 35; makefile: 9
file content (171 lines) | stat: -rw-r--r-- 4,291 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
require 'yaml'

module Linguist
  # A collection of simple heuristics that can be used to better analyze languages.
  class Heuristics
    # Internal: Upper bound passed to the slice of `blob.data` that the
    # heuristics are run against (see .call).
    # NOTE(review): `.call` applies it with `data[0...N]`; if `data` is a String
    # that slice counts characters rather than bytes — confirm this is intended.
    HEURISTICS_CONSIDER_BYTES = 50 * 1024

    # Internal: Exception classes that signal a regexp timeout.
    #
    # Regexp::TimeoutError only exists on Ruby 3.2 or later. A `rescue` clause
    # resolves the classes it names only while an exception is in flight, so
    # naming the constant directly would turn *any* error raised inside .call
    # into a NameError on older Rubies. An empty list rescues nothing.
    TIMEOUT_ERRORS = (defined?(Regexp::TimeoutError) ? [Regexp::TimeoutError] : []).freeze

    # Internal: Array of defined heuristics, filled lazily by .load.
    @heuristics = []

    # Public: Use heuristics to detect language of the blob.
    #
    # blob       - An object that quacks like a blob. It must respond to
    #              `symlink?`, `data` and `name`.
    # candidates - Array of Language objects. It is forwarded to
    #              Heuristics#matches?, which currently ignores it.
    #
    # Examples
    #
    #   Heuristics.call(FileBlob.new("path/to/file"), [
    #     Language["Ruby"], Language["Python"]
    #   ])
    #
    # Returns an Array of languages, or empty if none matched or were inconclusive.
    def self.call(blob, candidates)
      return [] if blob.symlink?
      self.load

      data = blob.data[0...HEURISTICS_CONSIDER_BYTES]

      # Only the first heuristic whose extensions fit the filename is consulted.
      heuristic = @heuristics.find { |candidate| candidate.matches?(blob.name, candidates) }
      return [] if heuristic.nil? # No heuristics matched

      Array(heuristic.call(data))
    rescue *TIMEOUT_ERRORS
      [] # Return nothing if we have a bad regexp which leads to a timeout enforced by Regexp.timeout in Ruby 3.2 or later
    end

    # Public: Get all heuristic definitions
    #
    # Returns an Array of heuristic objects.
    def self.all
      self.load
      @heuristics
    end

    # Internal: Load heuristics from 'heuristics.yml'.
    #
    # Does nothing when heuristics are already loaded. The definitions are
    # built into a local list and published in one step, so an error raised
    # while parsing (for example an invalid regexp) leaves the cache empty
    # instead of half-filled; the next call then retries the load.
    #
    # A configuration without a 'named_patterns' key is treated as having no
    # named patterns.
    def self.load
      return if @heuristics.any?

      data = self.load_config
      named_patterns = (data['named_patterns'] || {}).transform_values { |source| to_regex(source) }

      loaded = data['disambiguations'].map do |disambiguation|
        rules = disambiguation['rules']
        # Each rule hash is updated in place: its 'pattern' entry becomes the
        # matcher object that Heuristics#call later sends `match` to.
        rules.each { |rule| rule['pattern'] = parse_rule(named_patterns, rule) }
        new(disambiguation['extensions'], rules)
      end

      # Keep the same Array object; only its contents change.
      @heuristics.replace(loaded)
    end

    # Internal: Read and parse the 'heuristics.yml' file located in the same
    # directory as this source file.
    #
    # Returns whatever YAML.load_file produces for that file.
    def self.load_config
      YAML.load_file(File.expand_path("../heuristics.yml", __FILE__))
    end

    # Internal: Build the matcher for a single rule.
    #
    # named_patterns - Hash of pattern name => Regexp.
    # rule           - Hash describing the rule. The first non-nil key among
    #                  'and', 'pattern', 'negative_pattern' and 'named_pattern'
    #                  (checked in that order) decides the matcher.
    #
    # Returns an And, a Regexp, a NegativePattern, the Regexp registered under
    # the given name (nil when the name is unknown), or an AlwaysMatch when the
    # rule has none of those keys.
    def self.parse_rule(named_patterns, rule)
      unless rule['and'].nil?
        return And.new(rule['and'].map { |block| parse_rule(named_patterns, block) })
      end
      return to_regex(rule['pattern']) unless rule['pattern'].nil?
      return NegativePattern.new(to_regex(rule['negative_pattern'])) unless rule['negative_pattern'].nil?
      return named_patterns[rule['named_pattern']] unless rule['named_pattern'].nil?

      AlwaysMatch.new
    end

    # Internal: Converts a string or array of strings to regexp
    #
    # str: string or array of strings. If it is an array of strings,
    #      Regexp.union will be used.
    #
    # Returns a Regexp.
    def self.to_regex(str)
      return Regexp.new(str) unless str.kind_of?(Array)

      Regexp.union(str.map { |source| Regexp.new(source) })
    end

    # Internal
    #
    # exts  - Array of filename suffixes this heuristic applies to.
    # rules - Array of rule Hashes; each is read through its 'pattern' and
    #         'language' entries.
    def initialize(exts, rules)
      @exts = exts
      @rules = rules
    end

    # Internal: Return the heuristic's target extensions
    def extensions
      @exts
    end

    # Internal: Return the heuristic's candidate languages
    #
    # Returns an Array with one `Language[name]` lookup result per distinct
    # language named by the rules.
    def languages
      looked_up = @rules.map do |rule|
        # A rule may name a single language or a list of them.
        [rule['language']].flatten(2).map { |name| Language[name] }
      end
      looked_up.flatten.uniq
    end

    # Internal: Check if this heuristic applies to the given filename.
    #
    # filename   - String name of the file; compared case-insensitively by
    #              downcasing it first.
    # candidates - Accepted for interface compatibility but not consulted: the
    #              decision is based on the filename suffix alone.
    #
    # Returns true if the filename ends with one of the heuristic's extensions.
    def matches?(filename, candidates)
      filename = filename.downcase
      @exts.any? { |ext| filename.end_with?(ext) }
    end

    # Internal: Perform the heuristic
    #
    # data - The content handed to each rule's matcher.
    #
    # Returns the language (or Array of languages) of the first rule whose
    # pattern matches, or nil when no rule matches.
    def call(data)
      matched = @rules.find { |rule| rule['pattern'].match(data) }
      return if matched.nil?

      languages = matched['language']
      if languages.is_a?(Array)
        languages.map { |l| Language[l] }
      else
        Language[languages]
      end
    end
  end

  # Internal: Matcher that succeeds only when every wrapped matcher succeeds.
  class And
    # pats - Array of objects responding to `match(input)`.
    def initialize(pats)
      @pats = pats
    end

    # Internal: Test the input against all wrapped matchers.
    #
    # Returns true when each matcher returns a truthy value for the input
    # (trivially true for an empty list), false otherwise.
    def match(input)
      @pats.all? { |pat| pat.match(input) }
    end
  end

  # Internal: Matcher that accepts any input.
  class AlwaysMatch
    # Internal: Ignore the input and report a match.
    #
    # Returns true.
    def match(input)
      true
    end
  end

  # Internal: Matcher that inverts the result of the wrapped pattern.
  class NegativePattern
    # pat - Object responding to `match(input)`.
    def initialize(pat)
      @pat = pat
    end

    # Internal: Test that the wrapped pattern does NOT match the input.
    #
    # Returns false when the wrapped pattern returns a truthy value for the
    # input, true otherwise.
    def match(input)
      @pat.match(input) ? false : true
    end
  end
end