1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171
|
require 'yaml'
module Linguist
# A collection of simple heuristics that can be used to better analyze languages.
class Heuristics
  # Internal: Exclusive upper bound of the `0...N` slice taken from
  # `blob.data` before it is handed to the heuristic patterns.
  HEURISTICS_CONSIDER_BYTES = 50 * 1024

  # Internal: Array of defined heuristics. Filled lazily by `load`.
  @heuristics = []

  # Public: Use heuristics to detect language of the blob.
  #
  # blob       - An object that quacks like a blob (responds to `symlink?`,
  #              `data` and `name`).
  # candidates - Array of Language objects.
  #
  # Examples
  #
  #   Heuristics.call(FileBlob.new("path/to/file"), [
  #     Language["Ruby"], Language["Python"]
  #   ])
  #
  # Only the first heuristic whose extensions match the blob name is
  # consulted; if none of its rules match, the result is empty.
  #
  # Returns an Array of languages, or empty if none matched or were inconclusive.
  def self.call(blob, candidates)
    return [] if blob.symlink?

    load # Heuristics.load (defined below), not Kernel#load
    data = blob.data[0...HEURISTICS_CONSIDER_BYTES]

    heuristic = @heuristics.find { |candidate| candidate.matches?(blob.name, candidates) }
    return [] if heuristic.nil? # No heuristics matched

    Array(heuristic.call(data))
  rescue Regexp::TimeoutError
    [] # Return nothing if we have a bad regexp which leads to a timeout enforced by Regexp.timeout in Ruby 3.2 or later
  end

  # Public: Get all heuristic definitions
  #
  # Returns an Array of heuristic objects.
  def self.all
    load
    @heuristics
  end

  # Internal: Load heuristics from 'heuristics.yml'.
  #
  # Does nothing when heuristics are already loaded. The new heuristics are
  # collected in a local Array and only appended to the registry once the
  # whole configuration has been parsed, so an error raised halfway through
  # (for example an invalid regular expression) cannot leave a partially
  # filled registry behind that later calls would mistake for a complete one.
  def self.load
    return if @heuristics.any?

    data = load_config
    named_patterns = data['named_patterns'].transform_values { |source| to_regex(source) }

    loaded = data['disambiguations'].map do |disambiguation|
      rules = disambiguation['rules']
      # Each rule's 'pattern' entry is replaced by its compiled matcher.
      rules.each { |rule| rule['pattern'] = parse_rule(named_patterns, rule) }
      new(disambiguation['extensions'], rules)
    end

    @heuristics.concat(loaded)
  end

  # Internal: Read and parse the 'heuristics.yml' file that sits next to
  # this source file.
  #
  # Returns the parsed YAML document.
  def self.load_config
    YAML.load_file(File.expand_path("../heuristics.yml", __FILE__))
  end

  # Internal: Build the matcher object for a single rule.
  #
  # named_patterns - Hash of pattern name => Regexp.
  # rule           - Hash describing the rule. The keys are checked in this
  #                  order: 'and', 'pattern', 'negative_pattern',
  #                  'named_pattern'.
  #
  # Returns an object responding to `match(input)`: an And, a Regexp, a
  # NegativePattern, the entry looked up in named_patterns (nil if the name
  # is unknown), or an AlwaysMatch when none of the keys is present.
  def self.parse_rule(named_patterns, rule)
    if !rule['and'].nil?
      And.new(rule['and'].map { |block| parse_rule(named_patterns, block) })
    elsif !rule['pattern'].nil?
      to_regex(rule['pattern'])
    elsif !rule['negative_pattern'].nil?
      NegativePattern.new(to_regex(rule['negative_pattern']))
    elsif !rule['named_pattern'].nil?
      named_patterns[rule['named_pattern']]
    else
      AlwaysMatch.new
    end
  end

  # Internal: Converts a string or array of strings to regexp
  #
  # str: string or array of strings. If it is an array of strings,
  #      Regexp.union will be used.
  #
  # Returns a Regexp.
  def self.to_regex(str)
    return Regexp.new(str) unless str.is_a?(Array)

    Regexp.union(str.map { |source| Regexp.new(source) })
  end

  # Internal
  #
  # exts  - Array of filename suffixes this heuristic applies to.
  # rules - Array of rule Hashes; each is read through its 'language' and
  #         'pattern' keys.
  def initialize(exts, rules)
    @exts = exts
    @rules = rules
  end

  # Internal: Return the heuristic's target extensions
  def extensions
    @exts
  end

  # Internal: Return the heuristic's candidate languages
  #
  # Returns an Array of unique Language lookups, in rule order.
  def languages
    names = @rules.flat_map { |rule| [rule['language']].flatten(2) }
    names.map { |name| Language[name] }.uniq
  end

  # Internal: Check if this heuristic applies to the given filename.
  #
  # filename   - String filename; compared after downcasing.
  # candidates - Accepted to keep the signature stable; the decision is made
  #              on the filename suffix alone.
  #
  # Returns true if the filename ends with one of the heuristic's extensions.
  def matches?(filename, candidates)
    lowered = filename.downcase
    @exts.any? { |ext| lowered.end_with?(ext) }
  end

  # Internal: Perform the heuristic
  #
  # data - The content to run the rules' patterns against.
  #
  # Returns the Language (or Array of Languages) of the first rule whose
  # pattern matches, or nil if no rule matches.
  def call(data)
    matched = @rules.find { |rule| rule['pattern'].match(data) }
    return if matched.nil?

    names = matched['language']
    names.is_a?(Array) ? names.map { |name| Language[name] } : Language[names]
  end
end
# Internal: Composite matcher that succeeds only when every one of its
# sub-patterns matches.
class And
  # pats - Array of objects responding to `match(input)`.
  def initialize(pats)
    @pats = pats
  end

  # input - The value handed to each sub-pattern's `match`.
  #
  # Returns true if all sub-patterns match (also true for an empty list),
  # false otherwise.
  def match(input)
    @pats.all? { |pat| pat.match(input) }
  end
end
# Internal: Matcher that accepts any input.
class AlwaysMatch
  # input - Ignored.
  #
  # Returns true.
  def match(input)
    true
  end
end
# Internal: Matcher that inverts the result of the pattern it wraps.
class NegativePattern
  # pat - An object responding to `match(input)`.
  def initialize(pat)
    @pat = pat
  end

  # input - The value handed to the wrapped pattern's `match`.
  #
  # Returns true when the wrapped pattern does not match, false when it does.
  def match(input)
    @pat.match(input) ? false : true
  end
end
end
|