File: samples.rb

package info (click to toggle)
ruby-github-linguist 7.27.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 14,204 kB
  • sloc: ruby: 1,872; lex: 173; ansic: 35; makefile: 9
file content (114 lines) | stat: -rw-r--r-- 3,141 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
begin
  require 'yajl'
rescue LoadError
  require 'json'
end

require 'linguist/sha256'
require 'linguist/classifier'
require 'linguist/shebang'

module Linguist
  # Model for accessing classifier training data.
  module Samples
    # Path to samples root directory
    ROOT = File.expand_path("../../../samples", __FILE__)

    # Path for serialized samples db
    PATH = File.expand_path('../samples.json', __FILE__)

    # Hash of serialized samples object, cached in memory
    def self.cache
      @cache ||= load_samples
    end

    # Hash of serialized samples object, uncached
    def self.load_samples
      serializer = defined?(Yajl) ? Yajl : JSON
      serializer.load(File.read(PATH, encoding: 'utf-8'))
    end

    # Public: Iterate over each sample.
    #
    # &block - Yields Sample to block
    #
    # Returns nothing.
    def self.each(&block)
      Dir.entries(ROOT).sort!.each do |category|
        next if category == '.' || category == '..'

        dirname = File.join(ROOT, category)
        Dir.entries(dirname).each do |filename|
          next if filename == '.' || filename == '..'

          if filename == 'filenames'
            Dir.entries(File.join(dirname, filename)).each do |subfilename|
              next if subfilename == '.' || subfilename == '..'

              yield({
                :path    => File.join(dirname, filename, subfilename),
                :language => category,
                :filename => subfilename
              })
            end
          else
            path = File.join(dirname, filename)
            extname = File.extname(filename)

            yield({
              :path     => path,
              :language => category,
              :interpreter => Shebang.interpreter(File.read(path)),
              :extname  => extname.empty? ? nil : extname
            })
          end
        end
      end

      nil
    end

    # Public: Build Classifier from all samples.
    #
    # Returns trained Classifier.
    def self.data
      db = {}
      db['extnames'] = {}
      db['interpreters'] = {}
      db['filenames'] = {}

      each do |sample|
        language_name = sample[:language]

        if sample[:extname]
          db['extnames'][language_name] ||= []
          if !db['extnames'][language_name].include?(sample[:extname])
            db['extnames'][language_name] << sample[:extname]
            db['extnames'][language_name].sort!
          end
        end

        if sample[:interpreter]
          db['interpreters'][language_name] ||= []
          if !db['interpreters'][language_name].include?(sample[:interpreter])
            db['interpreters'][language_name] << sample[:interpreter]
            db['interpreters'][language_name].sort!
          end
        end

        if sample[:filename]
          db['filenames'][language_name] ||= []
          db['filenames'][language_name] << sample[:filename]
          db['filenames'][language_name].sort!
        end

        data = File.read(sample[:path])
        Classifier.train!(db, language_name, data)
      end

      db['sha256'] = Linguist::SHA256.hexdigest(db)

      db
    end
  end
end