File: find-common-stuff

package info (click to toggle)
whatweb 0.4.8~git20161009-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 8,588 kB
  • ctags: 540
  • sloc: ruby: 33,376; sh: 612; makefile: 42
file content (86 lines) | stat: -rwxr-xr-x 2,297 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env ruby
# helper for plugin research
# identify common tags, paths / quoted text in tags, link text
# add paths, truncated paths
#

require 'getoptlong'
require 'pp'

# Tags that are subtracted from the candidate list before common tags are
# counted (see the `- ignore_tags` in the tag-counting step below).
#
# NOTE: %w splits on whitespace, so the bare `<br />` entry yields the two
# words "<br" and "/>"; only the escaped `<br\ />` entry is the single
# word "<br />".
ignore_tags = %w[
  <head> </script> </head> </title> </html> <script> </body> </td> </a>
  <html> </tr> </table> <tr> <td> </div> </p> </P> </A> <title> </li>
  </ul> </style> </span> </form> <li> <br /> <p> <ul> </h1> <span>
  </label> </strong> <strong> <div> </h2> <dt> </h3> <h2> <h3>
  <noscript> </noscript> <body> <em> </b> <b> </thead> <thead> <table>
  <br> <br\ /> </font> </em> <h1> <small> </small>
]


# One saved HTML file, reduced to the pieces this script compares across
# files: its distinct tags and the quoted strings found inside tags.
#
# Readers:
#   filename    - the path that was passed to the constructor
#   tags        - sorted, de-duplicated list of every `<...>` match
#   quoted_text - sorted, de-duplicated list of double-quoted strings
#                 (quotes included) that sit directly before a tag's
#                 closing `>`
class Webpage
  attr_reader :filename, :tags, :quoted_text

  # Reads +filename+ and extracts its tags and quoted text.
  #
  # filename - path of the file to read.
  #
  # Raises whatever File.read raises (e.g. Errno::ENOENT) if the file
  # cannot be read.
  def initialize(filename)
    @filename = filename
    # Saved pages are often not valid in the default external encoding.
    # Regexp scanning raises ArgumentError ("invalid byte sequence") on
    # such strings, so replace broken bytes before scanning.
    @contents = File.read(filename).scrub
    @tags = @contents.scan(/<[^>]+>/).sort.uniq
    # The capture group must be immediately followed by `>`, so only a
    # quoted string that ends the tag is collected (at most one per tag).
    @quoted_text = @contents.scan(/<[^>]+("[^"]+")>/).flatten.sort.uniq
  end
end

# Prints the command-line help (invocation line, the threshold option and
# a trailing blank line) to standard output.
def usage
  help_lines = [
    "Usage: find-common-stuff FILES",
    "--threshold, -t\tThe lowest % of files an item occurs in to display. Eg. 0.25 and 0.50",
    ""
  ]
  # puts writes each array element on its own line; the empty string
  # produces the final blank line.
  puts help_lines
end

# Minimum fraction of files (e.g. 0.25 or 0.50) an item must occur in to
# be reported; may be overridden with --threshold / -t.
threshold = 0.25

opts = GetoptLong.new(
  ['--help', '-h', GetoptLong::NO_ARGUMENT],
  ['--threshold', '-t', GetoptLong::REQUIRED_ARGUMENT]
)

opts.each do |opt, arg|
  case opt
  when '-t', '--threshold'
    # The option argument arrives as a String. It is later compared with
    # a Float ratio, which raises ArgumentError for a String, so convert
    # it here and reject anything that is not a number.
    begin
      threshold = Float(arg)
    rescue ArgumentError
      warn "Invalid threshold: #{arg}"
      usage
      exit 1
    end
  when '-h', '--help'
    usage
    exit
  end
end

# At least one file must be left on the command line to analyse.
if ARGV.length < 1
  usage
  exit
end

# The files to analyse are whatever is in ARGV at this point.
# (Development shortcut kept for reference: files=Dir["tests/plone/*html"])
files = ARGV

# Parse every file up front; each Webpage carries that file's tags and
# quoted text.
all = files.map { |path| Webpage.new(path) }

puts "imported #{all.size} files"

# Find the tags shared by all/most files.
# Candidates are every distinct tag seen in any file, minus ignore_tags.
all_tags_counted = {}
all_tags = all.flat_map { |w| w.tags }.uniq.sort - ignore_tags
puts "counted #{all_tags.size} tags"

all_tags.each do |tag|
  # Number of files whose tag list contains this tag.
  hits = all.count { |wp| wp.tags.include?(tag) }
  # Skip tags that occur in a smaller share of files than the threshold.
  next if hits / all.size.to_f < threshold

  all_tags_counted[tag] = hits
end

# Most widespread tags first, as [tag, file_count] pairs.
pp all_tags_counted.sort_by { |_tag, hits| hits }.reverse


# Same analysis for quoted text: every distinct quoted string seen in any
# file is a candidate.
all_quoted_text_counted = {}
all_quoted_text = all.flat_map { |w| w.quoted_text }.uniq.sort
puts "counted #{all_quoted_text.size} quoted texts"

all_quoted_text.each do |qt|
  # Number of files whose quoted-text list contains this string.
  hits = all.count { |wp| wp.quoted_text.include?(qt) }
  # Skip strings that occur in a smaller share of files than the threshold.
  next if hits / all.size.to_f < threshold

  all_quoted_text_counted[qt] = hits
end

# Most widespread strings first, as [quoted_text, file_count] pairs.
pp all_quoted_text_counted.sort_by { |_qt, hits| hits }.reverse