1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165
|
package goose
import (
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/fatih/set"
)
// VideoExtractor gathers the videos embedded in an HTML document.
// Use NewVideoExtractor to obtain an instance with its sets initialised.
type VideoExtractor struct {
	// article and config are not read by the extraction helpers in this
	// part of the file; presumably they are populated by the caller
	// (TODO confirm).
	article *Article
	config  Configuration
	// candidates is consulted by getObjectTag, which drops an object's
	// child embed selection from it; movies accumulates the videos that
	// GetVideos found and is what GetVideos returns.
	candidates, movies *set.Set
}
// video describes a single embedded video found in a page.
type video struct {
	// embedType is the tag name of the element the video came from and
	// provider is the matching entry of videoProviders ("" when none).
	embedType, provider string
	// width and height are the element's numeric size attributes (0 when
	// absent or not parseable).
	width, height int
	// embedCode is the value produced by getEmbedCode for the element and
	// src is the location of the video.
	embedCode, src string
}
// NewVideoExtractor builds a VideoExtractor whose candidates and movies sets
// are freshly created with the set.ThreadSafe option. The article and config
// fields are left at their zero values.
func NewVideoExtractor() VideoExtractor {
	var extractor VideoExtractor
	extractor.candidates = set.New(set.ThreadSafe).(*set.Set)
	extractor.movies = set.New(set.ThreadSafe).(*set.Set)
	return extractor
}
var (
	// videoTags lists the HTML elements that GetVideos searches for.
	videoTags = [...]string{"iframe", "embed", "object", "video"}
	// videoProviders lists the substrings getProvider looks for in a
	// video source URL, in lookup order.
	videoProviders = [...]string{"youtube", "vimeo", "dailymotion", "kewego"}
)
// getEmbedCode returns the combined text content of node and its
// descendants, as reported by goquery's Text.
//
// NOTE(review): this is text content, not HTML markup; if the embed code is
// meant to be the element's markup this needs a different call — confirm.
func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string {
	code := node.Text()
	return code
}
// getWidth reads the node's "width" attribute as an integer. It returns 0
// when the attribute is missing.
func (ve *VideoExtractor) getWidth(node *goquery.Selection) int {
	raw, found := node.Attr("width")
	if !found {
		return 0
	}
	// The conversion error is deliberately discarded: whatever number
	// strconv.Atoi hands back alongside it is returned unchanged.
	width, _ := strconv.Atoi(raw)
	return width
}
// getHeight reads the node's "height" attribute as an integer. It returns 0
// when the attribute is missing.
func (ve *VideoExtractor) getHeight(node *goquery.Selection) int {
	raw, found := node.Attr("height")
	if !found {
		return 0
	}
	// The conversion error is deliberately discarded: whatever number
	// strconv.Atoi hands back alongside it is returned unchanged.
	height, _ := strconv.Atoi(raw)
	return height
}
// getSrc returns the node's "src" attribute, or the empty string when the
// attribute is not present.
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string {
	src, found := node.Attr("src")
	if !found {
		return ""
	}
	return src
}
// getProvider returns the first entry of videoProviders that occurs as a
// substring of src. The match is case-sensitive. It returns "" when src is
// empty or no provider matches.
func (ve *VideoExtractor) getProvider(src string) string {
	if src == "" {
		return ""
	}
	for i := range videoProviders {
		if name := videoProviders[i]; strings.Contains(src, name) {
			return name
		}
	}
	return ""
}
// getVideo builds a video from the attributes of node: its src, the provider
// derived from that src, its width and height, its embed code and the tag
// name of its first element. node must match at least one element, because
// the first element is accessed directly.
func (ve *VideoExtractor) getVideo(node *goquery.Selection) video {
	var result video
	result.src = ve.getSrc(node)
	result.provider = ve.getProvider(result.src)
	result.embedCode = ve.getEmbedCode(node)
	result.embedType = node.Get(0).DataAtom.String()
	result.width = ve.getWidth(node)
	result.height = ve.getHeight(node)
	return result
}
// getIFrame builds a video from an <iframe> node. It needs no special
// handling and simply delegates to getVideo.
func (ve *VideoExtractor) getIFrame(node *goquery.Selection) video {
	iframe := ve.getVideo(node)
	return iframe
}
// getVideoTag handles <video> nodes. It currently ignores node and always
// returns the zero-value video, whose src is empty.
func (ve *VideoExtractor) getVideoTag(node *goquery.Selection) video {
	var empty video
	return empty
}
// getEmbedTag builds a video from an <embed> node.
//
// When the embed's parent element is an <object>, the embed node is handed to
// getObjectTag instead of being treated as a standalone video; otherwise it
// is processed by getVideo.
func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video {
	// Parent never returns nil; a node without a parent yields an empty
	// selection, on which Get(0) would panic. Guard on the length instead.
	if parent := node.Parent(); parent.Length() > 0 {
		if parent.Get(0).DataAtom.String() == "object" {
			return ve.getObjectTag(node)
		}
	}
	return ve.getVideo(node)
}
// getObjectTag builds a video from an <object> node.
//
// The video source is taken from the value of a descendant
// <param name="movie">. The zero-value video is returned when no such param
// exists or when its value does not match any known provider. Otherwise the
// node is processed by getVideo and the result's provider and src are
// overwritten with the ones derived from the param.
func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video {
	// Drop the object's child embed selection from the candidates, if it is
	// registered there.
	if embed := node.Find("embed"); ve.candidates.Has(embed) {
		ve.candidates.Remove(embed)
	}

	movieParam := node.Find(`param[name="movie"]`)
	hasMovieParam := movieParam != nil && movieParam.Length() > 0
	if !hasMovieParam {
		return video{}
	}

	movieURL, _ := movieParam.Attr("value")
	providerName := ve.getProvider(movieURL)
	if providerName == "" {
		return video{}
	}

	result := ve.getVideo(node)
	result.provider = providerName
	result.src = movieURL
	return result
}
// GetVideos scans doc for every element named in videoTags, converts each
// one to a video with the tag-specific helper, and adds those with a
// non-empty src to the extractor's movies set.
//
// It returns the movies set itself, which keeps accumulating across calls on
// the same extractor.
func (ve *VideoExtractor) GetVideos(doc *goquery.Document) *set.Set {
	var nodes *goquery.Selection
	for _, videoTag := range videoTags {
		found := doc.Find(videoTag)
		if nodes == nil {
			nodes = found
			continue
		}
		// Union returns a new selection and leaves its receiver untouched,
		// so the result has to be kept; otherwise only the first tag's
		// matches would ever be visited.
		nodes = nodes.Union(found)
	}

	nodes.Each(func(i int, node *goquery.Selection) {
		var movie video
		switch node.Get(0).DataAtom.String() {
		case "video":
			movie = ve.getVideoTag(node)
		case "embed":
			movie = ve.getEmbedTag(node)
		case "object":
			movie = ve.getObjectTag(node)
		case "iframe":
			movie = ve.getIFrame(node)
		}

		// Helpers signal "nothing usable" with an empty src.
		if movie.src != "" {
			ve.movies.Add(movie)
		}
	})
	return ve.movies
}
|