
|
package goose
import (
"strconv"
"strings"
"github.com/PuerkitoBio/goquery"
"github.com/fatih/set"
)
// VideoExtractor can extract the main video from an HTML page.
//
// Use NewVideoExtractor to obtain a value whose sets are allocated.
type VideoExtractor struct {
	// article and config are not read by the extractor methods visible
	// alongside this type.
	// NOTE(review): confirm whether another file populates or uses them.
	article *Article
	config  Configuration

	// candidates is consulted by getObjectTag when it looks at an <object>'s
	// nested <embed>; movies accumulates the videos found by GetVideos and is
	// the set GetVideos returns.
	candidates, movies *set.Set
}
// video describes a single video element found in a document.
type video struct {
	// embedType is the tag name of the element the video was read from;
	// provider is the entry of videoProviders matched against src, or "".
	embedType, provider string

	// width and height mirror the element's "width" and "height" attributes
	// (0 when the attribute is not present).
	width, height int

	// embedCode is the text content of the element; src is the video URL.
	embedCode, src string
}
// NewVideoExtractor returns a new instance of a HTML video extractor.
//
// The candidates and movies sets are each allocated with set.ThreadSafe;
// article and config are left at their zero values.
func NewVideoExtractor() VideoExtractor {
	var extractor VideoExtractor
	extractor.candidates = set.New(set.ThreadSafe).(*set.Set)
	extractor.movies = set.New(set.ThreadSafe).(*set.Set)
	return extractor
}
var (
	// videoTags lists the element names GetVideos searches the document for.
	videoTags = [4]string{
		"iframe",
		"embed",
		"object",
		"video",
	}

	// videoProviders lists the provider names getProvider looks for inside a
	// video source URL, in matching order.
	videoProviders = [4]string{
		"youtube",
		"vimeo",
		"dailymotion",
		"kewego",
	}
)
// getEmbedCode returns the embed code recorded for node, which is the
// combined text content of the selection as reported by node.Text().
func (ve *VideoExtractor) getEmbedCode(node *goquery.Selection) string {
	code := node.Text()
	return code
}
// getWidth returns node's "width" attribute parsed as a base-10 integer.
//
// It returns 0 when the attribute is absent or when its value cannot be
// parsed as an int (for example "100%", "640px" or an out-of-range number).
func (ve *VideoExtractor) getWidth(node *goquery.Selection) int {
	value, exists := node.Attr("width")
	if !exists {
		return 0
	}
	width, err := strconv.Atoi(value)
	if err != nil {
		// On a range error Atoi still returns a clamped, non-zero number;
		// treat every parse failure as "unknown" instead of leaking it.
		return 0
	}
	return width
}
// getHeight returns node's "height" attribute parsed as a base-10 integer.
//
// It returns 0 when the attribute is absent or when its value cannot be
// parsed as an int (for example "100%", "360px" or an out-of-range number).
func (ve *VideoExtractor) getHeight(node *goquery.Selection) int {
	value, exists := node.Attr("height")
	if !exists {
		return 0
	}
	height, err := strconv.Atoi(value)
	if err != nil {
		// On a range error Atoi still returns a clamped, non-zero number;
		// treat every parse failure as "unknown" instead of leaking it.
		return 0
	}
	return height
}
// getSrc returns the value of node's "src" attribute, or the empty string
// when the attribute is not present.
func (ve *VideoExtractor) getSrc(node *goquery.Selection) string {
	return node.AttrOr("src", "")
}
// getProvider returns the first entry of videoProviders that occurs as a
// substring of src. The comparison is case-sensitive. It returns "" when src
// is empty or no provider name is found in it.
func (ve *VideoExtractor) getProvider(src string) string {
	if src == "" {
		return ""
	}
	for i := range videoProviders {
		if name := videoProviders[i]; strings.Contains(src, name) {
			return name
		}
	}
	return ""
}
// getVideo builds a video from the attributes of node: its "src", its text
// content as embed code, the tag name of its first element as embed type,
// its "width" and "height", and the provider derived from the src.
//
// node must contain at least one element, because the first one is read via
// node.Get(0).
func (ve *VideoExtractor) getVideo(node *goquery.Selection) video {
	var v video
	v.src = ve.getSrc(node)
	v.embedCode = ve.getEmbedCode(node)
	v.embedType = node.Get(0).DataAtom.String()
	v.width = ve.getWidth(node)
	v.height = ve.getHeight(node)
	v.provider = ve.getProvider(v.src)
	return v
}
// getIFrame extracts a video from an <iframe> element. An iframe needs no
// special handling, so this simply defers to getVideo.
func (ve *VideoExtractor) getIFrame(node *goquery.Selection) video {
	iframe := ve.getVideo(node)
	return iframe
}
// getVideoTag handles <video> elements. It currently ignores node and always
// returns the zero video, whose empty src makes GetVideos skip it.
func (ve *VideoExtractor) getVideoTag(node *goquery.Selection) video {
	var empty video
	return empty
}
// getEmbedTag extracts a video from an <embed> element.
//
// When the embed's parent element is an <object>, the embed node is handed
// to getObjectTag; otherwise the embed is read directly with getVideo.
func (ve *VideoExtractor) getEmbedTag(node *goquery.Selection) video {
	parent := node.Parent()
	// Parent never returns a nil selection; an element without a parent
	// yields an empty one, and Get(0) on an empty selection would panic.
	if parent.Length() > 0 {
		parentTag := parent.Get(0).DataAtom.String()
		if parentTag == "object" {
			// NOTE(review): the embed itself (not its parent) is passed on,
			// so getObjectTag searches the embed's descendants for the
			// movie param — confirm this is the intended node.
			return ve.getObjectTag(node)
		}
	}
	return ve.getVideo(node)
}
// getObjectTag extracts a video from an <object> element.
//
// The video source is taken from the "value" attribute of a descendant
// <param name="movie">. The zero video is returned when no such param exists
// or when its value does not name a known provider. Otherwise the video
// built by getVideo is returned with provider and src overridden from the
// param.
func (ve *VideoExtractor) getObjectTag(node *goquery.Selection) video {
	var none video

	// Drop a nested <embed> from the candidate set when it is registered
	// there — presumably so it is not handled a second time.
	// NOTE(review): Find returns a fresh *Selection on every call; confirm
	// that this membership test can ever match.
	if embed := node.Find("embed"); ve.candidates.Has(embed) {
		ve.candidates.Remove(embed)
	}

	movieParam := node.Find(`param[name="movie"]`)
	if movieParam == nil || movieParam.Length() == 0 {
		return none
	}

	movieURL, _ := movieParam.Attr("value")
	provider := ve.getProvider(movieURL)
	if provider == "" {
		return none
	}

	result := ve.getVideo(node)
	result.provider = provider
	result.src = movieURL
	return result
}
// GetVideos returns the video tags embedded in the article.
//
// Every element of doc whose tag is listed in videoTags is inspected with
// the extractor matching its tag. Each resulting video with a non-empty src
// is added to ve.movies, which is also the returned set; entries added by
// earlier calls on the same extractor therefore remain in it.
func (ve *VideoExtractor) GetVideos(doc *goquery.Document) *set.Set {
	var nodes *goquery.Selection
	for _, videoTag := range videoTags {
		found := doc.Find(videoTag)
		if nodes == nil {
			nodes = found
			continue
		}
		// Union returns a new selection instead of modifying its receiver,
		// so the result has to be kept; otherwise only the first tag's
		// matches would ever be processed.
		nodes = nodes.Union(found)
	}

	nodes.Each(func(i int, node *goquery.Selection) {
		tag := node.Get(0).DataAtom.String()
		var movie video
		switch tag {
		case "video":
			movie = ve.getVideoTag(node)
		case "embed":
			movie = ve.getEmbedTag(node)
		case "object":
			movie = ve.getObjectTag(node)
		case "iframe":
			movie = ve.getIFrame(node)
		}

		// Extractors signal "nothing usable" with an empty src.
		if movie.src != "" {
			ve.movies.Add(movie)
		}
	})
	return ve.movies
}
|