1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90
|
package goose
import (
"github.com/PuerkitoBio/goquery"
"golang.org/x/net/html"
)
// Parser groups the DOM helper methods used when extracting the main content
// and other properties from an HTML document. The type has no fields, so its
// zero value is ready to use.
type Parser struct{}

// NewParser allocates a Parser and returns a pointer to it.
func NewParser() *Parser {
	return new(Parser)
}
// dropTag visits every element matched by selection and passes it, together
// with whitelistedTextAtomTypes, to replaceTagWithContents.
// NOTE(review): presumably this swaps each tag for its own children while
// keeping the whitelisted inline content — confirm against
// replaceTagWithContents.
func (p Parser) dropTag(selection *goquery.Selection) {
	unwrap := func(_ int, match *goquery.Selection) {
		replaceTagWithContents(match, whitelistedTextAtomTypes)
	}
	selection.Each(unwrap)
}
// indexOfAttribute reports the position of the attribute whose key equals
// attr within the attribute list of the first node in selection.
//
// It returns -1 when no such attribute exists, and also when selection is
// nil, matches no nodes, or its first node is nil.
func (p Parser) indexOfAttribute(selection *goquery.Selection, attr string) int {
	// Selection.Get(0) indexes the Nodes slice directly and panics on an
	// empty selection, so the emptiness check has to happen first.
	if selection == nil || len(selection.Nodes) == 0 {
		return -1
	}
	node := selection.Get(0)
	if node == nil {
		return -1
	}
	for i, a := range node.Attr {
		if a.Key == attr {
			return i
		}
	}
	return -1
}
// delAttr removes the first attribute whose key equals attr from the first
// node in selection. The node's attribute slice is modified in place.
//
// It does nothing when selection is nil, matches no nodes, has a nil first
// node, or when the attribute is not present.
func (p Parser) delAttr(selection *goquery.Selection, attr string) {
	// Guard here as well: both indexOfAttribute and Selection.Get(0) read
	// Nodes[0], which panics for an empty selection.
	if selection == nil || len(selection.Nodes) == 0 || selection.Nodes[0] == nil {
		return
	}
	idx := p.indexOfAttribute(selection, attr)
	if idx < 0 {
		return
	}
	node := selection.Get(0)
	node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...)
}
// getElementsByTags searches div once per entry in tags and merges the
// matches into a single selection. Tags are processed in the order given,
// each result being unioned onto the accumulated selection, which starts out
// empty.
func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection {
	combined := &goquery.Selection{}
	for i := range tags {
		matches := div.Find(tags[i])
		if matches == nil {
			continue
		}
		combined = combined.Union(matches)
	}
	return combined
}
// clear empties selection by pointing its Nodes field at a fresh, zero-length
// slice. Only the selection is changed; the nodes it used to reference are
// not modified here.
func (p Parser) clear(selection *goquery.Selection) {
	selection.Nodes = []*html.Node{}
}
// removeNode detaches the first node of selection from its parent.
//
// It does nothing when selection is nil, matches no nodes, has a nil first
// node, or when that node has no parent.
func (p Parser) removeNode(selection *goquery.Selection) {
	// Selection.Get(0) panics on an empty selection rather than returning
	// nil, so emptiness must be ruled out before calling it.
	if selection == nil || len(selection.Nodes) == 0 {
		return
	}
	node := selection.Get(0)
	if node == nil || node.Parent == nil {
		return
	}
	node.Parent.RemoveChild(node)
}
// name looks up the attribute called selector through selection.Attr and
// returns its value. When the attribute is reported as missing, the empty
// string is returned instead.
func (p Parser) name(selector string, selection *goquery.Selection) string {
	attrValue, found := selection.Attr(selector)
	if !found {
		return ""
	}
	return attrValue
}
// setAttr sets the attribute attr to value on the first node in selection.
//
// Any existing attributes with the same key are dropped and the new
// attribute is appended at the end of the list, so the attribute may change
// position. All other attributes are carried over unchanged, including their
// Namespace. The node receives a newly allocated attribute slice.
//
// It does nothing when selection is nil, matches no nodes, or has a nil first
// node.
func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) {
	if selection == nil || len(selection.Nodes) == 0 {
		return
	}
	node := selection.Get(0)
	if node == nil {
		return
	}

	// Room for every existing attribute plus the one being set.
	attrs := make([]html.Attribute, 0, len(node.Attr)+1)
	for _, a := range node.Attr {
		if a.Key != attr {
			// Copy the whole struct: rebuilding it from Key and Val
			// alone would silently discard the Namespace field.
			attrs = append(attrs, a)
		}
	}
	attrs = append(attrs, html.Attribute{Key: attr, Val: value})
	node.Attr = attrs
}
|