File: parser.go

package info (click to toggle)
golang-github-advancedlogic-goose 0.0~git20210820.9d5822d%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 516 kB
  • sloc: makefile: 128; sh: 11
file content (90 lines) | stat: -rw-r--r-- 2,090 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
package goose

import (
	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
)

// Parser is an HTML parser specialised in extraction of main content and other properties
type Parser struct{}

// NewParser returns an HTML parser
func NewParser() *Parser {
	return &Parser{}
}

func (p Parser) dropTag(selection *goquery.Selection) {
	selection.Each(func(i int, s *goquery.Selection) {
		replaceTagWithContents(s, whitelistedTextAtomTypes)
	})
}

func (p Parser) indexOfAttribute(selection *goquery.Selection, attr string) int {
	node := selection.Get(0)
	for i, a := range node.Attr {
		if a.Key == attr {
			return i
		}
	}
	return -1
}

func (p Parser) delAttr(selection *goquery.Selection, attr string) {
	idx := p.indexOfAttribute(selection, attr)
	if idx > -1 {
		node := selection.Get(0)
		node.Attr = append(node.Attr[:idx], node.Attr[idx+1:]...)
	}
}

func (p Parser) getElementsByTags(div *goquery.Selection, tags []string) *goquery.Selection {
	selection := new(goquery.Selection)
	for _, tag := range tags {
		selections := div.Find(tag)
		if selections != nil {
			selection = selection.Union(selections)
		}
	}
	return selection
}

func (p Parser) clear(selection *goquery.Selection) {
	selection.Nodes = make([]*html.Node, 0)
}

func (p Parser) removeNode(selection *goquery.Selection) {
	if selection != nil {
		node := selection.Get(0)
		if node != nil && node.Parent != nil {
			node.Parent.RemoveChild(node)
		}
	}
}

func (p Parser) name(selector string, selection *goquery.Selection) string {
	value, exists := selection.Attr(selector)
	if exists {
		return value
	}
	return ""
}

func (p Parser) setAttr(selection *goquery.Selection, attr string, value string) {
	if selection.Size() > 0 {
		node := selection.Get(0)
		var attrs []html.Attribute
		for _, a := range node.Attr {
			if a.Key != attr {
				newAttr := new(html.Attribute)
				newAttr.Key = a.Key
				newAttr.Val = a.Val
				attrs = append(attrs, *newAttr)
			}
		}
		newAttr := new(html.Attribute)
		newAttr.Key = attr
		newAttr.Val = value
		attrs = append(attrs, *newAttr)
		node.Attr = attrs
	}
}