1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41
|
package goose
import (
"github.com/pkg/errors"
)
// Goose is the article extractor. It carries the configuration that its
// extraction methods hand to the HTML requester and the crawler.
type Goose struct {
	// config holds the settings passed to NewHtmlRequester and NewCrawler.
	config Configuration
}
// New builds a Goose whose configuration is obtained from
// GetDefaultConfiguration, with args forwarded to it unchanged.
func New(args ...string) Goose {
	cfg := GetDefaultConfiguration(args...)
	return Goose{config: cfg}
}
// NewWithConfig builds a Goose that uses the supplied configuration as-is.
func NewWithConfig(config Configuration) Goose {
	var g Goose
	g.config = config
	return g
}
// ExtractFromURL fetches the HTML behind url using a requester built from
// the Goose configuration, then runs the crawler over the fetched markup.
//
// If the fetch fails, the returned article is nil and the error is the
// fetch error wrapped with additional context. Otherwise the result is
// whatever the crawler's Crawl call returns for the fetched HTML and url.
func (g Goose) ExtractFromURL(url string) (*Article, error) {
	requester := NewHtmlRequester(g.config)
	html, err := requester.fetchHTML(url)
	if err != nil {
		return nil, errors.Wrap(err, "could not get html from site")
	}

	crawler := NewCrawler(g.config)
	return crawler.Crawl(html, url)
}
// ExtractFromRawHTML runs the crawler over HTML the caller already has,
// skipping the fetch step. RawHTML and url are passed straight through to
// the crawler's Crawl call, and its results are returned unchanged.
func (g Goose) ExtractFromRawHTML(RawHTML string, url string) (*Article, error) {
	crawler := NewCrawler(g.config)
	article, err := crawler.Crawl(RawHTML, url)
	return article, err
}
|