File: conv.go

package info (click to toggle)
golang-golang-x-tools 1:0.25.0+ds-1
links: PTS, VCS
area: main
in suites: experimental, sid, trixie
size: 22,724 kB
sloc: javascript: 2,027; asm: 1,645; sh: 166; yacc: 155; makefile: 49; ansic: 8
file content (331 lines) | stat: -rw-r--r-- 6,834 bytes
parent folder | download | duplicates (4)
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// This program takes an HTML file and outputs a corresponding article file in
// present format. See: golang.org/x/tools/present
package main // import "golang.org/x/tools/cmd/html2article"

import (
	"bytes"
	"errors"
	"flag"
	"fmt"
	"io"
	"log"
	"net/url"
	"os"
	"regexp"
	"strings"

	"golang.org/x/net/html"
	"golang.org/x/net/html/atom"
)

// main parses the command-line flags, then converts the HTML document read
// from standard input into an article written to standard output. Any
// conversion error is fatal.
func main() {
	flag.Parse()

	if err := convert(os.Stdout, os.Stdin); err != nil {
		log.Fatal(err)
	}
}

// convert parses the HTML document read from r and writes the corresponding
// article to w.
//
// The first <style> element is handed to parseStyles; a failure there is only
// logged, and conversion carries on with whatever rules were collected. A
// missing <body> element is an error. The body text is trimmed, passed through
// makeHeadings and limitNewlineRuns, and written after a fixed "Title" line.
func convert(w io.Writer, r io.Reader) error {
	doc, err := html.Parse(r)
	if err != nil {
		return err
	}

	// Styles are best-effort: report the problem and keep going.
	if styleErr := parseStyles(find(doc, isTag(atom.Style))); styleErr != nil {
		log.Printf("couldn't parse all styles: %v", styleErr)
	}

	body := find(doc, isTag(atom.Body))
	if body == nil {
		return errors.New("couldn't find body")
	}

	raw := strings.TrimSpace(text(body))
	article := limitNewlineRuns(makeHeadings(raw))
	_, err = fmt.Fprintf(w, "Title\n\n%s", article)
	return err
}

// Style identifies an inline text style. The constant values match the marker
// strings that text passes to highlight for the corresponding elements.
type Style string

// The inline styles recognised by parseStyles and hasStyle.
const (
	Bold   Style = "*"
	Italic Style = "_"
	Code   Style = "`"
)

// cssRules maps a CSS selector taken from the document's <style> element to
// the Style it implies. parseStyles writes it; hasStyle reads it.
var cssRules = make(map[string]Style)

// parseStyles scans the CSS text held in the first child of the given <style>
// node and records in cssRules which selectors imply italic, bold or code
// formatting.
//
// The CSS is treated as a flat sequence of "selector { declarations }" rules.
// A rule is classified by substring match on its declarations: "italic" maps
// to Italic, "bold" to Bold, and the font names "Consolas" or "Courier New" to
// Code. Rules matching none of these are ignored. Selectors are stored with
// surrounding whitespace removed.
//
// It returns an error when style is nil or has no children, when a rule body
// has no closing brace, or when non-blank text that is not followed by '{'
// remains after the last rule. Rules recorded before an error stay in cssRules.
func parseStyles(style *html.Node) error {
	if style == nil || style.FirstChild == nil {
		return errors.New("couldn't find styles")
	}

	rest := style.FirstChild.Data
	// readUntil returns the text before the next occurrence of end and moves
	// rest past that delimiter. The delimiter must be consumed: leaving it in
	// place would prefix every later selector with '}' and every rule body
	// with '{', so only the first rule could ever match an element.
	readUntil := func(end rune) (string, bool) {
		i := strings.IndexRune(rest, end)
		if i < 0 {
			return "", false
		}
		s := rest[:i]
		rest = rest[i+len(string(end)):]
		return s, true
	}

	for {
		sel, ok := readUntil('{')
		if !ok {
			// No further rule starts here. Trailing whitespace is fine;
			// anything else is text that could not be parsed.
			if strings.TrimSpace(rest) == "" {
				return nil
			}
			return fmt.Errorf("could not parse selector %q", rest)
		}
		sel = strings.TrimSpace(sel)

		value, ok := readUntil('}')
		if !ok {
			return fmt.Errorf("couldn't parse style body for %s", sel)
		}
		switch {
		case strings.Contains(value, "italic"):
			cssRules[sel] = Italic
		case strings.Contains(value, "bold"):
			cssRules[sel] = Bold
		case strings.Contains(value, "Consolas") || strings.Contains(value, "Courier New"):
			cssRules[sel] = Code
		}
	}
}

// newlineRun matches two or more consecutive newline characters.
var newlineRun = regexp.MustCompile(`\n\n+`)

// limitNewlineRuns collapses every run of two or more newlines in s into
// exactly two, so the result never contains more than one blank line in a row.
func limitNewlineRuns(s string) string {
	const blankLine = "\n\n"
	// The replacement has no '$' expansions, so the literal form is equivalent.
	return newlineRun.ReplaceAllLiteralString(s, blankLine)
}

// makeHeadings rewrites body line by line, turning every line that
// isBoldTitle accepts into a "* Title" heading: the asterisks in the line are
// replaced by spaces and the result is trimmed. If the first line is not such
// a title, a "* Introduction" heading is placed in front of it. Every line of
// the result, including the last, ends with a newline.
func makeHeadings(body string) string {
	var out bytes.Buffer
	for idx, line := range strings.Split(body, "\n") {
		title := isBoldTitle(line)
		if idx == 0 && !title {
			out.WriteString("* Introduction\n\n")
		}
		if title {
			line = "* " + strings.TrimSpace(strings.ReplaceAll(line, "*", " "))
		}
		out.WriteString(line)
		out.WriteByte('\n')
	}
	return out.String()
}

// isBoldTitle reports whether s contains no space character and both starts
// and ends with an asterisk.
func isBoldTitle(s string) bool {
	if strings.Contains(s, " ") {
		return false
	}
	return strings.HasPrefix(s, "*") && strings.HasSuffix(s, "*")
}

// indent writes s to buf one line at a time, putting a tab in front of every
// non-empty line. Each line, empty or not, is followed by a newline.
func indent(buf *bytes.Buffer, s string) {
	lines := strings.Split(s, "\n")
	for i := 0; i < len(lines); i++ {
		if line := lines[i]; line != "" {
			buf.WriteString("\t" + line)
		}
		buf.WriteByte('\n')
	}
}

// unwrap writes s to buf with hard line breaks removed. Each line is trimmed
// of surrounding whitespace; consecutive non-blank lines are joined with a
// single space. The first blank line after a non-blank one produces two
// newlines, and any further blank lines produce nothing.
func unwrap(buf *bytes.Buffer, s string) {
	inParagraph := false
	for _, raw := range strings.Split(s, "\n") {
		line := strings.TrimSpace(raw)
		switch {
		case line == "" && inParagraph:
			// End of a paragraph.
			buf.WriteString("\n\n")
			inParagraph = false
		case line == "":
			// Extra blank line: nothing to emit.
		default:
			if inParagraph {
				buf.WriteByte(' ')
			}
			buf.WriteString(line)
			inParagraph = true
		}
	}
}

// text renders the subtree rooted at n as article text and returns it.
//
// Text nodes are copied verbatim. Elements that have a dedicated rendering
// (br, p, li, pre, a, code, b, i, img, iframe, param) are emitted in that form
// and their children are not walked again. The contents of title elements are
// dropped. Any other element is transparent: only its children contribute. A
// span is treated as code, b or i when hasStyle reports the matching style for
// it, checked in that order.
func text(n *html.Node) string {
	var out bytes.Buffer
	visit := func(cur *html.Node) bool {
		if cur.Type == html.TextNode {
			out.WriteString(cur.Data)
			return false
		}
		if cur.Type != html.ElementNode {
			// Not an element and not text: just descend.
			return true
		}

		// Promote styled spans to the element they stand for.
		tag := cur.DataAtom
		if tag == atom.Span {
			if hasStyle(Code)(cur) {
				tag = atom.Code
			} else if hasStyle(Bold)(cur) {
				tag = atom.B
			} else if hasStyle(Italic)(cur) {
				tag = atom.I
			}
		}

		switch tag {
		case atom.Br:
			out.WriteByte('\n')
		case atom.P:
			unwrap(&out, childText(cur))
			out.WriteString("\n\n")
		case atom.Li:
			out.WriteString("- ")
			unwrap(&out, childText(cur))
			out.WriteByte('\n')
		case atom.Pre:
			indent(&out, childText(cur))
			out.WriteByte('\n')
		case atom.A:
			target, label := attr(cur, "href"), childText(cur)
			switch {
			case strings.TrimSpace(label) == "":
				// A link without text produces no output.
			case strings.TrimSpace(target) == "":
				// A link without a destination degrades to its text.
				out.WriteString(label)
			default:
				// Use original url for Google Docs redirections.
				parsed, err := url.Parse(target)
				if err != nil {
					log.Printf("parsing url %q: %v", target, err)
				} else if parsed.Host == "www.google.com" && parsed.Path == "/url" {
					target = parsed.Query().Get("q")
				}
				fmt.Fprintf(&out, "[[%s][%s]]", target, label)
			}
		case atom.Code:
			out.WriteString(highlight(cur, "`"))
		case atom.B:
			out.WriteString(highlight(cur, "*"))
		case atom.I:
			out.WriteString(highlight(cur, "_"))
		case atom.Img:
			fmt.Fprintf(&out, ".image %s\n", attr(cur, "src"))
		case atom.Iframe:
			source := attr(cur, "src")
			width := attr(cur, "width")
			height := attr(cur, "height")
			// Height is written before width.
			fmt.Fprintf(&out, "\n.iframe %s %s %s\n", source, height, width)
		case atom.Param:
			if attr(cur, "name") != "movie" {
				break
			}
			// Old style YouTube embed: switch to the embed URL and drop
			// everything from the first '&' on.
			movie := strings.Replace(attr(cur, "value"), "/v/", "/embed/", 1)
			if amp := strings.Index(movie, "&"); amp >= 0 {
				movie = movie[:amp]
			}
			fmt.Fprintf(&out, "\n.iframe %s 540 304\n", movie)
		case atom.Title:
			// Emit nothing and do not descend.
		default:
			return true
		}
		return false
	}
	walk(n, visit)
	return out.String()
}

// childText returns the concatenation of text applied to each direct child of
// node, in document order. The node's own element type plays no part.
func childText(node *html.Node) string {
	var out bytes.Buffer
	child := node.FirstChild
	for child != nil {
		out.WriteString(text(child))
		child = child.NextSibling
	}
	return out.String()
}

// highlight returns the text of node's children wrapped in the marker char,
// with every space inside the emphasised text replaced by char as well.
//
// Whitespace at the very start or end of the text is kept outside the
// markers, so "bold " becomes "*bold* " instead of "*bold**". If the text is
// empty or consists only of whitespace it is returned unchanged with no
// markers at all.
func highlight(node *html.Node, char string) string {
	raw := childText(node)
	core := strings.TrimSpace(raw)
	if core == "" {
		// Nothing visible to emphasise. Bare markers such as "**" on a line
		// of their own would be accepted by isBoldTitle.
		return raw
	}

	// core begins with a non-space character and everything before it in raw
	// is whitespace, so its first occurrence is where the trimmed text starts.
	start := strings.Index(raw, core)
	lead, trail := raw[:start], raw[start+len(core):]

	body := strings.Replace(core, " ", char, -1)
	return lead + char + body + char + trail
}

// selector is a predicate over HTML nodes. find uses it to pick a node, and
// walk uses its result to decide whether to descend into a node's children.
type selector func(*html.Node) bool

// isTag returns a selector that accepts exactly those nodes whose DataAtom
// equals a.
func isTag(a atom.Atom) selector {
	matches := func(n *html.Node) bool {
		return a == n.DataAtom
	}
	return matches
}

// hasClass returns a selector that accepts nodes carrying a "class" attribute
// in which name appears as one of the whitespace-separated tokens.
func hasClass(name string) selector {
	return func(n *html.Node) bool {
		for i := range n.Attr {
			if n.Attr[i].Key != "class" {
				continue
			}
			for _, token := range strings.Fields(n.Attr[i].Val) {
				if token == name {
					return true
				}
			}
		}
		return false
	}
}

// hasStyle returns a selector that accepts a node when some entry of cssRules
// with style s applies to it. An entry applies either as a class rule (the
// selector starts with "." and the node has that class) or as a tag rule (the
// selector equals the string form of the node's DataAtom).
func hasStyle(s Style) selector {
	return func(n *html.Node) bool {
		tag := n.DataAtom.String()
		for sel, kind := range cssRules {
			if kind == s {
				byClass := strings.HasPrefix(sel, ".") && hasClass(sel[1:])(n)
				if byClass || sel == tag {
					return true
				}
			}
		}
		return false
	}
}

// attr returns the value of the first attribute of node whose key equals key,
// or the empty string when node has no such attribute.
func attr(node *html.Node, key string) (value string) {
	for i := range node.Attr {
		if a := node.Attr[i]; a.Key == key {
			return a.Val
		}
	}
	return ""
}

// find returns the first node accepted by fn in a pre-order walk of the tree
// rooted at n, or nil if there is none. Once a node has been found fn is not
// called again.
func find(n *html.Node, fn selector) *html.Node {
	var match *html.Node
	walk(n, func(cur *html.Node) bool {
		switch {
		case match != nil:
			// Already found: prune the rest of the walk.
			return false
		case fn(cur):
			match = cur
			return false
		}
		return true
	})
	return match
}

// walk calls fn on n and, when fn returns true, recurses into each child of n
// in order. A false result prunes the subtree below n.
func walk(n *html.Node, fn selector) {
	if !fn(n) {
		return
	}
	child := n.FirstChild
	for child != nil {
		walk(child, fn)
		child = child.NextSibling
	}
}