File: url.go

package info (click to toggle)
golang-github-johanneskaufmann-html-to-markdown 2.3.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,080 kB
  • sloc: makefile: 3
file content (114 lines) | stat: -rw-r--r-- 2,875 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
package converter

import (
	"net/url"
	"strings"
)

// percentEncodingReplacer percent-encodes a fixed set of characters: the
// space, square brackets, parentheses and angle brackets. Every other
// character is passed through unchanged.
//
// defaultAssembleAbsoluteURL runs the URLs it builds through this replacer
// before returning them.
// NOTE(review): presumably these characters are escaped because they would
// otherwise clash with markdown link syntax — confirm against the renderer.
var percentEncodingReplacer = strings.NewReplacer(
	" ", "%20",
	"[", "%5B",
	"]", "%5D",
	"(", "%28",
	")", "%29",
	"<", "%3C",
	">", "%3E",
)

// parseBaseDomain turns a user-supplied domain into a *url.URL that can be
// used as the base when resolving relative links.
//
// The input is first parsed exactly as given. If that does not produce a
// host (for example because the scheme is missing, as in "example.com"), it
// is parsed a second time with an "http://" prefix. The result is nil when
// rawDomain is empty or when neither attempt yields a URL with a host.
func parseBaseDomain(rawDomain string) *url.URL {
	if rawDomain == "" {
		return nil
	}

	// Attempt the value as-is first, then fall back to assuming plain http.
	candidates := [2]string{rawDomain, "http://" + rawDomain}
	for _, candidate := range candidates {
		parsed, err := url.Parse(candidate)
		if err != nil || parsed.Host == "" {
			continue
		}
		return parsed
	}

	return nil
}
// defaultAssembleAbsoluteURL cleans up the URL taken from an element and,
// when a domain is supplied, resolves it against that domain.
//
// Steps, in order:
//   - surrounding whitespace is trimmed; a bare "#" is returned untouched
//   - newlines and tabs are percent-encoded so that parsing is more likely
//     to succeed
//   - unparsable URLs and "data:" URIs skip all further processing and are
//     only passed through percentEncodingReplacer
//   - query parameters are re-encoded in their original order, with spaces
//     written as "%20" rather than "+"
//   - if domain parses to a usable base, the URL is resolved against it
//
// The tagName parameter is not used by this implementation.
func defaultAssembleAbsoluteURL(tagName string, rawURL string, domain string) string {
	trimmed := strings.TrimSpace(rawURL)

	// url.Parse does not seem to tell "no fragment" apart from "empty
	// fragment", so a lone "#" is handed back before any parsing happens.
	if trimmed == "#" {
		return trimmed
	}

	// Encode the whitespace characters that would otherwise make
	// url.Parse reject the input.
	candidate := strings.ReplaceAll(trimmed, "\n", "%0A")
	candidate = strings.ReplaceAll(candidate, "\t", "%09")

	parsed, err := url.Parse(candidate)
	if err != nil || parsed.Scheme == "data" {
		// Either the url is invalid and there is nothing more we can do,
		// or it is a data uri (e.g. an inline base64 image) that must not
		// be rewritten. In both cases only the special characters are escaped.
		return percentEncodingReplacer.Replace(candidate)
	}

	// Query().Encode() would sort the parameters by key; ParseAndEncodeQuery
	// keeps their original order while still encoding them.
	//
	// A space is then written as "%20" instead of "+" for better
	// compatibility, especially with mailto links: otherwise an email
	// could read "Hi+Johannes" instead of "Hi Johannes".
	parsed.RawQuery = strings.ReplaceAll(ParseAndEncodeQuery(parsed.RawQuery), "+", "%20")

	base := parseBaseDomain(domain)
	if base == nil {
		// No usable domain: keep the link as it is (possibly relative).
		return percentEncodingReplacer.Replace(parsed.String())
	}

	// A domain was provided, so relative links become absolute ones.
	return percentEncodingReplacer.Replace(base.ResolveReference(parsed).String())
}

// - - - - //

// decodeAndEncode normalizes a single query component by unescaping it and
// escaping the result again with url.QueryEscape.
//
// If the input cannot be unescaped (for example because it contains a
// malformed percent sequence), it is returned unchanged.
func decodeAndEncode(original string) string {
	if decoded, err := url.QueryUnescape(original); err == nil {
		return url.QueryEscape(decoded)
	}

	return original
}

// ParseAndEncodeQuery re-encodes every key and value of rawQuery while
// keeping the parameters in their original order.
//
// The query is split on "&", and each part is split on its first "=".
// Keys and values are passed through decodeAndEncode individually. A part
// without "=" stays a bare key, and a part ending in "=" keeps the equals
// sign. An empty rawQuery yields an empty string.
func ParseAndEncodeQuery(rawQuery string) string {
	if rawQuery == "" {
		return ""
	}

	parts := strings.Split(rawQuery, "&")
	for i, part := range parts {
		eq := strings.Index(part, "=")
		if eq < 0 {
			// Just the key, no equal sign.
			parts[i] = decodeAndEncode(part)
			continue
		}

		// Key, equal sign and a (possibly empty) value. An empty value
		// encodes to "", so "key=" keeps its trailing equal sign.
		parts[i] = decodeAndEncode(part[:eq]) + "=" + decodeAndEncode(part[eq+1:])
	}

	return strings.Join(parts, "&")
}