File: escape.bs

package info (click to toggle)
storm-lang 0.7.5-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 52,100 kB
  • sloc: ansic: 261,471; cpp: 140,438; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (143 lines) | stat: -rw-r--r-- 3,711 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
use core:io;

// Translate one ASCII hexadecimal digit into its numeric value (0-15). Both upper- and lowercase
// letters are accepted. Returns null when 'ch' is not a hexadecimal digit.
private Byte? hexDigit(Byte ch) {
	// '0' - '9'
	if (ch >= 0x30 & ch <= 0x39)
		return ch - 0x30;

	// 'A' - 'F'
	if (ch >= 0x41 & ch <= 0x46)
		return ch - 0x41 + 0xA;

	// 'a' - 'f'
	if (ch >= 0x61 & ch <= 0x66)
		return ch - 0x61 + 0xA;

	// Anything else is not a hex digit.
	return null;
}

// Decode a percent-encoded URL component.
//
// Each sequence "%XY", where X and Y are hexadecimal digits, is replaced by the single byte with
// that value. A '%' that is not followed by two hexadecimal digits is dropped, and the bytes after
// it are copied through unchanged. All other bytes are copied as they are. The resulting bytes are
// then decoded as UTF-8.
Str unescapeUrl(Buffer piece) {
	// Escapes are typically used to encode individual UTF-8 octets, so we collect raw bytes first
	// and decode them all at once at the end. The output is never longer than the input.
	Nat length = piece.filled;
	Buffer decoded = buffer(length);

	for (Nat pos = 0; pos < length; pos++) {
		Byte current = piece[pos];

		// Ordinary byte: copy it and move on.
		if (current != 0x25) {
			decoded.push(current);
			continue;
		}

		// We found a '%'. It needs two more bytes after it, otherwise we skip it.
		if (pos + 2 >= length)
			continue;

		var high = hexDigit(piece[pos + 1]);
		var low = hexDigit(piece[pos + 2]);

		// Not a valid escape: skip the '%'. The following bytes are handled by later iterations.
		unless (high)
			continue;
		unless (low)
			continue;

		decoded.push((high << 4) | low);

		// Step past the two hex digits we just consumed.
		pos += 2;
	}

	return decoded.fromUtf8();
}

// Helper to unescape an URL parameter. Works much like the regular URL encoding, except that '+'
// is decoded as a space, and that 0xD 0xA (CR LF) is rewritten into just 0xA (LF).
//
// The CR LF rewrite is applied to the *decoded* bytes, so it works both for raw CR LF bytes and for
// the escaped form "%0D%0A" (which is what 'escapeUrlParam' produces for a newline). A CR that is
// not followed by a LF is kept as it is.
//
// A '%' that is not followed by two hexadecimal digits is dropped, and the bytes after it are
// copied through unchanged. The resulting bytes are decoded as UTF-8.
Str unescapeUrlParam(Buffer piece) {
	// Interestingly, escapes tend to be used to encode UTF-8 octets. So to simplify the
	// implementation, we simply put all data into a buffer, and decode that as UTF8. The output is
	// never longer than the input, so the buffer does not need to grow.
	Nat original = piece.filled;
	Buffer temp = buffer(original);

	Byte cr = 0x0D;

	// Set when we have decoded a CR but not yet written it. We hold on to it until we know whether
	// the next decoded byte is a LF (then the CR is discarded) or not (then the CR is written).
	Bool pendingCr = false;

	for (Nat i = 0; i < original; i++) {
		Byte ch = piece[i];
		if (ch == 0x25) {
			// % ch, unescape it! If anything seems strange, just ignore the % and continue
			// outputting the characters as they were.

			if (i + 2 >= original)
				continue;

			var hex1 = hexDigit(piece[i + 1]);
			var hex2 = hexDigit(piece[i + 2]);
			unless (hex1)
				continue;
			unless (hex2)
				continue;

			ch = (hex1 << 4) | hex2;
			i += 2;
		} else if (ch == 0x2B) {
			// + is used instead of space
			ch = 0x20;
		}

		// Resolve a CR from the previous step: it is only removed if a LF follows directly.
		if (pendingCr) {
			pendingCr = false;
			if (ch != 0x0A)
				temp.push(cr);
		}

		if (ch == 0x0D) {
			// Don't write the CR yet, we need to look at the next decoded byte first.
			pendingCr = true;
		} else {
			temp.push(ch);
		}
	}

	// A CR at the very end is not followed by a LF, so keep it.
	if (pendingCr)
		temp.push(cr);

	return temp.fromUtf8();
}

// Percent-encode a string for use as an URL component.
//
// The characters 0-9, A-Z, a-z, '-', '.', '_' and '~' are emitted as they are. Every other
// codepoint is converted to UTF-8, and each resulting byte is written as '%' followed by its
// hexadecimal representation. The output only contains ASCII characters.
Str escapeUrl(Str original) {
	StrBuf result;
	for (c in original) {
		Nat cp = c.codepoint;

		Bool digit = cp >= 0x30 & cp <= 0x39; // 0-9
		Bool upper = cp >= 0x41 & cp <= 0x5A; // A-Z
		Bool lower = cp >= 0x61 & cp <= 0x7A; // a-z
		// The punctuation that never needs escaping: - . _ ~
		Bool mark = (cp == 0x2D) | (cp == 0x2E) | (cp == 0x5F) | (cp == 0x7E);

		if (digit | upper | lower | mark) {
			result << c;
			continue;
		}

		// Everything else: escape each UTF-8 byte of the codepoint individually.
		Buffer bytes = c.toS.toUtf8();
		for (octet in bytes) {
			result << "%" << hex(octet);
		}
	}
	return result.toS;
}

// Percent-encode a string for use as an URL query parameter.
//
// Works like the escaping of URL components: 0-9, A-Z, a-z, '-', '.', '_' and '~' are emitted as
// they are, and other codepoints are written as the escaped bytes of their UTF-8 encoding. There
// are two exceptions: a space is written as '+', and a newline (\n) is written as an escaped
// \r\n, i.e. "%0D%0A".
Str escapeUrlParam(Str original) {
	StrBuf result;
	for (c in original) {
		Nat cp = c.codepoint;

		Bool digit = cp >= 0x30 & cp <= 0x39; // 0-9
		Bool upper = cp >= 0x41 & cp <= 0x5A; // A-Z
		Bool lower = cp >= 0x61 & cp <= 0x7A; // a-z
		// The punctuation that never needs escaping: - . _ ~
		Bool mark = (cp == 0x2D) | (cp == 0x2E) | (cp == 0x5F) | (cp == 0x7E);

		if (digit | upper | lower | mark) {
			result << c;
		} else if (cp == 0x20) {
			// Space has a short form in parameters.
			result << "+";
		} else if (cp == 0x0A) {
			// Newlines are always sent as CR LF.
			result << "%0D%0A";
		} else {
			// Everything else: escape each UTF-8 byte of the codepoint individually.
			Buffer bytes = c.toS.toUtf8();
			for (octet in bytes) {
				result << "%" << hex(octet);
			}
		}
	}
	return result.toS;
}