File: escape.bs

package info (click to toggle)
storm-lang 0.7.5-2
links: PTS, VCS
area: main
in suites: forky, sid
size: 52,100 kB
sloc: ansic: 261,471; cpp: 140,438; sh: 14,891; perl: 9,846; python: 2,525; lisp: 2,504; asm: 860; makefile: 678; pascal: 70; java: 52; xml: 37; awk: 12
file content (143 lines) | stat: -rw-r--r-- 3,711 bytes
use core:io;

// Convert one ASCII hexadecimal digit ('0'-'9', 'A'-'F' or 'a'-'f') into its numeric value.
// Returns null when 'ch' is not a hexadecimal digit.
private Byte? hexDigit(Byte ch) {
	// '0'-'9' map to 0-9.
	if (ch >= 0x30 & ch <= 0x39)
		return ch - 0x30;

	// 'A'-'F' map to 10-15.
	if (ch >= 0x41 & ch <= 0x46)
		return ch - 0x41 + 0xA;

	// 'a'-'f' map to 10-15.
	if (ch >= 0x61 & ch <= 0x66)
		return ch - 0x61 + 0xA;

	// Anything else is not a hexadecimal digit.
	return null;
}

// Unescape an URL component: every "%XX" sequence (two hexadecimal digits) in the filled part of
// 'piece' is replaced by the byte it denotes, and the resulting bytes are decoded as UTF-8.
//
// A '%' that is not followed by two hexadecimal digits is dropped, and the bytes after it are
// copied as they are.
Str unescapeUrl(Buffer piece) {
	// Escapes are typically used to encode UTF-8 octets, so we collect raw bytes first and decode
	// them all at the end. The output is never longer than the input.
	Nat length = piece.filled;
	Buffer decoded = buffer(length);
	for (Nat pos = 0; pos < length; pos++) {
		Byte current = piece[pos];

		// Ordinary bytes are copied through untouched.
		if (current != 0x25) {
			decoded.push(current);
			continue;
		}

		// We found a '%'. It needs two more bytes after it; otherwise, skip the '%'.
		if (pos + 2 >= length)
			continue;

		var high = hexDigit(piece[pos + 1]);
		var low = hexDigit(piece[pos + 2]);
		unless (high)
			continue;
		unless (low)
			continue;

		// Valid escape: emit the decoded byte and step past the two digits.
		Byte value = (high << 4) | low;
		decoded.push(value);
		pos += 2;
	}

	return decoded.fromUtf8();
}

// Helper to unescape an URL parameter. Works much like the regular URL decoding, except that '+'
// is decoded as a space and CR LF (0xD 0xA) is rewritten into just LF (0xA).
//
// The CR LF rewrite is applied to the decoded bytes. This means that a line break that arrives
// percent-encoded as "%0D%0A" (which is what 'escapeUrlParam' emits for "\n") is collapsed as well,
// not only a raw CR LF pair in the input. A CR that is not followed by a LF is kept as it is.
//
// A '%' that is not followed by two hexadecimal digits is dropped, and the bytes after it are
// copied as they are. The resulting bytes are decoded as UTF-8.
Str unescapeUrlParam(Buffer piece) {
	// Escapes are typically used to encode UTF-8 octets, so we collect raw bytes first and decode
	// them all at the end. The output is never longer than the input.
	Nat original = piece.filled;
	Buffer temp = buffer(original);

	// True when the last decoded byte was a CR that has not yet been written to 'temp'. We hold it
	// back until we know whether the next decoded byte is a LF.
	Bool heldCr = false;
	Byte cr = 0x0D;

	for (Nat i = 0; i < original; i++) {
		Byte ch = piece[i];
		if (ch == 0x25) {
			// % ch, unescape it! If anything seems strange, just skip the % and continue
			// outputting the characters as they were.

			if (i + 2 >= original)
				continue;

			var hex1 = hexDigit(piece[i + 1]);
			var hex2 = hexDigit(piece[i + 2]);
			unless (hex1)
				continue;
			unless (hex2)
				continue;

			ch = (hex1 << 4) | hex2;
			i += 2;
		} else if (ch == 0x2B) {
			// A literal + is used instead of space. Note that an escaped + (%2B) is handled above
			// and remains a +.
			ch = 0x20;
		}

		// Resolve a CR that was held back: drop it if it is followed by LF, otherwise emit it.
		if (heldCr) {
			if (ch != 0x0A) {
				temp.push(cr);
			}
			heldCr = false;
		}

		if (ch == 0x0D) {
			// Remove carriage returns before newline characters: wait for the next byte.
			heldCr = true;
		} else {
			temp.push(ch);
		}
	}

	// A CR at the very end of the input is not part of a CR LF pair, so keep it.
	if (heldCr) {
		temp.push(cr);
	}

	return temp.fromUtf8();
}

// Helper to escape an URL component. Letters, digits and the characters '-', '.', '_' and '~' are
// copied as they are. Every other character is converted to UTF-8, and each resulting byte is
// written as '%' followed by its hexadecimal representation. The output string will be ASCII, so
// the encoding does not matter.
Str escapeUrl(Str original) {
	StrBuf out;
	for (c in original) {
		Nat cp = c.codepoint;

		// Characters that may appear unescaped.
		Bool unreserved = (cp >= 0x30 & cp <= 0x39)  // 0-9
			| (cp >= 0x41 & cp <= 0x5A)              // A-Z
			| (cp >= 0x61 & cp <= 0x7A)              // a-z
			| (cp == 0x2D)                           // -
			| (cp == 0x2E)                           // .
			| (cp == 0x5F)                           // _
			| (cp == 0x7E);                          // ~

		if (unreserved) {
			out << c;
		} else {
			// Escape each UTF-8 octet of the character separately.
			Buffer encoded = c.toS.toUtf8();
			for (octet in encoded) {
				out << "%" << hex(octet);
			}
		}
	}
	return out.toS;
}

// Helper to escape an URL query parameter. Very much like for URL components, except that spaces
// are encoded as + and \n is expanded to \r\n.
//
// A \n that directly follows a \r is already part of a \r\n pair, so it is encoded as just "%0A"
// to avoid producing a doubled carriage return ("%0D%0D%0A"). A \r on its own is encoded as any
// other unsafe character.
//
// Letters, digits and the characters '-', '.', '_' and '~' are copied as they are. All remaining
// characters are converted to UTF-8, and each byte is written as '%' followed by its hexadecimal
// representation.
Str escapeUrlParam(Str original) {
	StrBuf out;

	// True if the previous character was a CR, so that a LF that follows completes an existing
	// CR LF pair rather than starting a new line break.
	Bool afterCr = false;

	for (ch in original) {
		Nat codepoint = ch.codepoint;
		Bool safe = codepoint >= 0x30 & codepoint <= 0x39; // 0-9
		safe |= codepoint >= 0x41 & codepoint <= 0x5A; // A-Z
		safe |= codepoint >= 0x61 & codepoint <= 0x7A; // a-z
		safe |= codepoint == 0x2D; // -
		safe |= codepoint == 0x2E; // .
		safe |= codepoint == 0x5F; // _
		safe |= codepoint == 0x7E; // ~

		if (safe) {
			out << ch;
		} else if (codepoint == 0x20) {
			// Space is encoded as +.
			out << "+";
		} else if (codepoint == 0x0A) {
			if (afterCr) {
				// The CR was already written as %0D, only the LF remains.
				out << "%0A";
			} else {
				// Lone LF: expand to CR LF.
				out << "%0D%0A";
			}
		} else {
			// Escape each UTF-8 octet of the character separately.
			Buffer utf8 = ch.toS.toUtf8();
			for (octet in utf8) {
				out << "%" << hex(octet);
			}
		}

		afterCr = (codepoint == 0x0D);
	}
	return out.toS;
}