1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
# Tests of 'bytes' (immutable byte strings).
load("assert.star", "assert")
# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
# The four values defined here are reused as fixtures by the sections below.
hello = bytes("hello, 世界")
goodbye = bytes("goodbye")
empty = bytes("")
nonprinting = bytes("\t\n\x7F\u200D") # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER
# The slice cuts the string in the middle of its last character; the expected
# result carries U+FFFD in place of the partial encoding.
assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")
# bytes(iterable of int) -- construct from numeric byte values
# Both a list and a tuple of ints are accepted.
assert.eq(bytes([65, 66, 67]), b"ABC")
assert.eq(bytes((65, 66, 67)), b"ABC")
assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")
# Failure cases: element outside the byte range, element that is not an int,
# and an argument that is neither a string, bytes, nor an iterable.
assert.fails(lambda: bytes([300]),
"at index 0, 300 out of range .want value in unsigned 8-bit range")
assert.fails(lambda: bytes([b"a"]),
"at index 0, got bytes, want int")
assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")
# literals
# A bytes literal must equal the value produced by bytes(...) from the same text.
assert.eq(b"hello, 世界", hello)
assert.eq(b"goodbye", goodbye)
assert.eq(b"", empty)
assert.eq(b"\t\n\x7F\u200D", nonprinting)
assert.ne("abc", b"abc") # a string and a bytes value with the same text are not equal
assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more
assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw
# type
assert.eq(type(hello), "bytes")
# len
# The length is the number of bytes, so a non-ASCII character counts for
# as many bytes as its UTF-8 encoding needs (2, 3 and 4 in the cases below).
assert.eq(len(hello), 13)
assert.eq(len(goodbye), 7)
assert.eq(len(empty), 0)
assert.eq(len(b"A"), 1)
assert.eq(len(b"Ѐ"), 2)
assert.eq(len(b"世"), 3)
assert.eq(len(b"😿"), 4)
# truth
# Non-empty bytes are truthy; the empty bytes value is falsy.
assert.true(hello)
assert.true(goodbye)
assert.true(not empty)
# str(bytes) does UTF-8 to UTF-k transcoding.
# TODO(adonovan): specify.
assert.eq(str(hello), "hello, 世界")
assert.eq(str(hello[:-1]), "hello, 世��") # incomplete UTF-8 encoding => U+FFFD
assert.eq(str(goodbye), "goodbye")
assert.eq(str(empty), "")
assert.eq(str(nonprinting), "\t\n\x7f\u200d")
assert.eq(str(b"\xED\xB0\x80"), "���") # UTF-8 encoding of unpaired surrogate => U+FFFD x 3
# repr
# In contrast to str, repr shows bytes that are not valid UTF-8 as \x escapes
# rather than replacing them, and escapes the non-printing characters.
assert.eq(repr(hello), r'b"hello, 世界"')
assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"') # (incomplete UTF-8 encoding)
assert.eq(repr(goodbye), 'b"goodbye"')
assert.eq(repr(empty), 'b""')
assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')
# equality
assert.eq(hello, hello)
assert.ne(hello, goodbye)
assert.eq(b"goodbye", goodbye)
# ordered comparison
# A proper prefix sorts before the longer value.
assert.lt(b"abc", b"abd")
assert.lt(b"abc", b"abcd")
assert.lt(b"\x7f", b"\x80") # bytes compare as uint8, not int8
# bytes are dict-hashable
# NOTE(review): the variable name 'dict' shadows the builtin of the same name
# from here on -- confirm nothing later in the file needs the builtin.
dict = {hello: 1, goodbye: 2}
dict[b"goodbye"] = 3 # equal to the existing key, so it overwrites rather than adds
assert.eq(len(dict), 2)
assert.eq(dict[goodbye], 3)
# hash(bytes) is 32-bit FNV-1a.
assert.eq(hash(b""), 0x811c9dc5)
assert.eq(hash(b"a"), 0xe40c292c)
assert.eq(hash(b"ab"), 0x4d2505ca)
assert.eq(hash(b"abc"), 0x1a47e90b)
# indexing
# Indexing yields a 1-byte bytes value (not an int); a negative index counts
# from the end, and an index past the end fails.
assert.eq(goodbye[0], b"g")
assert.eq(goodbye[-1], b"e")
assert.fails(lambda: goodbye[100], "out of range")
# slicing
# Covers open-ended slices, a stride of 2, and results of length 1 and 0.
assert.eq(goodbye[:4], b"good")
assert.eq(goodbye[4:], b"bye")
assert.eq(goodbye[::2], b"gobe")
assert.eq(goodbye[3:4], b"d") # special case: len=1
assert.eq(goodbye[4:4], b"") # special case: len=0
# bytes in bytes
# Tests for a contiguous subsequence: b"dcab" contains both b and c but not b"bc".
assert.eq(b"bc" in b"abcd", True)
assert.eq(b"bc" in b"dcab", False)
# A string left operand is rejected rather than converted.
assert.fails(lambda: "bc" in b"dcab", "requires bytes or int as left operand, not string")
# int in bytes
# The int is treated as a byte value; 256 and -1 are rejected as out of range.
assert.eq(97 in b"abc", True) # 97='a'
assert.eq(100 in b"abc", False) # 100='d'
assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")
# ord TODO(adonovan): specify
# ord accepts only a bytes value of length exactly 1.
assert.eq(ord(b"a"), 97)
assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")
# repeat (bytes * int)
# The operands may appear in either order.
assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")
# elems() returns an iterable value over the numeric values of the bytes.
assert.eq(type(hello.elems()), "bytes.elems")
assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140])
assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello)
assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
assert.eq(list(empty.elems()), [])
assert.eq(bytes(hello.elems()), hello) # bytes(iterable) is dual to bytes.elems()
# x[i] = ...
def f():
    # Attempt element assignment on a bytes value. Bytes are immutable, so
    # calling f is expected to fail; the caller checks the error message.
    b"abc"[1] = b"B"
# Calling f must fail because bytes are immutable; the second argument is the
# pattern (note the ".*" wildcards) that the error message has to match.
assert.fails(f, "bytes.*does not support.*assignment")
# TODO(adonovan): the specification is not finalized in many areas:
# - chr, ord functions
# - encoding/decoding bytes to string.
# - methods: find, index, split, etc.
#
# Summary of string operations (put this in spec).
#
# string to number:
# - bytes[i] returns numeric value of ith byte.
#   (NOTE: the indexing tests in this file currently expect bytes[i] to be a 1-byte bytes value.)
# - ord(string) returns numeric value of sole code point in string.
# - ord(string[i]) is not a useful operation: fails on non-ASCII; see below.
#   Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
#      Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
# - string.codepoints() iterates over 1-codepoint substrings.
# - string.codepoint_ords() iterates over numeric values of code points in string.
# - string.elems() iterates over 1-element (UTF-k code) substrings.
# - string.elem_ords() iterates over numeric UTF-k code values.
# - string.elem_ords()[i] returns numeric value of ith element (UTF-k code).
# - string.elems()[i] returns substring of a single element (UTF-k code).
# - int(string) parses string as decimal (or other) numeric literal.
#
# number to string:
# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
#   Redundant with '%c' % int (which Python2 calls 'unichr').
# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
# - bytes([int]) returns 1-byte string (with regrettable list allocation).
# - str(int) - format number as decimal.
|