File: bytes.star

package info (click to toggle)
golang-starlark 0.0~git20240725.42030a7-1
  • links: PTS, VCS
  • area: main
  • in suites: experimental, forky, sid, trixie
  • size: 1,296 kB
  • sloc: makefile: 8; sh: 8
file content (159 lines) | stat: -rw-r--r-- 5,813 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# Tests of 'bytes' (immutable byte strings).

load("assert.star", "assert")

# bytes(string) -- UTF-k to UTF-8 transcoding with U+FFFD replacement
# The values bound here (hello, goodbye, empty, nonprinting) are shared
# fixtures that the assertions in the rest of this file refer to.
hello = bytes("hello, 世界")
goodbye = bytes("goodbye")
empty = bytes("")
nonprinting = bytes("\t\n\x7F\u200D")  # TAB, NEWLINE, DEL, ZERO_WIDTH_JOINER
# Slicing off the last element of the string leaves an incomplete encoding
# of its final character; the expected bytes contain U+FFFD replacement
# characters in its place.
assert.eq(bytes("hello, 世界"[:-1]), b"hello, 世��")

# bytes(iterable of int) -- construct from numeric byte values
# Both a list and a tuple of ints are accepted.
assert.eq(bytes([65, 66, 67]), b"ABC")
assert.eq(bytes((65, 66, 67)), b"ABC")
# 0xf0 0x9f 0x98 0xbf is the 4-byte UTF-8 encoding of U+1F63F.
assert.eq(bytes([0xf0, 0x9f, 0x98, 0xbf]), b"😿")
# Error cases: an element outside the unsigned 8-bit range, an element
# that is not an int, and an argument that is not iterable at all.
# The expected message is presumably matched as a regular expression
# (cf. the '.*' in the item-assignment test in this file), which is why
# '.' stands in for a literal character in the first message.
assert.fails(lambda: bytes([300]),
             "at index 0, 300 out of range .want value in unsigned 8-bit range")
assert.fails(lambda: bytes([b"a"]),
             "at index 0, got bytes, want int")
assert.fails(lambda: bytes(1), "want string, bytes, or iterable of ints")

# literals
# A b"..." literal equals the result of bytes("...") applied to the same text.
assert.eq(b"hello, 世界", hello)
assert.eq(b"goodbye", goodbye)
assert.eq(b"", empty)
assert.eq(b"\t\n\x7F\u200D", nonprinting)
# A string and a bytes value with the same content are not equal.
assert.ne("abc", b"abc")
# Octal, \x, \u and \U escapes are all accepted in bytes literals.
assert.eq(b"\012\xff\u0400\U0001F63F", b"\n\xffЀ😿") # see scanner tests for more
# In a raw bytes literal (rb"...") backslashes are kept verbatim.
assert.eq(rb"\r\n\t", b"\\r\\n\\t") # raw

# type
assert.eq(type(hello), "bytes")

# len
# len counts bytes, not code points: "hello, " is 7 bytes and each of the
# two CJK characters contributes 3, giving 13.
assert.eq(len(hello), 13)
assert.eq(len(goodbye), 7)
assert.eq(len(empty), 0)
# Single characters whose encodings are 1, 2, 3 and 4 bytes long.
assert.eq(len(b"A"), 1)
assert.eq(len(b"Ѐ"), 2)
assert.eq(len(b"世"), 3)
assert.eq(len(b"😿"), 4)

# truth
# Non-empty bytes are truthy; the empty bytes value is falsy.
assert.true(hello)
assert.true(goodbye)
assert.true(not empty)

# str(bytes) does UTF-8 to UTF-k transcoding.
# TODO(adonovan): specify.
# Well-formed input converts back to the text it was built from; the two
# malformed inputs below are expected to come out as U+FFFD characters.
assert.eq(str(hello), "hello, 世界")
assert.eq(str(hello[:-1]), "hello, 世��")  # incomplete UTF-8 encoding => U+FFFD
assert.eq(str(goodbye), "goodbye")
assert.eq(str(empty), "")
assert.eq(str(nonprinting), "\t\n\x7f\u200d")
assert.eq(str(b"\xED\xB0\x80"), "���") # UTF-8 encoding of unpaired surrogate => U+FFFD x 3

# repr
# repr yields the text of a b"..." literal: well-formed printable text is
# shown as-is, while stray bytes and non-printing characters are escaped.
assert.eq(repr(hello), r'b"hello, 世界"')
assert.eq(repr(hello[:-1]), r'b"hello, 世\xe7\x95"')  # incomplete UTF-8 encoding => \x escapes
assert.eq(repr(goodbye), 'b"goodbye"')
assert.eq(repr(empty), 'b""')
assert.eq(repr(nonprinting), 'b"\\t\\n\\x7f\\u200d"')

# equality
# Equality is by content: a fresh literal equals the 'goodbye' fixture.
assert.eq(hello, hello)
assert.ne(hello, goodbye)
assert.eq(b"goodbye", goodbye)

# ordered comparison
assert.lt(b"abc", b"abd")
# A proper prefix orders before the longer value.
assert.lt(b"abc", b"abcd")
assert.lt(b"\x7f", b"\x80") # bytes compare as uint8, not int8

# bytes are dict-hashable
# NOTE: this binds the global name 'dict', which hides any predeclared
# 'dict' for the remainder of the file.
dict = {hello: 1, goodbye: 2}
# b"goodbye" equals the existing key 'goodbye', so this assignment
# overwrites that entry instead of adding a third one.
dict[b"goodbye"] = 3
assert.eq(len(dict), 2)
assert.eq(dict[goodbye], 3)

# hash(bytes) is 32-bit FNV-1a.
# 0x811c9dc5 is the 32-bit FNV offset basis, i.e. the hash of zero bytes.
assert.eq(hash(b""), 0x811c9dc5)
assert.eq(hash(b"a"), 0xe40c292c)
assert.eq(hash(b"ab"), 0x4d2505ca)
assert.eq(hash(b"abc"), 0x1a47e90b)

# indexing
# Indexing yields a 1-byte bytes value (not an int); a negative index
# counts from the end, and an index past the end is an error.
assert.eq(goodbye[0], b"g")
assert.eq(goodbye[-1], b"e")
assert.fails(lambda: goodbye[100], "out of range")

# slicing
# Slices are bytes values too; the third case uses a stride of 2.
assert.eq(goodbye[:4], b"good")
assert.eq(goodbye[4:], b"bye")
assert.eq(goodbye[::2], b"gobe")
assert.eq(goodbye[3:4], b"d")  # special case: len=1
assert.eq(goodbye[4:4], b"")  # special case: len=0

# bytes in bytes
# Membership is a contiguous-subsequence test: b"bc" occurs in b"abcd"
# but not in b"dcab", even though both bytes are present there.
assert.eq(b"bc" in b"abcd", True)
assert.eq(b"bc" in b"dcab", False)
# A string left operand is rejected rather than converted.
assert.fails(lambda: "bc" in b"dcab", "requires bytes or int as left operand, not string")

# int in bytes
# An int left operand is tested as a single byte value; values outside
# the byte range (256 and -1 here) are errors rather than False.
assert.eq(97 in b"abc", True)  # 97='a'
assert.eq(100 in b"abc", False) # 100='d'
assert.fails(lambda: 256 in b"abc", "int in bytes: 256 out of range")
assert.fails(lambda: -1 in b"abc", "int in bytes: -1 out of range")

# ord   TODO(adonovan): specify
# ord accepts only a bytes value of length exactly 1 and returns that
# byte's numeric value; longer and empty values are errors.
assert.eq(ord(b"a"), 97)
assert.fails(lambda: ord(b"ab"), "ord: bytes has length 2, want 1")
assert.fails(lambda: ord(b""), "ord: bytes has length 0, want 1")

# repeat (bytes * int)
# Repetition works with the operands in either order.
assert.eq(goodbye * 3, b"goodbyegoodbyegoodbye")
assert.eq(3 * goodbye, b"goodbyegoodbyegoodbye")

# elems() returns an iterable value over the numeric values of the bytes
# (ints in the range 0-255), as the list() results below show.
assert.eq(type(hello.elems()), "bytes.elems")
# str() of the iterable shows the repr of the receiver followed by ".elems()".
assert.eq(str(hello.elems()), "b\"hello, 世界\".elems()")
assert.eq(list(hello.elems()), [104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140])
# The same list of ints fed to bytes() reconstructs the original value.
assert.eq(bytes([104, 101, 108, 108, 111, 44, 32, 228, 184, 150, 231, 149, 140]), hello)
assert.eq(list(goodbye.elems()), [103, 111, 111, 100, 98, 121, 101])
assert.eq(list(empty.elems()), [])
assert.eq(bytes(hello.elems()), hello) # bytes(iterable) is dual to bytes.elems()

# x[i] = ...
def f():
    # Try to store into one element of a bytes value. This file tests
    # 'bytes' as immutable byte strings, so the store is expected to fail
    # when f is called.
    immutable = b"abc"
    immutable[1] = b"B"

# Calling f must fail with a message saying that bytes does not support
# (item) assignment; the '.*' parts allow for the exact wording in between.
assert.fails(f, "bytes.*does not support.*assignment")

# TODO(adonovan): the specification is not finalized in many areas:
# - chr, ord functions
# - encoding/decoding bytes to string.
# - methods: find, index, split, etc.
#
# Summary of string operations (put this in spec).
#
# string to number:
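# - bytes[i]  returns numeric value of ith byte.
#   NOTE(review): the indexing tests in this file currently expect a 1-byte
#   bytes value instead (goodbye[0] == b"g") -- confirm which is intended.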
# - ord(string)  returns numeric value of sole code point in string.
# - ord(string[i])  is not a useful operation: fails on non-ASCII; see below.
#   Q. Perhaps ord should return the first (not sole) code point? Then it becomes a UTF-8 decoder.
#      Perhaps ord(string, index=int) should apply the index and relax the len=1 check.
# - string.codepoint()  iterates over 1-codepoint substrings.
# - string.codepoint_ords()  iterates over numeric values of code points in string.
# - string.elems()  iterates over 1-element (UTF-k code) substrings.
# - string.elem_ords()  iterates over numeric UTF-k code values.
# - string.elem_ords()[i]  returns numeric value of ith element (UTF-k code).
# - string.elems()[i]  returns substring of a single element (UTF-k code).
# - int(string)  parses string as decimal (or other) numeric literal.
#
# number to string:
# - chr(int) returns string, UTF-k encoding of Unicode code point (like Python).
#   Redundant with '%c' % int (which Python2 calls 'unichr'.)
# - bytes(chr(int)) returns byte string containing UTF-8 encoding of one code point.
# - bytes([int]) returns 1-byte string (with regrettable list allocation).
# - str(int) - format number as decimal.