File: String%2BEndianAdaptorSequence.swift

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (312 lines) | stat: -rw-r--r-- 12,106 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

/// Byte order of a multi-byte text encoding.
enum Endianness {
    case little
    case big

    /// Derives the byte order implied by a UTF-16 / UTF-32 string encoding.
    ///
    /// Produces `nil` for the unqualified `.utf16` and `.utf32` encodings,
    /// which do not name a byte order. Any encoding other than the six
    /// UTF-16 / UTF-32 variants handled here traps.
    init?(_ ns: String.Encoding) {
        if ns == .utf16 || ns == .utf32 {
            return nil
        }

        if ns == .utf16LittleEndian || ns == .utf32LittleEndian {
            self = .little
        } else if ns == .utf16BigEndian || ns == .utf32BigEndian {
            self = .big
        } else {
            fatalError("Unexpected encoding")
        }
    }

    /// The byte order of the platform this code was compiled for.
    static var host: Endianness {
#if _endian(big)
        return .big
#else
        return .little
#endif
    }
}

/// Converts a sequence of UInt8 containing big-endian or little-endian UInt16 elements into UInt16 values.
/// If the bytes contain a BOM and the endianness on initialization is `nil` then it will honor the BOM to pick the byte order, and the BOM itself is not emitted.
/// A trailing byte that does not complete a two-byte unit is dropped.
struct UTF16EndianAdaptor<S : Sequence> : Sequence where S.Element == UInt8 {
    typealias Element = UInt16

    /// The raw bytes to decode.
    let underlying: S
    /// The byte order requested by the caller, or `nil` to detect it from a leading BOM.
    let endianness: Endianness?

    init(_ sequence: S, endianness: Endianness?) {
        underlying = sequence
        self.endianness = endianness
    }

    func makeIterator() -> Iterator {
        Iterator(underlying, endianness: endianness)
    }

    struct Iterator : IteratorProtocol {
        var i: S.Iterator
        /// Starts out as the caller-supplied byte order; may be filled in by BOM detection on the first call to `next()`.
        var endianness: Endianness?
        /// Becomes `true` once the first code unit has been examined for a BOM.
        var bomCheck = false

        init(_ sequence: S, endianness: Endianness?) {
            i = sequence.makeIterator()
            self.endianness = endianness
        }

        /// Combines two consecutive input bytes into one code unit according to `endianness`.
        ///
        /// The value is assembled with shifts, so the result does not depend on the byte order of the host.
        /// (Passing an arithmetically-assembled value through `UInt16(littleEndian:)` / `UInt16(bigEndian:)`
        /// would reverse the bytes on big-endian hosts.)
        ///
        /// - Parameters:
        ///   - b1: The byte that came first in the input.
        ///   - b2: The byte that came second in the input.
        /// - Returns: The decoded 16-bit code unit.
        func swap(_ b1: UInt8, _ b2: UInt8) -> UInt16 {
            switch endianness {
            case .little:
                // First byte is the least significant one.
                return UInt16(b1) | (UInt16(b2) << 8)
            case .none, .big:
                // Historically speaking, Foundation treats an unspecified encoding on decoding (plain .utf16) + no BOM as assuming the input is big endian.
                return (UInt16(b1) << 8) | UInt16(b2)
            }
        }

        /// Returns the next code unit, or `nil` when fewer than two bytes remain.
        mutating func next() -> UInt16? {
            // The BOM is only looked for once, on the very first code unit.
            let isFirstUnit = !bomCheck
            bomCheck = true

            guard let b1 = i.next(), let b2 = i.next() else { return nil }

            // If the encoding was specified, any BOM (matching or not) is left in place
            // and passed on to String to deal with.
            guard isFirstUnit, endianness == nil else {
                return swap(b1, b2)
            }

            // The encoding was unspecified (`.utf16`): detect the BOM, adopt its byte order, and remove it.
            switch (b1, b2) {
            case (0xFF, 0xFE):
                endianness = .little
            case (0xFE, 0xFF):
                endianness = .big
            default:
                // Not a BOM; just return the UInt16 and let String sort it out.
                return swap(b1, b2)
            }

            // The BOM was skipped, so hand back the code unit that follows it.
            guard let next1 = i.next(), let next2 = i.next() else { return nil }
            return swap(next1, next2)
        }
    }
}

/// Converts a sequence of UInt8 containing big-endian or little-endian UInt32 elements into UInt32 values.
/// If the bytes contain a BOM and the endianness on initialization is `nil` then it will honor the BOM to pick the byte order, and the BOM itself is not emitted.
/// Trailing bytes that do not complete a four-byte unit are dropped.
struct UTF32EndianAdaptor<S : Sequence> : Sequence where S.Element == UInt8 {
    typealias Element = UInt32

    /// The raw bytes to decode.
    let underlying: S
    /// The byte order requested by the caller, or `nil` to detect it from a leading BOM.
    let endianness: Endianness?

    init(_ sequence: S, endianness: Endianness?) {
        underlying = sequence
        self.endianness = endianness
    }

    func makeIterator() -> Iterator {
        Iterator(underlying, endianness: endianness)
    }

    struct Iterator : IteratorProtocol {
        var i: S.Iterator
        /// Starts out as the caller-supplied byte order; may be filled in by BOM detection on the first call to `next()`.
        var endianness: Endianness?
        /// Becomes `true` once the first code unit has been examined for a BOM.
        var bomCheck = false

        init(_ sequence: S, endianness: Endianness?) {
            i = sequence.makeIterator()
            self.endianness = endianness
        }

        /// Combines four consecutive input bytes into one code unit according to `endianness`.
        ///
        /// The value is assembled with shifts, so the result does not depend on the byte order of the host.
        /// (Passing an arithmetically-assembled value through `UInt32(littleEndian:)` / `UInt32(bigEndian:)`
        /// would reverse the bytes on big-endian hosts.)
        ///
        /// - Parameters:
        ///   - b1: The byte that came first in the input.
        ///   - b2: The byte that came second in the input.
        ///   - b3: The byte that came third in the input.
        ///   - b4: The byte that came fourth in the input.
        /// - Returns: The decoded 32-bit code unit.
        func swap(_ b1: UInt8, _ b2: UInt8, _ b3: UInt8, _ b4: UInt8) -> UInt32 {
            switch endianness {
            case .little:
                // First byte is the least significant one.
                return UInt32(b1) | (UInt32(b2) << 8) | (UInt32(b3) << 16) | (UInt32(b4) << 24)
            case .none, .big:
                // We use big endianness if none has been specified and no BOM was detected.
                return (UInt32(b1) << 24) | (UInt32(b2) << 16) | (UInt32(b3) << 8) | UInt32(b4)
            }
        }

        /// Returns the next code unit, or `nil` when fewer than four bytes remain.
        mutating func next() -> UInt32? {
            // The BOM is only looked for once, on the very first code unit.
            let isFirstUnit = !bomCheck
            bomCheck = true

            guard let b1 = i.next(),
                  let b2 = i.next(),
                  let b3 = i.next(),
                  let b4 = i.next() else { return nil }

            // If the encoding was specified, any BOM (matching or not) is left in place
            // and passed on to String to deal with.
            guard isFirstUnit, endianness == nil else {
                return swap(b1, b2, b3, b4)
            }

            // The encoding was unspecified (`.utf32`): detect the BOM, adopt its byte order, and remove it.
            switch (b1, b2, b3, b4) {
            case (0xFF, 0xFE, 0x00, 0x00):
                endianness = .little
            case (0x00, 0x00, 0xFE, 0xFF):
                endianness = .big
            default:
                // Not a BOM; just return the UInt32 and let String sort it out.
                return swap(b1, b2, b3, b4)
            }

            // The BOM was skipped, so hand back the code unit that follows it.
            guard let next1 = i.next(),
                  let next2 = i.next(),
                  let next3 = i.next(),
                  let next4 = i.next() else { return nil }
            return swap(next1, next2, next3, next4)
        }
    }
}

/// Serializes the scalars of a `String.UnicodeScalarView` as a stream of bytes,
/// four bytes per scalar, laid out in the requested byte order.
struct UnicodeScalarToDataAdaptor : Sequence {
    typealias Element = UInt8
    typealias S = String.UnicodeScalarView

    /// The scalars to serialize.
    let underlying: S
    /// The byte order used for each emitted 32-bit value.
    let endianness: Endianness

    init(_ sequence: S, endianness: Endianness) {
        underlying = sequence
        self.endianness = endianness
    }

    func makeIterator() -> Iterator {
        Iterator(i: underlying.makeIterator(), endianness: endianness)
    }

    struct Iterator : IteratorProtocol {
        /// The scalar currently being emitted, already converted to the target byte order.
        var u32: UInt32
        /// Index (0...3) of the next byte of `u32` to emit; 0 means a new scalar must be fetched first.
        var nextByte = 0
        var i: S.Iterator
        var endianness: Endianness
        /// Set once the scalar iterator has been exhausted.
        var done: Bool

        init(i: S.Iterator, endianness: Endianness) {
            u32 = 0
            done = false
            self.i = i
            self.endianness = endianness
        }

        /// Returns the next serialized byte, or `nil` once every scalar has been fully emitted.
        mutating func next() -> Element? {
            if done { return nil }

            if nextByte == 0 {
                // Nothing buffered: pull the next scalar and store it in the target byte order.
                guard let scalar = i.next() else {
                    done = true
                    return nil
                }

                switch endianness {
                case .little:
                    u32 = scalar.value.littleEndian
                case .big:
                    u32 = scalar.value.bigEndian
                }
            }

            // Emit the pending byte straight from the in-memory representation of the buffered value.
            let index = nextByte
            let byte = withUnsafeBytes(of: &u32) { raw in
                raw[index]
            }

            // Advance, wrapping back to 0 after the fourth byte so the next call refills the buffer.
            nextByte = index == 3 ? 0 : index + 1
            return byte
        }
    }
}