File: Utf16StringReader.cs

package info (click to toggle)
mono 6.8.0.105%2Bdfsg-3.3
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,284,512 kB
  • sloc: cs: 11,172,132; xml: 2,850,069; ansic: 671,653; cpp: 122,091; perl: 59,366; javascript: 30,841; asm: 22,168; makefile: 20,093; sh: 15,020; python: 4,827; pascal: 925; sql: 859; sed: 16; php: 1
file content (126 lines) | stat: -rw-r--r-- 5,553 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
//------------------------------------------------------------------------------
// <copyright file="Utf16StringReader.cs" company="Microsoft">
//     Copyright (c) Microsoft Corporation.  All rights reserved.
// </copyright>
//------------------------------------------------------------------------------

namespace System.Web.Security.AntiXss {
    using System;
    using System.Collections.Generic;
    using System.Diagnostics;

    /// <summary>
    /// Forward-only cursor that decodes a UTF-16 string one Unicode scalar value at a time.
    /// </summary>
    /// <remarks>
    /// This is intentionally a mutable struct for performance. Copying an instance also copies
    /// its read position, so be careful when capturing it or passing it by value.
    /// </remarks>
    internal struct Utf16StringReader {

        /// <summary>
        /// First code unit of the UTF-16 leading (high) surrogate range.
        /// </summary>
        private const char LeadingSurrogateStart = '\uD800';

        /// <summary>
        /// First code unit of the UTF-16 trailing (low) surrogate range.
        /// </summary>
        private const char TrailingSurrogateStart = '\uDC00';

        /// <summary>
        /// Code point of the Unicode replacement character, U+FFFD.
        /// </summary>
        /// <remarks>
        /// See http://www.unicode.org/charts/PDF/UFFF0.pdf for details.
        /// </remarks>
        private const int UnicodeReplacementCharacterCodePoint = '\uFFFD';

        /// <summary>
        /// Index into '_input' of the next code unit that has not been consumed yet.
        /// </summary>
        private int _currentOffset;

        /// <summary>
        /// The string being decoded.
        /// </summary>
        private readonly string _input;

        /// <summary>
        /// Creates a reader positioned at the start of the given UTF-16 string.
        /// </summary>
        /// <param name="input">The string to break up into scalar values. Must not be null
        /// (checked with a debug assertion only).</param>
        public Utf16StringReader(string input) {
            Debug.Assert(input != null);

            _currentOffset = 0;
            _input = input;
        }

        /// <summary>
        /// Combines a surrogate pair into a single scalar value. Comparable to Char.ConvertToUtf32,
        /// but without argument validation so that it stays cheap inside tight loops.
        /// </summary>
        /// <param name="leadingSurrogate">The UTF-16 leading (high) surrogate.</param>
        /// <param name="trailingSurrogate">The UTF-16 trailing (low) surrogate.</param>
        /// <returns>The scalar value encoded by the pair.</returns>
        /// <remarks>Callers are responsible for passing genuine surrogates; the result is
        /// undefined otherwise (only debug assertions guard the inputs).</remarks>
        private static int ConvertToUtf32(char leadingSurrogate, char trailingSurrogate) {
            Debug.Assert(Char.IsHighSurrogate(leadingSurrogate), "'leadingSurrogate' was not a high surrogate.");
            Debug.Assert(Char.IsLowSurrogate(trailingSurrogate), "'trailingSurrogate' was not a low surrogate.");

            // Each surrogate contributes its offset within its own range; the leading one
            // is scaled by 0x400 and the result is rebased above the BMP (0x10000).
            int leadingPart = leadingSurrogate - LeadingSurrogateStart;
            int trailingPart = trailingSurrogate - TrailingSurrogateStart;
            return leadingPart * 0x400 + trailingPart + 0x10000;
        }

        /// <summary>
        /// Tells whether a code point is a Unicode scalar value.
        /// </summary>
        /// <param name="codePoint">The code point to test.</param>
        /// <returns>True when the code point lies in U+0000..U+D7FF or U+E000..U+10FFFF;
        /// false otherwise.</returns>
        private static bool IsValidUnicodeScalarValue(int codePoint) {
            // Scalar values are all code points up to U+10FFFF except the surrogate
            // range U+D800..U+DFFF.
            // See: http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf, D76
            if (codePoint < 0 || codePoint > 0x10FFFF) {
                return false;
            }

            return codePoint < 0xD800 || codePoint > 0xDFFF;
        }

        /// <summary>
        /// Decodes and consumes the next scalar value of the input string.
        /// </summary>
        /// <returns>The decoded scalar value; U+FFFD (the Unicode replacement character) when
        /// the current position holds an unpaired surrogate; or -1 once the whole string has
        /// been consumed.</returns>
        public int ReadNextScalarValue() {
            string text = _input;
            int start = _currentOffset;

            if (start >= text.Length) {
                return -1; // EOF
            }

            // Consume the first code unit unconditionally.
            char first = text[start];
            _currentOffset = start + 1;
            int scalar = first;

            // A high (leading) surrogate only forms a scalar value when a low (trailing)
            // surrogate immediately follows it; in that case both code units are consumed.
            if (Char.IsHighSurrogate(first) && _currentOffset < text.Length) {
                char second = text[_currentOffset];
                if (Char.IsLowSurrogate(second)) {
                    _currentOffset = start + 2;
                    scalar = ConvertToUtf32(first, second);
                }
            }

            // Anything still invalid at this point is an unmatched high or low surrogate.
            // Per the Unicode specification (Ch. 3, C10), it is replaced with U+FFFD.
            return IsValidUnicodeScalarValue(scalar)
                ? scalar
                : UnicodeReplacementCharacterCodePoint;
        }

    }
}