File: unicode.c

package info (click to toggle)
swiftlang 6.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,519,992 kB
  • sloc: cpp: 9,107,863; ansic: 2,040,022; asm: 1,135,751; python: 296,500; objc: 82,456; f90: 60,502; lisp: 34,951; pascal: 19,946; sh: 18,133; perl: 7,482; ml: 4,937; javascript: 4,117; makefile: 3,840; awk: 3,535; xml: 914; fortran: 619; cs: 573; ruby: 573
file content (155 lines) | stat: -rw-r--r-- 4,180 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
/* Copyright (c) 2018, Google Inc.
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
 * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

#include <CNIOBoringSSL_bytestring.h>

#include "internal.h"


// is_valid_code_point returns one if |v| is acceptable as a Unicode code point
// and zero otherwise. Section references are to Unicode 9.0.0.
static int is_valid_code_point(uint32_t v) {
  // The Unicode space runs from zero to 0x10ffff (3.4 D9).
  if (v > 0x10ffff) {
    return 0;
  }

  // Surrogate code points are invalid (3.2 C1).
  const int is_surrogate = v >= 0xd800 && v <= 0xdfff;
  if (is_surrogate) {
    return 0;
  }

  // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved
  // (3.4 D14). Masking with 0xfffe folds the ...fffe/...ffff pair of every
  // plane onto a single comparison.
  const int is_plane_end = (v & 0xfffe) == 0xfffe;
  const int is_reserved_block = v >= 0xfdd0 && v <= 0xfdef;
  if (is_plane_end || is_reserved_block) {
    return 0;
  }

  return 1;
}

// BOTTOM_BITS returns a byte with the bottom |n| bits set. |n| must be in the
// range [0, 8]. The expansion is fully parenthesized so that it behaves as a
// single operand regardless of the operators surrounding the macro call.
#define BOTTOM_BITS(n) ((uint8_t)((1u << (n)) - 1))

// TOP_BITS returns a byte with the top |n| bits set. |n| must be in the range
// [0, 8].
#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n)))

// cbs_get_utf8 reads one UTF-8 encoded code point from |cbs| and stores it in
// |*out|. It returns one on success and zero on failure. Failure covers: no
// input byte available, an unrecognized leading byte, a missing or malformed
// continuation byte, an encoding longer than necessary for the value, and a
// value rejected by |is_valid_code_point|. |*out| is only written on success.
// No attempt is made to rewind |cbs| when decoding fails part-way through.
int cbs_get_utf8(CBS *cbs, uint32_t *out) {
  uint8_t lead;
  if (!CBS_get_u8(cbs, &lead)) {
    return 0;
  }

  // 0xxxxxxx: a single byte that is its own code point.
  if (lead < 0x80) {
    *out = lead;
    return 1;
  }

  // Classify the leading byte by its run of high one-bits to learn how many
  // continuation bytes follow.
  size_t trailing;
  if ((lead & TOP_BITS(3)) == TOP_BITS(2)) {
    trailing = 1;  // 110xxxxx
  } else if ((lead & TOP_BITS(4)) == TOP_BITS(3)) {
    trailing = 2;  // 1110xxxx
  } else if ((lead & TOP_BITS(5)) == TOP_BITS(4)) {
    trailing = 3;  // 11110xxx
  } else {
    return 0;
  }

  // Smallest value that legitimately needs |trailing| continuation bytes;
  // anything below it is an over-long encoding.
  static const uint32_t kMinValue[] = {0, 0x80, 0x800, 0x10000};
  const uint32_t min_value = kMinValue[trailing];

  // The leading byte contributes 5, 4 or 3 payload bits respectively.
  uint32_t code_point = lead & BOTTOM_BITS(6 - trailing);

  for (; trailing > 0; trailing--) {
    uint8_t cont;
    if (!CBS_get_u8(cbs, &cont)) {
      return 0;
    }
    // Continuation bytes must look like 10xxxxxx.
    if ((cont & TOP_BITS(2)) != TOP_BITS(1)) {
      return 0;
    }
    code_point = (code_point << 6) | (cont & BOTTOM_BITS(6));
  }

  if (code_point < min_value || !is_valid_code_point(code_point)) {
    return 0;
  }

  *out = code_point;
  return 1;
}

// cbs_get_latin1 reads a single byte from |cbs| and stores its value,
// unchanged, in |*out|. It returns one on success and zero if the byte could
// not be read, in which case |*out| is left untouched.
int cbs_get_latin1(CBS *cbs, uint32_t *out) {
  uint8_t byte;
  if (CBS_get_u8(cbs, &byte)) {
    *out = byte;
    return 1;
  }
  return 0;
}

// cbs_get_ucs2_be reads one 16-bit unit from |cbs| via |CBS_get_u16| and
// stores it in |*out|. It returns one on success and zero if the read fails or
// the value is rejected by |is_valid_code_point|. |*out| is only written on
// success.
int cbs_get_ucs2_be(CBS *cbs, uint32_t *out) {
  uint16_t unit;
  // Note UCS-2 (used by BMPString) does not support surrogates, so a plain
  // code point validity check is sufficient here.
  if (CBS_get_u16(cbs, &unit) && is_valid_code_point(unit)) {
    *out = unit;
    return 1;
  }
  return 0;
}

// cbs_get_utf32_be reads one 32-bit value from |cbs| via |CBS_get_u32|
// directly into |*out|. It returns one if the read succeeds and the value is
// accepted by |is_valid_code_point|, and zero otherwise. Because |out| is
// handed straight to the reader, |*out| may have been modified even when zero
// is returned.
int cbs_get_utf32_be(CBS *cbs, uint32_t *out) {
  if (!CBS_get_u32(cbs, out)) {
    return 0;
  }
  return is_valid_code_point(*out) ? 1 : 0;
}

// cbb_get_utf8_len returns the number of bytes used to encode |u| in UTF-8:
// one through four. |u| is not validated; every value above 0xffff reports
// four.
size_t cbb_get_utf8_len(uint32_t u) {
  // Start at one byte and add a byte for each size threshold |u| exceeds.
  size_t len = 1;
  if (u > 0x7f) {
    len++;
  }
  if (u > 0x7ff) {
    len++;
  }
  if (u > 0xffff) {
    len++;
  }
  return len;
}

// cbb_add_utf8 appends the UTF-8 encoding of |u| to |cbb|, one byte at a time
// via |CBB_add_u8|. It returns zero if |u| is rejected by
// |is_valid_code_point| or if any append fails; appending stops at the first
// failure.
int cbb_add_utf8(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }

  // One-byte form: the code point is its own encoding.
  if (u <= 0x7f) {
    return CBB_add_u8(cbb, (uint8_t)u);
  }

  // Pick the total length and build the leading byte, which carries the
  // length marker in its top bits plus the most significant payload bits.
  uint8_t encoded[4];
  size_t count;
  if (u <= 0x7ff) {
    count = 2;
    encoded[0] = (uint8_t)(TOP_BITS(2) | (u >> 6));
  } else if (u <= 0xffff) {
    count = 3;
    encoded[0] = (uint8_t)(TOP_BITS(3) | (u >> 12));
  } else if (u <= 0x10ffff) {
    count = 4;
    encoded[0] = (uint8_t)(TOP_BITS(4) | (u >> 18));
  } else {
    return 0;
  }

  // Fill the continuation bytes (10xxxxxx) from last to first, peeling six
  // payload bits off the low end each time.
  uint32_t remaining = u;
  for (size_t i = count - 1; i > 0; i--) {
    encoded[i] = (uint8_t)(TOP_BITS(1) | (remaining & BOTTOM_BITS(6)));
    remaining >>= 6;
  }

  for (size_t i = 0; i < count; i++) {
    if (!CBB_add_u8(cbb, encoded[i])) {
      return 0;
    }
  }
  return 1;
}

// cbb_add_latin1 appends |u| to |cbb| as a single byte. It returns one on
// success and zero if |u| does not fit in one byte (greater than 0xff) or the
// append fails.
int cbb_add_latin1(CBB *cbb, uint32_t u) {
  if (u > 0xff) {
    return 0;
  }
  return CBB_add_u8(cbb, (uint8_t)u) ? 1 : 0;
}

// cbb_add_ucs2_be appends |u| to |cbb| as one 16-bit unit via |CBB_add_u16|.
// It returns one on success and zero if |u| exceeds 0xffff, is rejected by
// |is_valid_code_point|, or the append fails.
int cbb_add_ucs2_be(CBB *cbb, uint32_t u) {
  if (u > 0xffff || !is_valid_code_point(u)) {
    return 0;
  }
  return CBB_add_u16(cbb, (uint16_t)u) ? 1 : 0;
}

// cbb_add_utf32_be appends |u| to |cbb| as one 32-bit value via
// |CBB_add_u32|. It returns one on success and zero if |u| is rejected by
// |is_valid_code_point| or the append fails.
int cbb_add_utf32_be(CBB *cbb, uint32_t u) {
  if (!is_valid_code_point(u)) {
    return 0;
  }
  return CBB_add_u32(cbb, u) ? 1 : 0;
}