File: utf8.h

package info (click to toggle)
r-cran-sourcetools 0.1.7-1-1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 308 kB
  • sloc: cpp: 1,985; ansic: 505; sh: 10; makefile: 2
file content (115 lines) | stat: -rw-r--r-- 1,852 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#ifndef SOURCETOOLS_UTF8_UTF8_H
#define SOURCETOOLS_UTF8_UTF8_H

#include <cstddef>

#include <sourcetools/core/core.h>

namespace sourcetools {
namespace utf8 {

namespace detail {

// Masks selecting the payload bits of the lead byte of a UTF-8 sequence,
// indexed by the sequence length in bytes (1 through 6). Slot 0 is only a
// placeholder so that the length can be used as the index directly.
static const unsigned char mask[] = {
  0,    // 00000000 (unused)
  0x7F, // 01111111
  0x1F, // 00011111
  0x0F, // 00001111
  0x07, // 00000111
  0x03, // 00000011
  0x01  // 00000001
};

} // namespace detail

// Forward iterator over the code points of a NUL-terminated, UTF-8 encoded
// byte string.
//
// The terminating NUL acts as the end of the sequence: dereferencing there
// yields -1 and incrementing there is a no-op. The iterator never reads or
// moves beyond the terminator, even when the final multi-byte sequence is
// truncated.
//
// Apart from that truncation check, continuation bytes are not validated;
// a lead byte simply consumes as many following bytes as it announces.
//
// Copy construction and copy assignment use the compiler-generated,
// member-wise versions.
class iterator
{
public:

  // Begin iterating at 'data', which must point to a NUL-terminated string
  // that outlives the iterator (the bytes are not copied).
  iterator(const char* data)
    : data_(reinterpret_cast<const unsigned char*>(data)),
      offset_(0)
  {
  }

  // Decode the code point at the current position.
  //
  // Returns -1 (converted to wchar_t) at the end of the string, and also
  // when the sequence is cut short by the terminating NUL. The decoded value
  // is narrowed to wchar_t, so it is truncated if wchar_t cannot hold it.
  wchar_t operator*() const
  {
    const std::size_t n = declared();
    if (n == 0 || n > 6)
      return static_cast<wchar_t>(-1);

    const unsigned char* it = data_ + offset_;
    wchar_t ch = static_cast<wchar_t>(it[0] & detail::mask[n]);
    for (std::size_t i = 1; i < n; ++i)
    {
      const unsigned char byte = it[i];

      // A NUL here means the sequence is truncated; stop so that we never
      // read past the end of the underlying buffer.
      if (byte == 0)
        return static_cast<wchar_t>(-1);

      ch = static_cast<wchar_t>((ch << 6) | (byte & 0x3F));
    }

    return ch;
  }

  // Advance to the next code point. Does nothing at the end of the string.
  iterator& operator++()
  {
    offset_ += size();
    return *this;
  }

  // Post-increment: advance, returning the iterator's previous state.
  iterator operator++(int)
  {
    iterator copy(*this);
    operator++();
    return copy;
  }

  // Two iterators are equal when they refer to the same byte address.
  bool operator==(const iterator& it) const
  {
    return
      data_ + offset_ ==
      it.data_ + it.offset_;
  }

  bool operator!=(const iterator& it) const
  {
    return !(*this == it);
  }

private:

  // Length in bytes that the lead byte at the current position announces,
  // or 0 when positioned on the terminating NUL. Bytes that cannot start a
  // multi-byte sequence (stray continuation bytes, 0xFE, 0xFF) count as
  // one-byte units so that iteration always makes progress.
  std::size_t declared() const
  {
    const unsigned char ch = data_[offset_];
    if (ch == 0)
      return 0;
    else if (ch < 192)
      return 1;
    else if (ch < 224)
      return 2;
    else if (ch < 240)
      return 3;
    else if (ch < 248)
      return 4;
    else if (ch < 252)
      return 5;
    else if (ch < 254)
      return 6;

    return 1;
  }

  // Number of bytes occupied by the unit at the current position: the
  // announced length, clamped so that it never extends past the
  // terminating NUL.
  std::size_t size() const
  {
    const std::size_t wanted = declared();
    const unsigned char* it = data_ + offset_;

    std::size_t available = 0;
    while (available < wanted && it[available] != 0)
      ++available;

    return available;
  }

private:

  const unsigned char* data_;
  std::size_t offset_;
};

} // namespace utf8
} // namespace sourcetools

#endif /* SOURCETOOLS_UTF8_UTF8_H */