File: utf8iter.cpp

package info (click to toggle)
recoll 1.43.7-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 16,512 kB
  • sloc: cpp: 104,170; python: 9,500; xml: 7,248; ansic: 6,447; sh: 1,212; perl: 130; makefile: 72
file content (123 lines) | stat: -rw-r--r-- 3,453 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
/* Copyright (C) 2017-2020 J.F.Dockes
 *   This program is free software; you can redistribute it and/or modify
 *   it under the terms of the GNU Lesser General Public License as published by
 *   the Free Software Foundation; either version 2.1 of the License, or
 *   (at your option) any later version.
 *
 *   This program is distributed in the hope that it will be useful,
 *   but WITHOUT ANY WARRANTY; without even the implied warranty of
 *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *   GNU Lesser General Public License for more details.
 *
 *   You should have received a copy of the GNU Lesser General Public License
 *   along with this program; if not, write to the
 *   Free Software Foundation, Inc.,
 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 */

#include "utf8iter.h"

#include <unordered_set>

/**
 * Shorten a UTF-8 string in place, cutting only on character boundaries.
 *
 * Nothing happens when s.size() (a byte count) does not exceed maxlen.
 * Otherwise the string is scanned once, character by character, and every
 * character that *starts* before byte offset maxlen is kept whole. The kept
 * part can therefore run past maxlen by the trailing bytes of one character.
 *
 * @param s        String to shorten; modified in place.
 * @param maxlen   Size limit, compared against byte sizes and byte offsets.
 *                 A negative value converts to a huge unsigned limit, so the
 *                 string is left untouched.
 * @param flags    Bitwise OR of UTF8T_ATWORD and/or UTF8T_ELLIPSIS.
 * @param ellipsis Appended after truncation when UTF8T_ELLIPSIS is set. Its
 *                 length in characters (utf8len) is subtracted from maxlen
 *                 beforehand, clamped at 0.
 * @param ws       Characters treated as whitespace when UTF8T_ATWORD is set.
 *
 * With UTF8T_ATWORD the string is cut at the last whitespace character found
 * within the limit and the whitespace run ending there is stripped. If no
 * whitespace character lies within the limit, all of the text is removed
 * (only the ellipsis, if requested, remains).
 *
 * NOTE(review): maxlen is used as a byte limit everywhere else, but the
 * ellipsis is accounted for in characters, so a multi-byte ellipsis reserves
 * fewer bytes than it occupies - confirm the intended unit.
 */
void utf8truncate(std::string& s, int maxlen, int flags, const std::string& ellipsis,
                  const std::string& ws)
{
    if (s.size() <= std::string::size_type(maxlen)) {
        return;
    }

    const bool atword = (flags & UTF8T_ATWORD) != 0;
    const bool addellipsis = (flags & UTF8T_ELLIPSIS) != 0;

    // Set of whitespace code points, only needed for word-boundary cutting.
    std::unordered_set<int> wss;
    if (atword) {
        Utf8Iter iter(ws);
        for (; !iter.eof(); iter++) {
            // The iterator is not assumed to step over a malformed sequence
            // (utf8check has to call retryfurther() to do so): stop rather
            // than risk looping forever.
            if (iter.error()) {
                break;
            }
            unsigned int c = *iter;
            wss.insert(c);
        }
    }

    if (addellipsis) {
        // Reserve room for the ellipsis (counted in characters, see NOTE above).
        size_t ellen = utf8len(ellipsis);
        maxlen = std::max(0, maxlen - int(ellen));
    }
    const std::string::size_type limit = std::string::size_type(maxlen);

    // Byte offset just past the last character accepted by the size limit.
    std::string::size_type pos = 0;
    // Byte offset just past the last non-whitespace character seen so far.
    std::string::size_type wordend = 0;
    // Value of wordend when the most recent whitespace character was seen.
    // This is where the text ends once it has been cut at that whitespace
    // and the whitespace run in front of the cut has been stripped, so no
    // second pass over the string is needed.
    std::string::size_type wordcut = 0;

    Utf8Iter iter(s);
    for (; !iter.eof(); iter++) {
        // Stop at the size limit, or at a malformed sequence (same reasoning
        // as for the whitespace list above).
        if (iter.error() || iter.getBpos() >= limit) {
            break;
        }
        unsigned int c = *iter;
        pos = iter.getBpos() + iter.getBlen();
        if (atword) {
            if (wss.find(c) != wss.end()) {
                wordcut = wordend;
            } else {
                wordend = pos;
            }
        }
    }

    s.erase(atword ? wordcut : pos);

    if (addellipsis) {
        s += ellipsis;
    }
}

size_t utf8len(const std::string& s)
{
    size_t len = 0;
    Utf8Iter iter(s);
    while (iter++ != std::string::npos) {
        len++;
    }
    return len;
}

// UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER, substituted for undecodable
// input by utf8check().
static const std::string replchar("\xef\xbf\xbd");

/**
 * Check that a string is well-formed UTF-8, optionally writing a repaired copy.
 *
 * @param in      Text to check.
 * @param fixit   When false, the function only validates and returns -1 at the
 *                first error. When true, a repaired copy is appended to *out:
 *                good characters are copied and each position that fails to
 *                decode is replaced with U+FFFD.
 * @param out     Destination for the repaired text. It is appended to, not
 *                cleared, and must be non-null when fixit is true. It is not
 *                touched when fixit is false.
 * @param maxrepl Upper bound on the total number of replacements.
 * @return The number of replacement characters written (0 for clean input),
 *         or -1 if an error was found with fixit false, or if the iterator is
 *         still in error once maxrepl replacements have been made. In the
 *         latter case *out holds the partial output produced so far.
 */
int utf8check(const std::string& in, bool fixit, std::string *out, int maxrepl)
{
    int nrepl = 0;
    Utf8Iter it(in);
    while (!it.eof()) {
        if (it.error()) {
            if (!fixit) {
                return -1;
            }
            // Replace the bad position, then keep retrying further along
            // until a decodable character or the end of input is reached,
            // or the replacement budget runs out.
            out->append(replchar);
            ++nrepl;
            while (nrepl < maxrepl) {
                it.retryfurther();
                if (it.eof()) {
                    return nrepl;
                }
                if (!it.error()) {
                    break;
                }
                out->append(replchar);
                ++nrepl;
            }
            if (it.error()) {
                // Budget exhausted while still on undecodable data.
                return -1;
            }
        }
        // Here the iterator is on a good character and not at eof.
        if (fixit) {
            it.appendchartostring(*out);
        }
        it++;
    }
    return nrepl;
}