File: HTMLEncodingResolver.h

package info (click to toggle)
pageedit 2.4.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 10,956 kB
  • sloc: ansic: 31,806; cpp: 15,036; python: 1,141; javascript: 87; sh: 13; makefile: 7
file content (58 lines) | stat: -rw-r--r-- 2,034 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
/************************************************************************
**
**  Copyright (C) 2016-2024 Kevin B. Hendricks, Stratford Ontario Canada
**  Copyright (C) 2009-2011 Strahinja Markovic  <strahinja.markovic@gmail.com>
**
**  This file is part of PageEdit.
**
**  PageEdit is free software: you can redistribute it and/or modify
**  it under the terms of the GNU General Public License as published by
**  the Free Software Foundation, either version 3 of the License, or
**  (at your option) any later version.
**
**  PageEdit is distributed in the hope that it will be useful,
**  but WITHOUT ANY WARRANTY; without even the implied warranty of
**  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
**  GNU General Public License for more details.
**
**  You should have received a copy of the GNU General Public License
**  along with PageEdit.  If not, see <http://www.gnu.org/licenses/>.
**
*************************************************************************/

#pragma once
#ifndef HTMLEncodingResolver_H
#define HTMLEncodingResolver_H

#include <QStringDecoder>

class QString;

// Helper for loading HTML files as Unicode text.
// The class declares only static member functions and no data members.
class HTMLEncodingResolver
{
public:
    // Takes the full path of an HTML file, reads it,
    // works out which encoding it is stored in, and
    // hands back its contents converted to Unicode.
    static QString ReadHTMLFile(const QString &fullfilepath);

private:
    // Examines a raw HTML byte stream and chooses a decoder for it.
    // When no encoding can be detected, the result falls back to
    // the default for the current locale.
    // Rationale for rolling our own detection: the HTML encoding
    // detection offered by Qt (QTextCodec::codecForHtml()) was
    // judged to leave a lot to be desired.
    // NOTE(review): the QTextCodec reference looks like it predates the
    // move to QStringDecoder -- confirm whether it is still relevant.
    static QStringDecoder GetDecoderForHTML(const QByteArray &raw_text);

    // Walks over the complete byte array checking whether it forms
    // a valid UTF-8 sequence. A "true" answer means the data is
    // probably (not certainly) UTF-8 encoded.
    static bool IsValidUtf8(const QByteArray &string);

    // NOTE(review): undocumented in this header. Judging by the name,
    // this presumably adjusts a code page / encoding name to one that
    // the decoder lookup understands -- confirm against the .cpp file.
    static QByteArray FixupCodePageMapping(const QByteArray &ba);
};


#endif // HTMLEncodingResolver_H