1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200
|
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include <iostream>
#include "cstr.h"
#include "mimehandler.h"
#include "log.h"
#include "readfile.h"
#include "transcode.h"
#include "mimeparse.h"
#include "myhtmlparse.h"
#include "indextext.h"
#include "mh_html.h"
#include "smallut.h"
#include "rclutil.h"
#include "md5ut.h"
#include "pathut.h"
#include "rclconfig.h"
using namespace std;
bool MimeHandlerHtml::set_document_file_impl(const string& mt, const string &fn)
{
LOGDEB0("MimeHandlerHtml::set_document_file_impl: " << fn << "\n");
// Check file size against limit. We use the same value as for
// text/plain. xdg-mime sometimes wrongly returns text/html for
// gigantic files (had a case with multi-GB xxx.enex evernote
// export files).
int maxmbs = -1;
m_config->getConfParam("textfilemaxmbs", &maxmbs);
auto totlen = path_filesize(fn);
if (totlen < 0) {
LOGSYSERR("MimeHandlerHtml::set_document_file", "stat", fn);
return false;
}
string otext;
if (maxmbs != -1 && totlen / (1024*1024) > maxmbs) {
LOGINF("MimeHandlerHtml: file too big (textfilemaxmbs=" << maxmbs <<
"), contents will not be indexed: " << fn << "\n");
} else {
string reason;
if (!file_to_string(fn, otext, &reason)) {
LOGERR("textHtmlToDoc: cant read: " << fn << ": " << reason << "\n");
return false;
}
}
m_filename = fn;
return set_document_string(mt, otext);
}
bool MimeHandlerHtml::set_document_string_impl(const string&, const string& htext)
{
m_html = htext;
m_havedoc = true;
if (!m_forPreview) {
// We want to compute the md5 now because we may modify m_html later
string md5, xmd5;
MD5String(htext, md5);
m_metaData[cstr_dj_keymd5] = MD5HexPrint(md5, xmd5);
}
return true;
}
bool MimeHandlerHtml::next_document()
{
if (m_havedoc == false)
return false;
m_havedoc = false;
// If set_doc(fn), take note of file name.
string fn = m_filename;
m_filename.erase();
string charset = m_dfltInputCharset;
LOGDEB("MHHtml::next_doc.: default supposed input charset: [" << charset << "]\n");
// Override default input charset if someone took care to set one:
const auto it = m_metaData.find(cstr_dj_keycharset);
if (it != m_metaData.end() && !it->second.empty()) {
charset = it->second;
LOGDEB("MHHtml: next_doc.: input charset from ext. metadata: [" << charset << "]\n");
}
// - We first try to convert from the supposed charset
// (which may depend of the current directory) to utf-8. If this
// fails, we keep the original text
// - During parsing, if we find a charset parameter, and it differs from
// what we started with, we abort and restart with the parameter value
// instead of the configuration one.
MyHtmlParser result;
for (int pass = 0; pass < 2; pass++) {
string transcoded;
LOGDEB("Html::mkDoc: pass " << pass << "\n");
MyHtmlParser p;
// Try transcoding. If it fails, use original text.
int ecnt;
if (!transcode(m_html, transcoded, charset, cstr_utf8, &ecnt)) {
LOGDEB("textHtmlToDoc: transcode failed from cs '" <<
charset << "' to UTF-8 for[" << (fn.empty()?"unknown":fn) << "]");
transcoded = m_html;
// We don't know the charset, at all
p.reset_charsets();
charset.clear();
} else {
if (ecnt) {
if (pass == 0) {
LOGDEB("textHtmlToDoc: init transcode had " << ecnt <<
" errors for ["<<(fn.empty()?"unknown":fn)<< "]\n");
} else {
LOGERR("textHtmlToDoc: final transcode had " << ecnt <<
" errors for ["<< (fn.empty()?"unknown":fn)<< "]\n");
}
}
// charset has the putative source charset, transcoded is now
// in utf-8
p.set_charsets(charset, cstr_utf8);
}
try {
p.parse_html(transcoded);
// No exception: ok? But throw true to use the same
// code path as if an exception had been thrown by parse_html
throw true;
break;
} catch (bool diag) {
result = p;
if (diag == true) {
// Parser throws true at end of text. ok
if (m_forPreview) {
// Save the html text
m_html = transcoded;
// In many cases, we need to change the charset decl,
// because the file was transcoded. It seems that just
// inserting one is enough (only the 1st one seems to
// be used by browsers/qtextedit).
string::size_type idx = m_html.find("<head>");
if (idx == string::npos)
idx = m_html.find("<HEAD>");
if (idx != string::npos)
m_html.replace(idx+6, 0,
"<meta http-equiv=\"content-type\" "
"content=\"text/html; charset=UTF-8\">");
}
break;
}
LOGDEB("textHtmlToDoc: charset [" << charset << "] doc charset [" <<
result.get_charset() << "]\n");
if (!result.get_charset().empty() &&
!samecharset(result.get_charset(), result.fromcharset)) {
LOGDEB("textHtmlToDoc: reparse for charsets\n");
// Set the origin charset as specified in document before
// transcoding again
charset = result.get_charset();
} else {
LOGERR("textHtmlToDoc:: error: non charset exception\n");
return false;
}
}
}
m_metaData[cstr_dj_keyorigcharset] = result.get_charset();
m_metaData[cstr_dj_keycontent] = result.dump;
m_metaData[cstr_dj_keycharset] = cstr_utf8;
// Avoid setting empty values which would crush ones possibly inherited
// from parent (if we're an attachment)
if (!result.dmtime.empty())
m_metaData[cstr_dj_keymd] = result.dmtime;
m_metaData[cstr_dj_keymt] = cstr_textplain;
for (const auto& entry : result.meta) {
if (!entry.second.empty()) {
m_metaData[entry.first] = entry.second;
}
}
return true;
}
|