1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173
|
/* Copyright (C) 2005 J.F.Dockes
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the
* Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include "autoconfig.h"
#include <stdlib.h>
#include <ctype.h>
#include <cstring>
#include <fstream>
#include <sstream>
#include "idfile.h"
#include "log.h"
#include "smallut.h"
using namespace std;
// treat_mbox_as_rfc822 is assigned once, at static-initialization time, by
// the constructor of the file-local initTM object below. The original author
// notes that this arrangement only exists to silence spurious valgrind
// multithreading warnings, and that the variable is neither documented nor
// set anywhere else.
// Value: 1 if RECOLL_TREAT_MBOX_AS_RFC822 exists in the environment, else -1.
static int treat_mbox_as_rfc822;
class InitTMAR {
public:
    InitTMAR() {
        const bool envIsSet = (getenv("RECOLL_TREAT_MBOX_AS_RFC822") != nullptr);
        if (envIsSet) {
            treat_mbox_as_rfc822 = 1;
        } else {
            treat_mbox_as_rfc822 = -1;
        }
    }
};
static InitTMAR initTM;
/**
* This code is currently ONLY used to identify mbox and mail message files,
* which are badly handled by standard MIME type identifiers.
* There is a very old (circa 1990) mbox format using blocks of ^A (0x01) chars
* to separate messages, which we do not currently recognize.
*/
// Header names counted as evidence that the data is a mail message. Each
// entry carries its trailing ": " so that only complete header names match.
static const char *mailhs[] = {
    "From: ",
    "Received: ",
    "Message-Id: ",
    "To: ",
    "Date: ",
    "Subject: ",
    "Status: ",
    "In-Reply-To: ",
};
// Length of the mailhs entry found at the same index: this is the number of
// characters compared against the start of each input line.
static const int mailhsl[] = {
    6,   // "From: "
    10,  // "Received: "
    12,  // "Message-Id: "
    4,   // "To: "
    6,   // "Date: "
    9,   // "Subject: "
    8,   // "Status: "
    13,  // "In-Reply-To: "
};
// Number of entries in mailhs (and in mailhsl)
static const int nmh = sizeof(mailhs) / sizeof(mailhs[0]);
// Score needed before the data is declared to be mail
const int wantnhead = 3;
/**
 * Look at the first lines of a stream and decide whether they look like the
 * header section of a mail message or of an mbox folder.
 *
 * At most 199 lines are read. Scanning stops at the first line which cannot
 * belong to a mail header section, at the empty line ending the headers, at
 * end of input, or as soon as enough known header names (see mailhs) have
 * been seen. An mbox "From " separator on line 1 counts for one point of the
 * wantnhead score when treat_mbox_as_rfc822 is -1.
 *
 * @param input stream to read from, positioned at the start of the data.
 * @param fn    file name, only used in log messages.
 * @return "text/x-mail" if the score is reached and line 1 was counted as an
 *         mbox "From " line, "message/rfc822" if the score is reached
 *         otherwise, and an empty string if the data is not recognized, a
 *         read error occurs, or an over-long line is met.
 */
static string idFileInternal(istream& input, const char *fn)
{
    // emacs VM sometimes inserts very long lines with continuations or
    // not (for folder information). This forces us to look at many
    // lines and long ones.
    // Nominal line size. The buffer holds kLineMax + 1 bytes, getline() is
    // given kLineMax - 1, and lines longer than kLineMax - 20 are rejected.
    constexpr int kLineMax = 2 * 1024;
    char cline[kLineMax + 1];
    cline[kLineMax] = 0;

    bool line1HasFrom = false;
    bool gotnonempty = false;
    int lookslikemail = 0;

    int lnum = 1;
    for (int loop = 1; loop < 200; loop++, lnum++) {
        input.getline(cline, kLineMax - 1);
        if (input.fail()) {
            if (input.bad()) {
                LOGERR("idfile: error while reading [" << (fn) << "]\n");
                return string();
            }
            if (!input.eof()) {
                // failbit without eofbit: getline() filled the buffer
                // before finding a newline. Treat this like any other
                // over-long line instead of mistaking it for end of input.
                LOGDEB2("idFile: Line too long\n");
                return string();
            }
            // Nothing left to read
            break;
        }

        // gcount() includes the newline when one was consumed. A final
        // line which is not newline-terminated sets eofbit instead, and
        // gcount() is then already the exact line length.
        std::streamsize ll = input.gcount();
        if (!input.eof())
            ll--;
        if (ll > 0)
            gotnonempty = true;
        LOGDEB2("idfile: lnum " << lnum << " ll " << ll << ": [" << cline << "]\n");

        // Check for a few things that can't be found in a mail file,
        // (optimization to get a quick negative)

        // Empty lines
        if (ll <= 0) {
            // Accept empty lines at the beginning of the file,
            // otherwise this is the end of headers.
            // NOTE: lnum is held at 1 until a non-empty line is seen (see
            // the decrement below), so the lnum > 10 test does not fire in
            // practice: leading empty lines are only bounded by the loop
            // limit.
            if (gotnonempty || lnum > 10) {
                LOGDEB2("Got empty line\n");
                break;
            }
            // Don't increment the line counter for initial empty lines.
            lnum--;
            continue;
        }

        // emacs vm can insert VERY long header lines.
        if (ll > kLineMax - 20) {
            LOGDEB2("idFile: Line too long\n");
            return string();
        }

        // Check for mbox 'From ' line
        if (lnum == 1 && !strncmp("From ", cline, 5)) {
            if (treat_mbox_as_rfc822 == -1) {
                line1HasFrom = true;
                LOGDEB2("idfile: line 1 has From_\n");
            }
            continue;
        }

        // Except for a possible first line with 'From ', lines must
        // begin with whitespace or have a colon within the first 70
        // characters (hope no one comes up with a longer header name!).
        // Take care to convert to unsigned char because ms ctype does
        // not like negative values.
        if (!isspace(static_cast<unsigned char>(cline[0]))) {
            const char *colon = strchr(cline, ':');
            if (nullptr == colon || (colon - cline) > 70) {
                LOGDEB2("idfile: can't be mail header line: [" << (cline) << "]\n");
                break;
            }
        }

        // Compare to known headers: case-insensitive match on the start
        // of the line, at most one point per line.
        for (int i = 0; i < nmh; i++) {
            if (!strncasecmp(mailhs[i], cline, mailhsl[i])) {
                lookslikemail++;
                break;
            }
        }
        if (lookslikemail >= wantnhead)
            break;
    }

    // The mbox separator line is worth one known header
    if (line1HasFrom)
        lookslikemail++;

    if (lookslikemail >= wantnhead)
        return line1HasFrom ? string("text/x-mail") : string("message/rfc822");
    return string();
}
/**
 * Identify a file from its first lines.
 *
 * @param fn path of the file to open; also passed on for log messages.
 * @return the value computed by idFileInternal() on the file contents, or an
 *         empty string if the file could not be opened.
 */
string idFile(const char *fn)
{
    ifstream input(fn, ios::in);
    if (input.is_open()) {
        return idFileInternal(input, fn);
    }
    LOGERR("idFile: could not open [" << fn << "]\n");
    return string();
}
/**
 * Identify data held in memory, using the same checks as for a file.
 *
 * @param data the bytes to examine.
 * @return the value computed by idFileInternal() on a read-only stream over
 *         data, with an empty name used for log messages.
 */
string idFileMem(const string& data)
{
    std::istringstream memInput(data);
    return idFileInternal(memInput, "");
}
|