File: encode_rfc2047.c

package info (click to toggle)
nmh 1.6-2
links: PTS
area: main
in suites: jessie, jessie-kfreebsd
size: 6,204 kB
ctags: 3,851
sloc: ansic: 48,922; sh: 16,422; makefile: 559; perl: 509; lex: 402; awk: 74
file content (865 lines) | stat: -rw-r--r-- 21,670 bytes
parent folder | download | duplicates (2)
/*
 * Routines to encode message headers using RFC 2047-encoding.
 *
 * This code is Copyright (c) 2002, by the authors of nmh.  See the
 * COPYRIGHT file in the root directory of the nmh distribution for
 * complete copyright information.
 */

#include <h/mh.h>
#include <h/mhparse.h>
#include <h/addrsbr.h>
#include <h/utils.h>

/*
 * List of headers that contain addresses and as a result require special
 * handling
 */

static char *address_headers[] = {
    "To",
    "From",
    "cc",
    "Bcc",
    "Reply-To",
    "Sender",
    "Resent-To",
    "Resent-From",
    "Resent-cc",
    "Resent-Bcc",
    "Resent-Reply-To",
    "Resent-Sender",
    NULL,
};

/*
 * Macros we use for parsing headers
 */

#define is_fws(c) (c == '\t' || c == ' ' || c == '\n')

#define qphrasevalid(c) ((c >= '0' && c <= '9') || (c >= 'A' && c <= 'Z') || \
			 (c >= 'a' && c <= 'z') || \
			 c == '!' || c == '*' || c == '+' || c == '-' || \
			 c == '/' || c == '=' || c == '_')
#define qpspecial(c) (c < ' ' || c == '=' || c == '?' || c == '_')

#define base64len(n) ((((n) + 2) / 3 ) * 4)	/* String len to base64 len */
#define strbase64(n) ((n) / 4 * 3)		/* Chars that fit in base64 */

#define ENCODELINELIMIT	76

static void unfold_header(char **, int);
static int field_encode_address(const char *, char **, int, const char *);
static int field_encode_quoted(const char *, char **, const char *, int,
			       int, int);
static int field_encode_base64(const char *, char **, const char *);
static int scanstring(const char *, int *, int *, int *);
static int utf8len(const char *);
static int pref_encoding(int, int, int);

/*
 * Encode a message header using RFC 2047 encoding.  We make the assumption
 * that all characters < 128 are ASCII and as a consequence don't need any
 * encoding.
 */

int
encode_rfc2047(const char *name, char **value, int encoding,
	       const char *charset)
{
    int i, asciicount = 0, eightbitcount = 0, qpspecialcount = 0;
    char *p;

    /*
     * First, check to see if we even need to encode the header
     */

    for (p = *value; *p != '\0'; p++) {
	if (isascii((unsigned char) *p)) {
	    asciicount++;
	    if (qpspecial((unsigned char) *p))
	    	qpspecialcount++;
	} else
	    eightbitcount++;
    }

    if (eightbitcount == 0)
    	return 0;

    /*
     * Some rules from RFC 2047:
     *
     * - Encoded words cannot be more than 75 characters long
     * - Multiple "long" encoded words must be on new lines.
     *
     * Also, we're not permitted to encode email addresses, so
     * we need to actually _parse_ email addresses and only encode
     * the right bits.  
     */

    /*
     * If charset was NULL, then get the value from the locale.  But
     * we reject it if it returns US-ASCII
     */

    if (charset == NULL)
    	charset = write_charset_8bit();

    if (strcasecmp(charset, "US-ASCII") == 0) {
    	advise(NULL, "Cannot use US-ASCII with 8 bit characters in header");
	return 1;
    }

    /*
     * If we have an address header, then we need to parse the addresses
     * and only encode the names or comments.  Otherwise, handle it normally.
     */

    for (i = 0; address_headers[i]; i++) {
    	if (strcasecmp(name, address_headers[i]) == 0)
	    return field_encode_address(name, value, encoding, charset);
    }

    /*
     * On the encoding we choose, and the specifics of encoding:
     *
     * - If a specified encoding is passed in, we use that.
     * - Otherwise, pick which encoding is shorter.
     *
     * We don't quite handle continuation right here, but it should be
     * pretty close.
     */

    if (encoding == CE_UNKNOWN)
        encoding = pref_encoding(asciicount, qpspecialcount, eightbitcount);

    unfold_header(value, asciicount + eightbitcount);

    switch (encoding) {

    case CE_BASE64:
    	return field_encode_base64(name, value, charset);

    case CE_QUOTED:
	return field_encode_quoted(name, value, charset, asciicount,
				   eightbitcount + qpspecialcount, 0);

    default:
    	advise(NULL, "Internal error: unknown RFC-2047 encoding type");
	return 1;
    }
}

/*
 * Encode our specified header (or field) using quoted-printable
 */

static int
field_encode_quoted(const char *name, char **value, const char *charset,
		    int ascii, int encoded, int phraserules)
{
    int prefixlen = name ? strlen(name) + 2: 0, outlen = 0, column, newline = 1;
    int charsetlen = strlen(charset), utf8;
    char *output = NULL, *p, *q = NULL;

    /*
     * Right now we just encode the whole thing.  Maybe later on we'll
     * only encode things on a per-atom basis.
     */

    p = *value;

    column = prefixlen + 2;	/* Header name plus ": " */

    utf8 = strcasecmp(charset, "UTF-8") == 0;

    while (*p != '\0') {
    	/*
	 * Start a new line, if it's time
	 */
    	if (newline) {
	    /*
	     * If it's the start of the header, we don't need to pad it
	     *
	     * The length of the output string is ...
	     * =?charset?Q?...?=  so that's 7+strlen(charset) + 2 for \n NUL
	     *
	     * plus 1 for every ASCII character and 3 for every eight bit
	     * or special character (eight bit characters are written as =XX).
	     *
	     */

	    int tokenlen;

	    outlen += 9 + charsetlen + ascii + 3 * encoded;

	    /*
	     * If output is set, then we're continuing the header.  Otherwise
	     * do the initial allocation.
	     */

	    if (output) {
	        int curlen = q - output, i;
		outlen += prefixlen + 1;	/* Header plus \n ": " */
		output = mh_xrealloc(output, outlen);
		q = output + curlen;
		*q++ = '?';
		*q++ = '=';
		*q++ = '\n';
		for (i = 0; i < prefixlen; i++)
		    *q++ = ' ';
	    } else {
	    	/*
		 * A bit of a hack here; the header can contain multiple
		 * spaces (probably at least one) until we get to the
		 * actual text.  Copy until we get to a non-space.
		 */
	    	output = mh_xmalloc(outlen);
		q = output;
		while (is_fws(*p))
		    *q++ = *p++;
	    }

	    tokenlen = snprintf(q, outlen - (q - output), "=?%s?Q?", charset);
	    q += tokenlen;
	    column = prefixlen + tokenlen;
	    newline = 0;
	}

	/*
	 * Process each character, encoding if necessary
	 *
	 * Note that we have a different set of rules if we're processing
	 * RFC 5322 'phrase' (something you'd see in an address header).
	 */

	column++;

	if (*p == ' ') {
	    *q++ = '_';
	    ascii--;
	} else if (isascii((unsigned char) *p) &&
		   (phraserules ? qphrasevalid((unsigned char) *p) :
		   			!qpspecial((unsigned char) *p))) {
	    *q++ = *p;
	    ascii--;
	} else {
	    snprintf(q, outlen - (q - output), "=%02X", (unsigned char) *p);
	    q += 3;
	    column += 2;	/* column already incremented by 1 above */
	    encoded--;
	}

	p++;

	/*
	 * We're not allowed more than ENCODELINELIMIT characters per line,
	 * so reserve some room for the final ?=.
	 *
	 * If prefixlen == 0, we haven't been passed in a header name, so
	 * don't ever wrap the field (we're likely doing an address).
	 */

	if (prefixlen == 0)
	    continue;

	if (column >= ENCODELINELIMIT - 2) {
	    newline = 1;
	} else if (utf8) {
	    /*
	     * Okay, this is a bit weird, but to explain a bit more ...
	     *
	     * RFC 2047 prohibits the splitting of multibyte characters
	     * across encoded words.  Right now we only handle the case
	     * of UTF-8, the most common multibyte encoding.
	     *
	     * p is now pointing at the next input character.  If we're
	     * using UTF-8 _and_ we'd go over ENCODELINELIMIT given the
	     * length of the complete character, then trigger a newline
	     * now.  Note that we check the length * 3 since we have to
	     * allow for the encoded output.
	     */
	    if (column + (utf8len(p) * 3) > ENCODELINELIMIT - 2) {
	    	newline = 1;
	    }
	}
    }

    *q++ = '?';
    *q++ = '=';

    if (prefixlen)
    	*q++ = '\n';

    *q = '\0';

    free(*value);

    *value = output;

    return 0;
}

/*
 * Encode our specified header (or field) using base64.
 *
 * This is a little easier since every character gets encoded, we can
 * calculate the line wrap up front.
 */

static int
field_encode_base64(const char *name, char **value, const char *charset)
{
    int prefixlen = name ? strlen(name) + 2 : 0, charsetlen = strlen(charset);
    int outlen = 0, numencode, curlen;
    char *output = NULL, *p = *value, *q = NULL, *linestart = NULL;

    /*
     * Skip over any leading white space.
     */

    while (*p == ' ' || *p == '\t')
    	p++;

    /*
     * If we had a zero-length prefix, then just encode the whole field
     * as-is, without line wrapping.  Note that in addition to the encoding
     *
     * The added length we need is =? + charset + ?B? ... ?=
     *
     * That's 7 + strlen(charset) + 2 (for \n NUL).
     */

    while (prefixlen && ((base64len(strlen(p)) + 7 + charsetlen +
    			  prefixlen) > ENCODELINELIMIT)) {

	/*
	 * Our very first time, don't pad the line in the front
	 *
	 * Note ENCODELINELIMIT is + 2 because of \n \0
	 */


	if (! output) {
	    outlen += ENCODELINELIMIT + 2;
	    output = q = mh_xmalloc(outlen);
	    linestart = q - prefixlen;	/* Yes, this is intentional */
	} else {
	    int curstart = linestart - output;
	    curlen = q - output;

	    outlen += ENCODELINELIMIT + 2;
	    output = mh_xrealloc(output, outlen);
	    q = output + curlen;
	    linestart = output + curstart;
	}

	/*
	 * We should have enough space now, so prepend the encoding markers
	 * and character set information.  The leading space is intentional.
	 */

	q += snprintf(q, outlen - (q - output), " =?%s?B?", charset);

	/*
         * Find out how much room we have left on the line and see how
         * many characters we can stuff in.  The start of our line
         * is marked by "linestart", so use that to figure out how
         * many characters are left out of ENCODELINELIMIT.  Reserve
         * 2 characters for the end markers and calculate how many
         * characters we can fit into that space given the base64
         * encoding expansion.
	 */

	numencode = strbase64(ENCODELINELIMIT - (q - linestart) - 2);

	if (numencode <= 0) {
	    advise(NULL, "Internal error: tried to encode %d characters "
	    	   "in base64", numencode);
	    return 1;
	}

	/*
	 * RFC 2047 prohibits spanning multibyte characters across tokens.
	 * Right now we only check for UTF-8.
	 *
	 * So note the key here ... we want to make sure the character BEYOND
	 * our last character is not a continuation byte.  If it's the start
	 * of a new multibyte character or a single-byte character, that's ok.
	 */

	if (strcasecmp(charset, "UTF-8") == 0) {
	    /*
	     * p points to the start of our current buffer, so p + numencode
	     * is one past the last character to encode
	     */

	    while (numencode > 0 && ((*(p + numencode) & 0xc0) == 0x80))
	    	numencode--;

	    if (numencode == 0) {
	    	advise(NULL, "Internal error: could not find start of "
		       "UTF-8 character when base64 encoding header");
		return 1;
	    }
	}

	if (writeBase64raw((unsigned char *) p, numencode,
			   (unsigned char *) q) != OK) {
	    advise(NULL, "Internal error: base64 encoding of header failed");
	    return 1;
	}

	p += numencode;
	q += base64len(numencode);

	/*
	 * This will point us at the beginning of the new line (trust me).
	 */

	linestart = q + 3;

	/*
	 * What's going on here?  Well, we know we're continuing to the next
	 * line, so we want to add continuation padding.  We also add the
	 * trailing marker for the RFC 2047 token at this time as well.
	 * This uses a trick of snprintf(); we tell it to print a zero-length
	 * string, but pad it out to prefixlen - 1 characters; that ends
	 * up always printing out the requested number of spaces.  We use
	 * prefixlen - 1 because we always add a space on the starting
	 * token marker; this makes things work out correctly for the first
	 * line, which should have a space between the ':' and the start
	 * of the token.
	 *
	 * It's okay if you don't follow all of that.
	 */

	q += snprintf(q, outlen - (q - output), "?=\n%*s", prefixlen - 1, "");
    }

    /*
     * We're here if there is either no prefix, or we can fit it in less
     * than ENCODELINELIMIT characters.  Encode the whole thing.
     */

    outlen += prefixlen + 9 + charsetlen + base64len(strlen(p));
    curlen = q - output;

    output = mh_xrealloc(output, outlen);
    q = output + curlen;

    q += snprintf(q, outlen - (q - output), "%s=?%s?B?",
    		  prefixlen ? " " : "", charset);

    if (writeBase64raw((unsigned char *) p, strlen(p),
    		       (unsigned char *) q) != OK) {
	advise(NULL, "Internal error: base64 encoding of header failed");
	return 1;
    }

    strcat(q, "?=");

    if (prefixlen)
    	strcat(q, "\n");

    free(*value);

    *value = output;

    return 0;
}

/*
 * Calculate the length of a UTF-8 character.
 *
 * If it's not a UTF-8 character (or we're in the middle of a multibyte
 * character) then simply return 0.
 */

static int
utf8len(const char *p)
{
    int len = 1;

    if (*p == '\0')
    	return 0;

    if (isascii((unsigned char) *p) || (((unsigned char) *p) & 0xc0) == 0x80)
    	return 0;

    p++;
    while ((((unsigned char) *p++) & 0xc0) == 0x80)
    	len++;

    return len;
}

/*
 * "Unfold" a header, making it a single line (without continuation)
 *
 * We cheat a bit here; we never make the string longer, so using the
 * original length here is fine.
 */

static void
unfold_header(char **value, int len)
{
    char *str = mh_xmalloc(len + 1);
    char *p = str, *q = *value;

    while (*q != '\0') {
    	if (*q == '\n') {
	    /*
	     * When we get a newline, skip to the next non-whitespace
	     * character and add a space to replace all of the whitespace
	     *
	     * This has the side effect of stripping off the final newline
	     * for the header; we put it back in the encoding routine.
	     */
	    while (is_fws(*q))
	    	q++;
	    if (*q == '\0')
	    	break;

	    *p++ = ' ';
	} else {
	    *p++ = *q++;
	}
    }

    *p = '\0';

    free(*value);
    *value = str;
}

/*
 * Decode a header containing addresses.  This means we have to parse
 * each address and only encode the display-name or comment field.
 */

static int
field_encode_address(const char *name, char **value, int encoding,
		     const char *charset)
{
    int prefixlen = strlen(name) + 2, column = prefixlen, groupflag;
    int asciichars, specialchars, eightbitchars, reformat = 0, errflag = 0;
    size_t len;
    char *mp, *cp = NULL, *output = NULL;
    char *tmpbuf = NULL;
    size_t tmpbufsize = 0;
    struct mailname *mn;
    char errbuf[BUFSIZ];

    /*
     * Because these are addresses, we need to handle them individually.
     *
     * Break them down and process them one by one.  This means we have to
     * rewrite the whole header, but that's unavoidable.
     */

    /*
     * The output headers always have to start with a space first; this
     * is just the way the API works right now.
     */

    output = add(" ", output);

    for (groupflag = 0; (mp = getname(*value)); ) {
    	if ((mn = getm(mp, NULL, 0, errbuf, sizeof(errbuf))) == NULL) {
	    advise(NULL, "%s: %s", errbuf, mp);
	    errflag++;
	    continue;
	}

	reformat = 0;

	/*
	 * We only care if the phrase (m_pers) or any trailing comment
	 * (m_note) have 8-bit characters.  If doing q-p, we also need
	 * to encode anything marked as qspecial().  Unquote it first
	 * so the specialchars count is right.
	 */

	if (! mn->m_pers)
	    goto check_note;

	if ((len = strlen(mn->m_pers)) + 1 > tmpbufsize) {
	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
	}

	unquote_string(mn->m_pers, tmpbuf);

	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
		       &specialchars)) {
	    /*
	     * If we have 8-bit characters, encode it.
	     */

	    if (encoding == CE_UNKNOWN)
	    	encoding = pref_encoding(asciichars, specialchars,
					 eightbitchars);

	    /*
	     * This is okay, because the output of unquote_string will be either
	     * equal or shorter than the original.
	     */

	    strcpy(mn->m_pers, tmpbuf);

	    switch (encoding) {

	    case CE_BASE64:
	    	if (field_encode_base64(NULL, &mn->m_pers, charset)) {
		    errflag++;
		    goto out;
		}
		break;

	    case CE_QUOTED:
	    	if (field_encode_quoted(NULL, &mn->m_pers, charset, asciichars,
					eightbitchars + specialchars, 1)) {
		    errflag++;
		    goto out;
		}
		break;

	    default:
		advise(NULL, "Internal error: unknown RFC-2047 encoding type");
		errflag++;
		goto out;
	    }

	    reformat++;
	}

	check_note:

	/*
	 * The "note" field is generally a comment at the end of the address,
	 * at least as how it's implemented here.  Notes are always surrounded
	 * by parenthesis (since they're comments).  Strip them out and
	 * then put them back when we format the final field, but they do
	 * not get encoded.
	 */

	if (! mn->m_note)
	    goto do_reformat;

	if ((len = strlen(mn->m_note)) + 1 > tmpbufsize) {
	    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = len + 1);
	}

	if (mn->m_note[0] != '(' || mn->m_note[len - 1] != ')') {
	    advise(NULL, "Internal error: Invalid note field \"%s\"",
	    	   mn->m_note);
	    errflag++;
	    goto out;
	}

	strncpy(tmpbuf, mn->m_note + 1, len - 1);
	tmpbuf[len - 2] = '\0';

	if (scanstring(tmpbuf, &asciichars, &eightbitchars,
		       &specialchars)) {
	    /*
	     * If we have 8-bit characters, encode it.
	     */

	    if (encoding == CE_UNKNOWN)
	    	encoding = pref_encoding(asciichars, specialchars,
					 eightbitchars);

	    switch (encoding) {

	    case CE_BASE64:
	    	if (field_encode_base64(NULL, &tmpbuf, charset)) {
		    errflag++;
		    goto out;
		}
		break;

	    case CE_QUOTED:
	    	if (field_encode_quoted(NULL, &tmpbuf, charset, asciichars,
					eightbitchars + specialchars, 1)) {
		    errflag++;
		    goto out;
		}
		break;

	    default:
		advise(NULL, "Internal error: unknown RFC-2047 encoding type");
		errflag++;
		goto out;
	    }

	    reformat++;

	    /*
	     * Make sure the size of tmpbuf is correct (it always gets
	     * reallocated in the above functions).
	     */

	    tmpbufsize = strlen(tmpbuf) + 1;

	    /*
	     * Put the note field back surrounded by parenthesis.
	     */

	    mn->m_note = mh_xrealloc(mn->m_note, tmpbufsize + 2);

	    snprintf(mn->m_note, tmpbufsize + 2, "(%s)", tmpbuf);
	}

do_reformat:

	/*
	 * So, some explanation is in order.
	 *
	 * We know we need to rewrite at least one address in the header,
	 * otherwise we wouldn't be here.  If we had to reformat this
	 * particular address, then run it through adrformat().  Otherwise
	 * we can use m_text directly.
	 */

	/*
	 * If we were in a group but are no longer, make sure we add a
	 * semicolon (which needs to be FIRST, as it needs to be at the end
	 * of the last address).
	 */

	if (groupflag && ! mn->m_ingrp) {
	    output = add(";", output);
	    column += 1;
	}

	groupflag = mn->m_ingrp;

	if (mn->m_gname) {
	    cp = add(mn->m_gname, NULL);
	}

	if (reformat) {
	    cp = add(adrformat(mn), cp);
	} else {
	    cp = add(mn->m_text, cp);
	}

	len = strlen(cp);

	/*
	 * If we're not at the beginning of the line, add a command and
	 * either a space or a newline.
	 */

	if (column != prefixlen) {
	    if (len + column + 2 > OUTPUTLINELEN) {

	    	if ((size_t) (prefixlen + 3) < tmpbufsize)
		    tmpbuf = mh_xrealloc(tmpbuf, tmpbufsize = prefixlen + 3);

		snprintf(tmpbuf, tmpbufsize, ",\n%*s", column = prefixlen, "");
		output = add(tmpbuf, output);
	    } else {
	    	output = add(", ", output);
		column += 2;
	    }
	}

	/*
	 * Finally add the address
	 */

	output = add(cp, output);
	column += len;
	free(cp);
	cp = NULL;
    }

    /*
     * Just in case we're at the end of a list
     */

    if (groupflag) {
	output = add(";", output);
    }

    output = add("\n", output);

    free(*value);
    *value = output;
    output = NULL;

out:

    if (tmpbuf)
    	free(tmpbuf);
    if (output)
    	free(output);

    return errflag > 0;
}

/*
 * Scan a string, check for characters that need to be encoded
 */

static int
scanstring(const char *string, int *asciilen, int *eightbitchars,
	   int *specialchars)
{
    *asciilen = 0;
    *eightbitchars = 0;
    *specialchars = 0;

    for (; *string != '\0'; string++) {
    	if ((isascii((unsigned char) *string))) {
	    (*asciilen)++;
	    /*
	     * So, a space is not a valid phrase character, but we're counting
	     * an exception here, because in q-p a space can be directly
	     * encoded as an underscore.
	     */
	    if (!qphrasevalid((unsigned char) *string) && *string != ' ')
	    	(*specialchars)++;
	} else {
	    (*eightbitchars)++;
	}
    }

    return *eightbitchars > 0;
}

/*
 * This function is to be used to decide which encoding algorithm we should
 * use if one is not given.  Basically, we pick whichever one is the shorter
 * of the two.
 *
 * Arguments are:
 *
 * ascii	- Number of ASCII characters in to-be-encoded string.
 * specials	- Number of ASCII characters in to-be-encoded string that
 *		  still require encoding under quoted-printable.  Note that
 *		  these are included in the "ascii" total.
 * eightbit	- Eight-bit characters in the to-be-encoded string.
 *
 * Returns one of CE_BASE64 or CE_QUOTED.
 */

static int
pref_encoding(int ascii, int specials, int eightbits)
{
    /*
     * The length of the q-p encoding is:
     *
     * ascii - specials + (specials + eightbits) * 3.
     *
     * The length of the base64 encoding is:
     *
     * base64len(ascii + eightbits)	(See macro for details)
     */

    return base64len(ascii + eightbits) < (ascii - specials +
    			(specials + eightbits) * 3) ? CE_BASE64 : CE_QUOTED;
}