1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292
|
# Copyright (C) 2001,2002 Python Software Foundation
# Author: barry@zope.com (Barry Warsaw)
"""A parser of RFC 2822 and MIME email messages.
"""
import re
from cStringIO import StringIO
from types import ListType
from email import Errors
from email import Message
EMPTYSTRING = ''
NL = '\n'
try:
True, False
except NameError:
True = 1
False = 0
NLCRE = re.compile('\r\n|\r|\n')
class Parser:
def __init__(self, _class=Message.Message, strict=False):
"""Parser of RFC 2822 and MIME email messages.
Creates an in-memory object tree representing the email message, which
can then be manipulated and turned over to a Generator to return the
textual representation of the message.
The string must be formatted as a block of RFC 2822 headers and header
continuation lines, optionally preceeded by a `Unix-from' header. The
header block is terminated either by the end of the string or by a
blank line.
_class is the class to instantiate for new message objects when they
must be created. This class must have a constructor that can take
zero arguments. Default is Message.Message.
Optional strict tells the parser to be strictly RFC compliant or to be
more forgiving in parsing of ill-formatted MIME documents. When
non-strict mode is used, the parser will try to make up for missing or
erroneous boundaries and other peculiarities seen in the wild.
Default is non-strict parsing.
"""
self._class = _class
self._strict = strict
def parse(self, fp, headersonly=False):
"""Create a message structure from the data in a file.
Reads all the data from the file and returns the root of the message
structure. Optional headersonly is a flag specifying whether to stop
parsing after reading the headers or not. The default is False,
meaning it parses the entire contents of the file.
"""
root = self._class()
firstbodyline = self._parseheaders(root, fp)
if not headersonly:
self._parsebody(root, fp, firstbodyline)
return root
def parsestr(self, text, headersonly=False):
"""Create a message structure from a string.
Returns the root of the message structure. Optional headersonly is a
flag specifying whether to stop parsing after reading the headers or
not. The default is False, meaning it parses the entire contents of
the file.
"""
return self.parse(StringIO(text), headersonly=headersonly)
def _parseheaders(self, container, fp):
# Parse the headers, returning a list of header/value pairs. None as
# the header means the Unix-From header.
lastheader = ''
lastvalue = []
lineno = 0
firstbodyline = None
while True:
# Don't strip the line before we test for the end condition,
# because whitespace-only header lines are RFC compliant
# continuation lines.
line = fp.readline()
if not line:
break
line = line.splitlines()[0]
if not line:
break
# Ignore the trailing newline
lineno += 1
# Check for initial Unix From_ line
if line.startswith('From '):
if lineno == 1:
container.set_unixfrom(line)
continue
elif self._strict:
raise Errors.HeaderParseError(
'Unix-from in headers after first rfc822 header')
else:
# ignore the wierdly placed From_ line
# XXX: maybe set unixfrom anyway? or only if not already?
continue
# Header continuation line
if line[0] in ' \t':
if not lastheader:
raise Errors.HeaderParseError(
'Continuation line seen before first header')
lastvalue.append(line)
continue
# Normal, non-continuation header. BAW: this should check to make
# sure it's a legal header, e.g. doesn't contain spaces. Also, we
# should expose the header matching algorithm in the API, and
# allow for a non-strict parsing mode (that ignores the line
# instead of raising the exception).
i = line.find(':')
if i < 0:
if self._strict:
raise Errors.HeaderParseError(
"Not a header, not a continuation: ``%s''" % line)
elif lineno == 1 and line.startswith('--'):
# allow through duplicate boundary tags.
continue
else:
# There was no separating blank line as mandated by RFC
# 2822, but we're in non-strict mode. So just offer up
# this current line as the first body line.
firstbodyline = line
break
if lastheader:
container[lastheader] = NL.join(lastvalue)
lastheader = line[:i]
lastvalue = [line[i+1:].lstrip()]
# Make sure we retain the last header
if lastheader:
container[lastheader] = NL.join(lastvalue)
return firstbodyline
def _parsebody(self, container, fp, firstbodyline=None):
# Parse the body, but first split the payload on the content-type
# boundary if present.
boundary = container.get_boundary()
isdigest = (container.get_content_type() == 'multipart/digest')
# If there's a boundary, split the payload text into its constituent
# parts and parse each separately. Otherwise, just parse the rest of
# the body as a single message. Note: any exceptions raised in the
# recursive parse need to have their line numbers coerced.
if boundary:
preamble = epilogue = None
# Split into subparts. The first boundary we're looking for won't
# always have a leading newline since we're at the start of the
# body text, and there's not always a preamble before the first
# boundary.
separator = '--' + boundary
payload = fp.read()
if firstbodyline is not None:
payload = firstbodyline + '\n' + payload
# We use an RE here because boundaries can have trailing
# whitespace.
mo = re.search(
r'(?P<sep>' + re.escape(separator) + r')(?P<ws>[ \t]*)',
payload)
if not mo:
if self._strict:
raise Errors.BoundaryError(
"Couldn't find starting boundary: %s" % boundary)
container.set_payload(payload)
return
start = mo.start()
if start > 0:
# there's some pre-MIME boundary preamble
preamble = payload[0:start]
# Find out what kind of line endings we're using
start += len(mo.group('sep')) + len(mo.group('ws'))
mo = NLCRE.search(payload, start)
if mo:
start += len(mo.group(0))
# We create a compiled regexp first because we need to be able to
# specify the start position, and the module function doesn't
# support this signature. :(
cre = re.compile('(?P<sep>\r\n|\r|\n)' +
re.escape(separator) + '--')
mo = cre.search(payload, start)
if mo:
terminator = mo.start()
linesep = mo.group('sep')
if mo.end() < len(payload):
# There's some post-MIME boundary epilogue
epilogue = payload[mo.end():]
elif self._strict:
raise Errors.BoundaryError(
"Couldn't find terminating boundary: %s" % boundary)
else:
# Handle the case of no trailing boundary. Check that it ends
# in a blank line. Some cases (spamspamspam) don't even have
# that!
mo = re.search('(?P<sep>\r\n|\r|\n){2}$', payload)
if not mo:
mo = re.search('(?P<sep>\r\n|\r|\n)$', payload)
if not mo:
raise Errors.BoundaryError(
'No terminating boundary and no trailing empty line')
linesep = mo.group('sep')
terminator = len(payload)
# We split the textual payload on the boundary separator, which
# includes the trailing newline. If the container is a
# multipart/digest then the subparts are by default message/rfc822
# instead of text/plain. In that case, they'll have a optional
# block of MIME headers, then an empty line followed by the
# message headers.
parts = re.split(
linesep + re.escape(separator) + r'[ \t]*' + linesep,
payload[start:terminator])
for part in parts:
if isdigest:
if part.startswith(linesep):
# There's no header block so create an empty message
# object as the container, and lop off the newline so
# we can parse the sub-subobject
msgobj = self._class()
part = part[len(linesep):]
else:
parthdrs, part = part.split(linesep+linesep, 1)
# msgobj in this case is the "message/rfc822" container
msgobj = self.parsestr(parthdrs, headersonly=1)
# while submsgobj is the message itself
msgobj.set_default_type('message/rfc822')
maintype = msgobj.get_content_maintype()
if maintype in ('message', 'multipart'):
submsgobj = self.parsestr(part)
msgobj.attach(submsgobj)
else:
msgobj.set_payload(part)
else:
msgobj = self.parsestr(part)
container.preamble = preamble
container.epilogue = epilogue
container.attach(msgobj)
elif container.get_main_type() == 'multipart':
# Very bad. A message is a multipart with no boundary!
raise Errors.BoundaryError(
'multipart message with no defined boundary')
elif container.get_type() == 'message/delivery-status':
# This special kind of type contains blocks of headers separated
# by a blank line. We'll represent each header block as a
# separate Message object
blocks = []
while True:
blockmsg = self._class()
self._parseheaders(blockmsg, fp)
if not len(blockmsg):
# No more header blocks left
break
blocks.append(blockmsg)
container.set_payload(blocks)
elif container.get_main_type() == 'message':
# Create a container for the payload, but watch out for there not
# being any headers left
try:
msg = self.parse(fp)
except Errors.HeaderParseError:
msg = self._class()
self._parsebody(msg, fp)
container.attach(msg)
else:
text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)
class HeaderParser(Parser):
"""A subclass of Parser, this one only meaningfully parses message headers.
This class can be used if all you're interested in is the headers of a
message. While it consumes the message body, it does not parse it, but
simply makes it available as a string payload.
Parsing with this subclass can be considerably faster if all you're
interested in is the message headers.
"""
def _parsebody(self, container, fp, firstbodyline=None):
# Consume but do not parse, the body
text = fp.read()
if firstbodyline is not None:
text = firstbodyline + '\n' + text
container.set_payload(text)
|