1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
|
# SPDX-License-Identifier: GPL-2.0-only
#
# Copyright (C) 2023 EfficiOS Inc.
#
# pyright: strict, reportTypeCommentUsage=false
import re
from typing import TextIO
# One part of a moultipart document.
#
# For example, for this part of which the header is at line 37:
#
# --- Another Oscar Wilde quote
# I can resist everything except temptation.
#
# The corresponding `Part` object is:
#
# Part('Another Oscar Wilde quote',
# 'I can resist everything except temptation.\n',
# 38)
class Part:
    # Builds a part from its header information `header_info`, its
    # text content `content`, and the number `first_content_line_no`
    # of the first line of that content.
    def __init__(self, header_info: str, content: str, first_content_line_no: int):
        self._header_info = header_info
        self._content = content
        self._first_content_line_no = first_content_line_no

    # Header information of this part, as given at construction time.
    @property
    def header_info(self) -> str:
        return self._header_info

    # Text content of this part, as given at construction time.
    @property
    def content(self) -> str:
        return self._content

    # Number of the first line, relative to the beginning of the
    # containing moultipart document, of the content of this part.
    @property
    def first_content_line_no(self) -> int:
        return self._first_content_line_no

    # Constructor-like representation: both strings are shown with
    # `repr()` so that quotes and newlines are visible.
    def __repr__(self) -> str:
        quoted_header_info = repr(self.header_info)
        quoted_content = repr(self.content)

        return "Part({}, {}, {})".format(
            quoted_header_info, quoted_content, self.first_content_line_no
        )
def _try_parse_header(line: str):
m = re.match(r"---(\s*| .+)$", line)
if m is None:
return
return m.group(1).strip()
# Parses the moultipart document file `in_file` and returns its parts
# (list of `Part` objects).
#
# A moultipart document is a sequence of parts.
#
# A moultipart part is:
#
# 1. A header line, that is, in this order:
#
#    a) Exactly `---`.
#    b) Zero or more spaces.
#    c) Optional: custom information until the end of the line.
#
# 2. Zero or more lines of text which aren't header lines.
#
# For example, consider the following moultipart document:
#
#     --- Victoria
#     Parenteau
#     ---
#     Taillon
#     --- This part is empty
#     --- Josianne
#     Gervais
#
# Then this function would return the following part objects:
#
#     [
#         Part('Victoria', 'Parenteau\n', 2),
#         Part('', 'Taillon\n', 4),
#         Part('This part is empty', '', 6),
#         Part('Josianne', 'Gervais\n', 7),
#     ]
#
# Raises `RuntimeError` on any parsing error, that is, when the first
# line of `in_file` isn't a header line or when `in_file` has no lines
# at all.
def parse(in_file: TextIO):
    # Read the first header.
    #
    # Pass a default value to next() so that an empty document is
    # reported as a `RuntimeError`, like any other parsing error,
    # instead of leaking `StopIteration` to the caller.
    first_line = next(in_file, None)

    if first_line is None:
        raise RuntimeError(
            "Expecting header line starting with `---`, got end of file"
        )

    cur_part_header_info = _try_parse_header(first_line)

    if cur_part_header_info is None:
        raise RuntimeError(
            "Expecting header line starting with `---`, got `{}`".format(
                first_line.strip("\n")
            )
        )

    parts = []  # type: list[Part]

    # Content lines of the current part; joined once when the part is
    # complete to avoid repeated string concatenation.
    cur_part_content_lines = []  # type: list[str]

    # The first header is line 1: its content, if any, starts at line 2.
    cur_first_content_line_no = 2

    for line_no, line in enumerate(in_file, start=2):
        maybe_part_header_info = _try_parse_header(line)

        if maybe_part_header_info is None:
            # Accumulate content lines
            cur_part_content_lines.append(line)
            continue

        # New header: finish the current part and start the next one
        parts.append(
            Part(
                cur_part_header_info,
                "".join(cur_part_content_lines),
                cur_first_content_line_no,
            )
        )
        cur_part_header_info = maybe_part_header_info
        cur_part_content_lines = []
        cur_first_content_line_no = line_no + 1

    # Last part (always exists)
    parts.append(
        Part(
            cur_part_header_info,
            "".join(cur_part_content_lines),
            cur_first_content_line_no,
        )
    )

    return parts
# Debugging helper: when run as a script, parse the moultipart document
# of which the path is the first command-line argument and pretty-print
# the resulting parts.
if __name__ == "__main__":
    import pprint
    import sys

    with open(sys.argv[1]) as moultipart_file:
        parsed_parts = parse(moultipart_file)

    pprint.pprint(parsed_parts)
|