1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43
|
import re
def normalizedText(t, n=68, escape = False):
    """
    Collapse the whitespace of a text and wrap it into lines of bounded
    length.

    Every run of whitespace (spaces, tabs, newlines) is reduced to a
    single space. The resulting text is then cut right after spaces so
    that each line, trailing space included, is at most ``n`` characters
    long whenever possible. A single word longer than ``n`` is never
    split: it is kept whole on a line of its own. The lines are
    contiguous pieces of the normalized text, so ``"".join(result)``
    gives the whole normalized text back.

    Parameters:
    -----------
    - t (str) a text to normalize
    - n (int) the maximum length of a line, when possible. Defaults to 68.
    - escape (bool): if True, double quotes will be escaped (``"`` becomes
      ``\\"``) before wrapping. Defaults to False

    Returns:
    --------
    - a list of str, one item per line; every line except possibly the
      last one ends with the space where the text was cut. When ``t`` is
      empty (or otherwise falsy), the empty string ``""`` is returned
      instead of a list.
    """
    if not t:
        # Kept as-is for backward compatibility: empty input yields "",
        # not an empty list.
        return ""
    onestring = re.sub(r"\s+", " ", t)
    if escape:
        # Escape double quotes before wrapping, so that the added
        # backslashes are counted in the line lengths.
        onestring = onestring.replace('"', '\\"')

    # Split into tokens made of a word plus the space that follows it, so
    # that concatenating the tokens reproduces onestring exactly.
    words = onestring.split(" ")
    tokens = [word + " " for word in words[:-1]]
    if words[-1]:
        # The text does not end with a space: the last word has no
        # trailing separator.
        tokens.append(words[-1])

    result = []
    parts = []   # tokens of the line being built
    width = 0    # total length of the tokens in parts
    for token in tokens:
        # Start a new line when the token would not fit; a token which is
        # too long by itself still goes alone on an (over-long) line.
        if parts and width + len(token) > n:
            result.append("".join(parts))
            parts = []
            width = 0
        parts.append(token)
        width += len(token)
    # onestring is never empty here, so at least one token is pending.
    result.append("".join(parts))
    return result
|