1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
Received: from localhost (dcvr.yhbt.net [127.0.0.1])
by dcvr.yhbt.net (Postfix) with ESMTP id 977481F45A;
Sat, 18 Apr 2020 22:25:08 +0000 (UTC)
Date: Sat, 18 Apr 2020 22:25:08 +0000
From: Eric Wong <e@yhbt.net>
To: test@public-inbox.org
Subject: Re: embedded message test
Message-ID: <20200418222508.GA13918@dcvr>
References: <20200418222020.GA2745@dcvr>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="TB36FDmn/VVEgNH/"
Content-Disposition: inline
In-Reply-To: <20200418222020.GA2745@dcvr>
--TB36FDmn/VVEgNH/
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
testing embedded message harder
--TB36FDmn/VVEgNH/
Content-Type: message/rfc822
Content-Disposition: attachment; filename="embed2x.eml"
Date: Sat, 18 Apr 2020 22:20:20 +0000
From: Eric Wong <e@yhbt.net>
To: test@public-inbox.org
Subject: embedded message test
Message-ID: <20200418222020.GA2745@dcvr>
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="/04w6evG8XlLl3ft"
Content-Disposition: inline
--/04w6evG8XlLl3ft
Content-Type: text/plain; charset=utf-8
Content-Disposition: inline
testing embedded message
--/04w6evG8XlLl3ft
Content-Type: message/rfc822
Content-Disposition: attachment; filename="test.eml"
From: Eric Wong <e@yhbt.net>
To: spew@80x24.org
Subject: [PATCH] mail header experiments
Date: Sat, 18 Apr 2020 21:41:14 +0000
Message-Id: <20200418214114.7575-1-e@yhbt.net>
MIME-Version: 1.0
Content-Transfer-Encoding: 8bit
---
lib/PublicInbox/MailHeader.pm | 55 +++++++++++++++++++++++++++++++++++
t/mail_header.t | 31 ++++++++++++++++++++
2 files changed, 86 insertions(+)
create mode 100644 lib/PublicInbox/MailHeader.pm
create mode 100644 t/mail_header.t
diff --git a/lib/PublicInbox/MailHeader.pm b/lib/PublicInbox/MailHeader.pm
new file mode 100644
index 00000000..166baf91
--- /dev/null
+++ b/lib/PublicInbox/MailHeader.pm
@@ -0,0 +1,55 @@
+# Copyright (C) 2020-2021 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+package PublicInbox::MailHeader;
+use strict;
+use HTTP::Parser::XS qw(parse_http_response HEADERS_AS_ARRAYREF);
+use bytes (); #bytes::length
+my %casemap;
+
+sub _headerx_to_list {
+ my (undef, $head, $crlf) = @_;
+
+ # picohttpparser uses `int' as the return value, so the
+ # actual limit is 2GB on most platforms. However, headers
+ # exceeding (or even close to) 1MB seems unreasonable
+ die 'headers too big' if bytes::length($$head) > 0x100000;
+ my ($ret, undef, undef, undef, $headers) =
+ parse_http_response('HTTP/1.0 1 X'. $crlf . $$head,
+ HEADERS_AS_ARRAYREF);
+ die 'failed to parse headers' if $ret <= 0;
+ # %casemap = map {; lc($_) => $_ } ($$head =~ m/^([^:]+):/gsm);
+ # my $nr = @$headers;
+ for (my $i = 0; $i < @$headers; $i += 2) {
+ my $key = $headers->[$i]; # = $casemap{$headers->[$i]};
+ my $val = $headers->[$i + 1];
+ (my $trimmed = $val) =~ s/\r?\n\s+/ /;
+ $headers->[$i + 1] = [
+ $trimmed,
+ "$key: $val"
+ ]
+ }
+ $headers;
+}
+
+sub _header_to_list {
+ my (undef, $head, $crlf) = @_;
+ my @tmp = ($$head =~ m/^(([^ \t:][^:\n]*):[ \t]*
+ ([^\n]*\n(?:[ \t]+[^\n]*\n)*))/gsmx);
+ my @headers;
+ $#headers = scalar @tmp;
+ @headers = ();
+ while (@tmp) {
+ my ($orig, $key, $val) = splice(@tmp, 0, 3);
+ # my $v = $tmp[$i + 2];
+ # $v =~ s/\r?\n[ \t]+/ /sg;
+ # $v =~ s/\r?\n\z//s;
+ $val =~ s/\n[ \t]+/ /sg;
+ chomp($val, $orig);
+ # $val =~ s/\r?\n\z//s;
+ # $orig =~ s/\r?\n\z//s;
+ push @headers, $key, [ $val, $orig ];
+ }
+ \@headers;
+}
+
+1;
diff --git a/t/mail_header.t b/t/mail_header.t
new file mode 100644
index 00000000..4dc62c50
--- /dev/null
+++ b/t/mail_header.t
@@ -0,0 +1,31 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use Test::More;
+use PublicInbox::TestCommon;
+require_mods('PublicInbox::MailHeader');
+
+my $head = <<'EOF';
+From d0147582e289fdd4cdd84e91d8b0f8ae9c230124 Mon Sep 17 00:00:00 2001
+From: Eric Wong <e@yhbt.net>
+Date: Fri, 17 Apr 2020 09:28:49 +0000
+Subject: [PATCH] searchthread: reduce indirection by removing container
+
+EOF
+my $orig = $head;
+use Email::Simple;
+my $xshdr = PublicInbox::MailHeader->_header_to_list(\$head, "\n");
+my $simpl = Email::Simple::Header->_header_to_list(\$head, "\n");
+is_deeply($xshdr, $simpl);
+use Benchmark qw(:all);
+my $res = timethese(100000, {
+ pmh => sub {
+ PublicInbox::MailHeader->_header_to_list(\$head, "\n");
+ },
+ esh => sub {
+ PublicInbox::MailHeader->_header_to_list(\$head, "\n");
+ }
+});
+is($head, $orig);
+use Data::Dumper; diag Dumper($res);
+done_testing;
--/04w6evG8XlLl3ft--
--TB36FDmn/VVEgNH/--
|