File: mhtml_parser.cc

package info (click to toggle)
chromium 138.0.7204.183-1~deb12u1
links: PTS, VCS
area: main
in suites: bookworm-proposed-updates
size: 6,080,960 kB
sloc: cpp: 34,937,079; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,954; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,811; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (489 lines) | stat: -rw-r--r-- 18,440 bytes
parent folder | download | duplicates (2)
/*
 * Copyright (C) 2011 Google Inc. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are
 * met:
 *
 *     * Redistributions of source code must retain the above copyright
 * notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above
 * copyright notice, this list of conditions and the following disclaimer
 * in the documentation and/or other materials provided with the
 * distribution.
 *     * Neither the name of Google Inc. nor the names of its
 * contributors may be used to endorse or promote products derived from
 * this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "third_party/blink/renderer/platform/mhtml/mhtml_parser.h"

#include <stddef.h>
#include <utility>

#include "base/containers/contains.h"
#include "base/logging.h"
#include "third_party/blink/renderer/platform/heap/garbage_collected.h"
#include "third_party/blink/renderer/platform/mhtml/archive_resource.h"
#include "third_party/blink/renderer/platform/network/http_parsers.h"
#include "third_party/blink/renderer/platform/network/parsed_content_type.h"
#include "third_party/blink/renderer/platform/wtf/hash_map.h"
#include "third_party/blink/renderer/platform/wtf/text/ascii_ctype.h"
#include "third_party/blink/renderer/platform/wtf/text/base64.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/text/string_concatenate.h"
#include "third_party/blink/renderer/platform/wtf/text/string_hash.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"

namespace blink {

namespace {

// Decodes quoted-printable |data| into |out|. Any previous contents of |out|
// are discarded first.
//
// Decoding rules, applied left to right:
//   - Any byte other than '=' is copied through unchanged.
//   - "=\r\n" is a soft line break and produces no output.
//   - "=XY" with two hex digits produces the single byte 0xXY.
//   - "=XY" where X or Y is not a hex digit is copied through verbatim as
//     three bytes.
//   - An '=' with fewer than two bytes after it is copied through as a plain
//     '='; the bytes that follow it are then handled as ordinary input.
void QuotedPrintableDecode(base::span<const char> data, Vector<uint8_t>& out) {
  out.clear();

  const size_t length = data.size();
  size_t pos = 0;
  while (pos < length) {
    const char current = data[pos];

    // Either a literal byte, or an '=' too close to the end of the input to
    // start a complete three-byte sequence: emit it unchanged.
    if (current != '=' || length - pos < 3) {
      out.push_back(current);
      ++pos;
      continue;
    }

    // A full '=xx' sequence is available; consume all three bytes.
    const char high = data[pos + 1];
    const char low = data[pos + 2];
    pos += 3;

    if (high == '\r' && low == '\n') {
      // Soft line break: contributes nothing to the output.
      continue;
    }

    if (IsASCIIHexDigit(high) && IsASCIIHexDigit(low)) {
      out.push_back(static_cast<char>(ToASCIIHexValue(high, low)));
      continue;
    }

    // '=' followed by something that is not two hex digits: keep the three
    // bytes exactly as they appeared.
    out.push_back('=');
    out.push_back(high);
    out.push_back(low);
  }
}

}  // namespace

// This class is a limited MIME parser used to parse the MIME headers of MHTML
// files.
//
// An instance holds the fields extracted from a single header section (either
// the top-level archive header or the header of one part). Instances are
// created through ParseHeader().
class MIMEHeader final : public GarbageCollected<MIMEHeader> {
 public:
  // Leaves every string field default-constructed and sets the transfer
  // encoding to kUnknown (see the out-of-line definition).
  MIMEHeader();

  // Values understood for the Content-Transfer-Encoding header. kUnknown is
  // used both when the header is absent (constructor default) and when its
  // value is not one of the recognised names.
  enum class Encoding {
    kQuotedPrintable,
    kBase64,
    kEightBit,
    kSevenBit,
    kBinary,
    kUnknown
  };

  // Reads one header section from |cr_lf_line_reader| and returns a new
  // MIMEHeader describing it. Returns nullptr when the content type is
  // multipart but carries no "boundary" parameter.
  static MIMEHeader* ParseHeader(SharedBufferChunkReader* cr_lf_line_reader);

  // True when the content type begins with "multipart/" (ASCII
  // case-insensitive).
  bool IsMultipart() const {
    return content_type_.StartsWithIgnoringASCIICase("multipart/");
  }

  // Plain accessors for the parsed header fields. Fields whose header was not
  // present keep their default-constructed value.
  String ContentType() const { return content_type_; }
  String Charset() const { return charset_; }
  Encoding ContentTransferEncoding() const {
    return content_transfer_encoding_;
  }
  String ContentLocation() const { return content_location_; }
  String ContentID() const { return content_id_; }
  base::Time Date() const { return date_; }

  // Multi-part type and boundaries are only valid for multipart MIME headers.
  String MultiPartType() const { return multipart_type_; }
  // "--" followed by the boundary parameter.
  String EndOfPartBoundary() const { return end_of_part_boundary_; }
  // The end-of-part boundary with a trailing "--" appended.
  String EndOfDocumentBoundary() const { return end_of_document_boundary_; }

  // Nothing to trace: none of the members declared below is a garbage
  // collected reference.
  void Trace(Visitor* visitor) const {}

 private:
  // Maps a Content-Transfer-Encoding header value to an Encoding; returns
  // kUnknown for anything unrecognised.
  static Encoding ParseContentTransferEncoding(const String&);

  String content_type_;
  // Only filled in for non-multipart content types.
  String charset_;
  Encoding content_transfer_encoding_;
  String content_location_;
  String content_id_;
  base::Time date_;
  // The three fields below are only filled in for multipart content types.
  String multipart_type_;
  String end_of_part_boundary_;
  String end_of_document_boundary_;
};

using KeyValueMap = HashMap<String, String>;

// Reads "key: value" header lines from |buffer| and returns them as a map.
// Reading stops after the first empty line (which is consumed) or when the
// reader has no more chunks.
//
// - Keys are lower-cased and stripped of surrounding whitespace; values are
//   stripped of surrounding whitespace.
// - A line starting with a space or tab is an RFC822 continuation of the
//   header line before it; its text (minus the first character) is appended
//   to that header's value. A continuation line that has no header to attach
//   to (it comes first, or follows a line that was ignored) is dropped.
// - A line without ':' is not a key/value pair and is ignored. A line whose
//   key is empty (':' first, or only whitespace before it) is never stored.
// - When a key occurs more than once, the last occurrence wins.
static KeyValueMap RetrieveKeyValuePairs(SharedBufferChunkReader* buffer) {
  KeyValueMap key_value_pairs;
  String line;
  String key;
  StringBuilder value;
  while (!(line = buffer->NextChunkAsUTF8StringWithLatin1Fallback()).IsNull()) {
    if (line.empty())
      break;  // Empty line means end of key/value section.
    // RFC822 continuation: A line that starts with LWSP is a continuation of
    // the prior line.
    if ((line[0] == '\t') || (line[0] == ' ')) {
      // Only fold into a header that is actually pending. Without this guard
      // the text would stay in |value| and be glued onto the front of the next
      // header's value.
      if (!key.empty())
        value.Append(line.Substring(1));
      continue;
    }
    // New key/value, store the previous one if any.
    if (!key.empty()) {
      if (base::Contains(key_value_pairs, key)) {
        DVLOG(1) << "Key duplicate found in MIME header. Key is '" << key
                 << "', previous value replaced.";
      }
      // Set() overwrites an existing entry, matching the log message above and
      // the handling of the final header below. insert() would silently keep
      // the old value.
      key_value_pairs.Set(key, value.ToString().StripWhiteSpace());
    }
    // Always start the new line from a clean slate, even when nothing was
    // stored, so that no stale text can leak into the next value.
    key = String();
    value.Clear();

    wtf_size_t colon_index = line.find(':');
    if (colon_index == kNotFound) {
      // This is not a key value pair, ignore.
      continue;
    }
    key = line.Substring(0, colon_index).DeprecatedLower().StripWhiteSpace();
    value.Append(line.Substring(colon_index + 1));
  }
  // Store the last property if there is one.
  if (!key.empty())
    key_value_pairs.Set(key, value.ToString().StripWhiteSpace());
  return key_value_pairs;
}

// Parses one header section from |buffer| into a freshly allocated
// MIMEHeader. Headers that are absent leave the corresponding field at its
// default. Returns nullptr only when the content type is multipart and has no
// "boundary" parameter.
MIMEHeader* MIMEHeader::ParseHeader(SharedBufferChunkReader* buffer) {
  auto* mime_header = MakeGarbageCollected<MIMEHeader>();
  KeyValueMap key_value_pairs = RetrieveKeyValuePairs(buffer);

  if (auto content_type = key_value_pairs.find("content-type");
      content_type != key_value_pairs.end()) {
    ParsedContentType parsed_content_type(content_type->value,
                                          ParsedContentType::Mode::kRelaxed);
    mime_header->content_type_ = parsed_content_type.MimeType();
    if (mime_header->IsMultipart()) {
      mime_header->multipart_type_ =
          parsed_content_type.ParameterValueForName("type");
      String boundary = parsed_content_type.ParameterValueForName("boundary");
      if (boundary.IsNull()) {
        DVLOG(1) << "No boundary found in multipart MIME header.";
        return nullptr;
      }
      // Parts are separated by "--<boundary>"; the archive is closed by
      // "--<boundary>--".
      const String part_boundary = "--" + boundary;
      mime_header->end_of_part_boundary_ = part_boundary;
      mime_header->end_of_document_boundary_ = part_boundary + "--";
    } else {
      // A charset is only recorded for non-multipart content.
      mime_header->charset_ = parsed_content_type.Charset().StripWhiteSpace();
    }
  }

  if (auto transfer_encoding = key_value_pairs.find("content-transfer-encoding");
      transfer_encoding != key_value_pairs.end()) {
    mime_header->content_transfer_encoding_ =
        ParseContentTransferEncoding(transfer_encoding->value);
  }

  if (auto location = key_value_pairs.find("content-location");
      location != key_value_pairs.end()) {
    mime_header->content_location_ = location->value;
  }

  // See rfc2557 - section 8.3 - Use of the Content-ID header and CID URLs.
  if (auto content_id = key_value_pairs.find("content-id");
      content_id != key_value_pairs.end()) {
    mime_header->content_id_ = content_id->value;
  }

  if (auto date = key_value_pairs.find("date");
      date != key_value_pairs.end()) {
    // Behave like //net and parse time-valued headers with a default time zone
    // of UTC. An unparsable date leaves |date_| untouched.
    base::Time parsed_time;
    if (base::Time::FromUTCString(date->value.Utf8().c_str(), &parsed_time)) {
      mime_header->date_ = parsed_time;
    }
  }

  return mime_header;
}

// Translates the value of a Content-Transfer-Encoding header into an Encoding.
// Matching ignores surrounding whitespace and ASCII case. Any value that is
// not recognised is logged and reported as Encoding::kUnknown.
MIMEHeader::Encoding MIMEHeader::ParseContentTransferEncoding(
    const String& text) {
  const String normalized = text.StripWhiteSpace().LowerASCII();

  Encoding result = Encoding::kUnknown;
  if (normalized == "base64") {
    result = Encoding::kBase64;
  } else if (normalized == "quoted-printable") {
    result = Encoding::kQuotedPrintable;
  } else if (normalized == "8bit") {
    result = Encoding::kEightBit;
  } else if (normalized == "7bit") {
    result = Encoding::kSevenBit;
  } else if (normalized == "binary") {
    result = Encoding::kBinary;
  } else {
    DVLOG(1) << "Unknown encoding '" << text << "' found in MIME header.";
  }
  return result;
}

// A header starts out with no known transfer encoding; ParseHeader() only
// overwrites this when a Content-Transfer-Encoding header is present, and
// MHTMLParser::ParseNextPart() treats kUnknown as binary.
MIMEHeader::MIMEHeader() : content_transfer_encoding_(Encoding::kUnknown) {}

// Consumes lines from |line_reader| up to and including the first line that
// is exactly equal to |boundary|. Returns true if such a line was found, and
// false if the reader ran out of chunks first.
static bool SkipLinesUntilBoundaryFound(SharedBufferChunkReader& line_reader,
                                        const String& boundary) {
  for (;;) {
    const String current_line =
        line_reader.NextChunkAsUTF8StringWithLatin1Fallback();
    if (current_line.IsNull()) {
      // No more input and the boundary never showed up.
      return false;
    }
    if (current_line == boundary) {
      return true;
    }
  }
}

// Builds the parser over |data|. The line reader is constructed with "\r\n"
// as its initial separator, so chunks are CRLF-terminated lines until
// ParseNextPart() temporarily switches the separator for binary parts.
MHTMLParser::MHTMLParser(scoped_refptr<const SharedBuffer> data)
    : line_reader_(std::move(data), "\r\n") {}

// Parses the whole archive and returns its resources. If anything fails the
// returned vector is empty, even if some parts were parsed before the failure.
// On success the Date of the top-level header is recorded as the archive's
// creation date.
HeapVector<Member<ArchiveResource>> MHTMLParser::ParseArchive() {
  HeapVector<Member<ArchiveResource>> resources;
  MIMEHeader* header = MIMEHeader::ParseHeader(&line_reader_);

  if (!ParseArchiveWithHeader(header, resources)) {
    // Drop any parts collected before the failure.
    resources.clear();
    return resources;
  }

  // ParseArchiveWithHeader() rejects a null header, so |header| is valid here.
  creation_date_ = header->Date();
  return resources;
}

// Returns the stored creation date. In this file the member is only assigned
// by a successful ParseArchive(), from the top-level header's Date; before
// that it holds whatever value it was initialised with (not visible here).
base::Time MHTMLParser::CreationDate() const {
  return creation_date_;
}

// Parses the parts governed by |header| and appends them to |resources|.
//
// A non-multipart header describes a single part, read without boundary
// checking. A multipart header is followed by a preamble, which is skipped,
// and then a sequence of parts, each introduced by its own MIME header. Nested
// "multipart/alternative" parts are flattened into |resources| by recursing.
//
// Returns false on any failure; |resources| may then already contain the
// parts that were parsed before the failure.
bool MHTMLParser::ParseArchiveWithHeader(
    MIMEHeader* header,
    HeapVector<Member<ArchiveResource>>& resources) {
  if (!header) {
    DVLOG(1) << "Failed to parse MHTML part: no header.";
    return false;
  }

  if (!header->IsMultipart()) {
    // With IE a page with no resource is not multi-part.
    bool ignored_end_of_archive = false;
    ArchiveResource* single_part =
        ParseNextPart(*header, String(), String(), ignored_end_of_archive);
    if (single_part) {
      resources.push_back(single_part);
    }
    return single_part != nullptr;
  }

  const String part_boundary = header->EndOfPartBoundary();
  const String document_boundary = header->EndOfDocumentBoundary();

  // Skip the message content (it's a generic browser specific message).
  SkipLinesUntilBoundaryFound(line_reader_, part_boundary);

  for (bool archive_finished = false; !archive_finished;) {
    MIMEHeader* part_header = MIMEHeader::ParseHeader(&line_reader_);
    if (!part_header) {
      DVLOG(1) << "Failed to parse MHTML, invalid MIME header.";
      return false;
    }

    if (part_header->ContentType() == "multipart/alternative") {
      // Ignore IE nesting which makes little sense (IE seems to nest only some
      // of the frames).
      if (!ParseArchiveWithHeader(part_header, resources)) {
        DVLOG(1) << "Failed to parse MHTML subframe.";
        return false;
      }
      // Resume at the next boundary of the enclosing multipart.
      SkipLinesUntilBoundaryFound(line_reader_, part_boundary);
      continue;
    }

    ArchiveResource* part = ParseNextPart(*part_header, part_boundary,
                                          document_boundary, archive_finished);
    if (!part) {
      DVLOG(1) << "Failed to parse MHTML part.";
      return false;
    }
    resources.push_back(part);
  }
  return true;
}

// Reads the body of one MIME part from |line_reader_|, decodes it according to
// the part's content transfer encoding and wraps the result in an
// ArchiveResource.
//
// |mime_header| is the already-parsed header of the part.
// |end_of_part_boundary| and |end_of_document_boundary| must be either both
// empty or both non-empty (DCHECKed). When they are empty no boundary checking
// is done and a text part simply runs until the reader has no more lines.
// |end_of_archive_reached| is an output flag: it ends up true when the part
// was terminated by the end-of-document boundary (for binary parts, when the
// two bytes following the boundary are "--").
//
// Returns nullptr when the part cannot be read: a binary part without a
// boundary, a boundary that never appears, malformed data after a binary
// part's boundary, or invalid base64 content.
ArchiveResource* MHTMLParser::ParseNextPart(
    const MIMEHeader& mime_header,
    const String& end_of_part_boundary,
    const String& end_of_document_boundary,
    bool& end_of_archive_reached) {
  DCHECK_EQ(end_of_part_boundary.empty(), end_of_document_boundary.empty());

  // Per the spec, the boundary to separate parts should start with CRLF.
  // |end_of_part_boundary| passed here does not contain CRLF at the beginning.
  // The parsing logic below takes care of CRLF handling.

  // If no content transfer encoding is specified, default to binary encoding.
  MIMEHeader::Encoding content_transfer_encoding =
      mime_header.ContentTransferEncoding();
  if (content_transfer_encoding == MIMEHeader::Encoding::kUnknown)
    content_transfer_encoding = MIMEHeader::Encoding::kBinary;

  // Raw (still encoded) bytes of the part, as read from the archive.
  Vector<char> content;
  const bool check_boundary = !end_of_part_boundary.empty();
  bool end_of_part_reached = false;
  if (content_transfer_encoding == MIMEHeader::Encoding::kBinary) {
    // Binary content is not line-oriented, so the only way to know where it
    // ends is the boundary.
    if (!check_boundary) {
      DVLOG(1) << "Binary contents requires end of part";
      return nullptr;
    }
    // Due to a bug in MHTMLArchive, CRLF was not added to the beginning of the
    // boundary that is placed after the part encoded as binary. To handle both
    // cases that CRLF may or may not be at the beginning of the boundary, we
    // read the part content till reaching the boundary without CRLF. So the
    // part content may contain CRLF at the end, which will be stripped off
    // later.
    line_reader_.SetSeparator(end_of_part_boundary.Utf8());
    if (!line_reader_.NextChunk(content)) {
      DVLOG(1) << "Binary contents requires end of part";
      return nullptr;
    }
    // Back to line-by-line reading for whatever follows the boundary.
    line_reader_.SetSeparator("\r\n");

    // Strip the CRLF from the end of the content if present.
    // Note: it may be the case that CRLF stripped off is really part of the
    // content, instead of part of the boundary.
    // 1) If the content denotes text or html data, stripping off CRLF will
    //    normally bring no harm.
    // 2) Otherwise, the content denotes image or other type of binary data.
    //    Usually it doesn't have CRLF at the end.
    // In order to support parsing the MHTML archive file produced before the
    // MHTMLArchive bug was fixed, we need to take a risk of stripping off the
    // CRLF that indeed belongs to the content.
    if (content.size() >= 2 && content[content.size() - 2] == '\r' &&
        content[content.size() - 1] == '\n') {
      content.resize(content.size() - 2);
    }

    // The boundary text itself has been consumed as the separator. Look at the
    // two bytes after it, without consuming them, to tell an end-of-part
    // boundary from an end-of-document boundary ("--" suffix).
    Vector<char> next_chars;
    if (line_reader_.Peek(next_chars, 2) != 2) {
      DVLOG(1) << "Invalid seperator.";
      return nullptr;
    }
    end_of_part_reached = true;
    DCHECK(next_chars.size() == 2);
    end_of_archive_reached = (next_chars[0] == '-' && next_chars[1] == '-');
    if (!end_of_archive_reached) {
      // After an end-of-part boundary the rest of the line must be empty, i.e.
      // the boundary must be immediately followed by CRLF.
      String line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback();
      if (!line.empty()) {
        DVLOG(1) << "No CRLF at end of binary section.";
        return nullptr;
      }
    }
  } else {
    // Text-like encodings: accumulate lines until a boundary line is seen (or,
    // without boundary checking, until the reader runs out of lines).
    String line;
    while (!(line = line_reader_.NextChunkAsUTF8StringWithLatin1Fallback())
                .IsNull()) {
      end_of_archive_reached = (line == end_of_document_boundary);
      if (check_boundary &&
          (line == end_of_part_boundary || end_of_archive_reached)) {
        end_of_part_reached = true;
        break;
      }
      // Note that we use line.utf8() and not line.ascii() as ascii turns
      // special characters (such as tab, line-feed...) into '?'.
      content.AppendSpan(base::span<const char>(line.Utf8()));
      if (content_transfer_encoding == MIMEHeader::Encoding::kQuotedPrintable) {
        // The line reader removes the \r\n, but we need them for the content in
        // this case as the QuotedPrintable decoder expects CR-LF terminated
        // lines.
        content.AppendSpan(base::span_from_cstring("\r\n"));
      }
    }
  }
  // The input ended before the expected boundary showed up.
  if (!end_of_part_reached && check_boundary) {
    DVLOG(1) << "No boundary found for MHTML part.";
    return nullptr;
  }

  // Decode the raw content into the bytes of the resource.
  Vector<uint8_t> data;
  switch (content_transfer_encoding) {
    case MIMEHeader::Encoding::kBase64:
      if (!Base64Decode(StringView(base::as_byte_span(content)), data)) {
        DVLOG(1) << "Invalid base64 content for MHTML part.";
        return nullptr;
      }
      break;
    case MIMEHeader::Encoding::kQuotedPrintable:
      QuotedPrintableDecode(content, data);
      break;
    case MIMEHeader::Encoding::kEightBit:
    case MIMEHeader::Encoding::kSevenBit:
    case MIMEHeader::Encoding::kBinary:
      // These encodings carry the bytes as they are.
      data.AppendVector(content);
      break;
    default:
      // kUnknown was remapped to kBinary above, so this branch is not expected
      // to be taken.
      DVLOG(1) << "Invalid encoding for MHTML part.";
      return nullptr;
  }
  scoped_refptr<SharedBuffer> content_buffer =
      SharedBuffer::Create(std::move(data));
  // FIXME: the URL in the MIME header could be relative, we should resolve it
  // if it is.  The specs mentions 5 ways to resolve a URL:
  // http://tools.ietf.org/html/rfc2557#section-5
  // IE and Firefox (UNMht) seem to generate only absolute URLs.
  KURL location = KURL(NullURL(), mime_header.ContentLocation());
  return MakeGarbageCollected<ArchiveResource>(
      content_buffer, location, mime_header.ContentID(),
      AtomicString(mime_header.ContentType()),
      AtomicString(mime_header.Charset()));
}

// static
// Converts a Content-ID header value of the form "<id>" into the URL
// "cid:id". Returns a default-constructed KURL when |content_id| is not
// wrapped in angle brackets or has nothing between them.
KURL MHTMLParser::ConvertContentIDToURI(const String& content_id) {
  // This function is based primarily on an example from rfc2557 in section
  // 9.5, but also based on more normative parts of specs like:
  // - rfc2557 - MHTML - section 8.3 - "Use of the Content-ID header and CID
  //                                    URLs"
  // - rfc1738 - URL - section 4 (reserved scheme names;  includes "cid")
  // - rfc2387 - multipart/related - section 3.4 - "Syntax" (cid := msg-id)
  // - rfc0822 - msg-id = "<" addr-spec ">"; addr-spec = local-part "@" domain

  const wtf_size_t id_length = content_id.length();
  const bool is_bracketed_id = id_length > 2 && content_id.StartsWith('<') &&
                               content_id.EndsWith('>');
  if (!is_bracketed_id) {
    return KURL();
  }

  // Drop the enclosing '<' and '>' and prefix what is left with the scheme.
  StringBuilder cid_uri;
  cid_uri.Append("cid:");
  cid_uri.Append(content_id, 1, id_length - 2);
  return KURL(NullURL(), cid_uri.ToString());
}

}  // namespace blink