File: constructor_string_parser.cc

package info (click to toggle)
chromium 139.0.7258.127-1
links: PTS, VCS
area: main
in suites:
size: 6,122,068 kB
sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (484 lines) | stat: -rw-r--r-- 17,929 bytes
parent folder | download | duplicates (5)
// Copyright 2021 The Chromium Authors
// Use of this source code is governed by an MIT-style license that can be
// found in the LICENSE file or at https://opensource.org/licenses/MIT.

#include "third_party/liburlpattern/constructor_string_parser.h"

#include <string_view>
#include <vector>

#include "base/types/expected.h"
#include "third_party/abseil-cpp/absl/base/macros.h"

namespace liburlpattern {

// Remembers the constructor string to be parsed.  No tokenizing or parsing
// happens here; all of that work is done by Parse().
//
// NOTE(review): MakeComponentString() hands out `std::string_view`s obtained
// from `input_.substr()`, so `input_` is presumably a non-owning view and the
// caller must keep the underlying string alive — confirm against the header.
ConstructorStringParser::ConstructorStringParser(
    std::string_view constructor_string)
    : input_(constructor_string) {}

// Tokenizes `input_` in lenient mode and walks the resulting token list with a
// state machine, storing each URL component it finds in `result_`.
//
// `protocol_matches_special_scheme` is invoked with the protocol component
// string once the protocol terminator has been found.  Its value decides
// whether the pattern is treated as a "standard" URL
// (`should_treat_as_standard_url_`); its error, if any, is returned from this
// function unchanged.
//
// Returns the tokenizer's or the callback's error on failure and
// `absl::OkStatus()` otherwise.  The parser must still be in its initial state
// when this is called (asserted below).
absl::Status ConstructorStringParser::Parse(
    ProtocolCheckCallback protocol_matches_special_scheme) {
  ABSL_ASSERT(state_ == StringParseState::kInit);
  ABSL_ASSERT(token_index_ == 0u);

  auto tokenize_result = Tokenize(input_, TokenizePolicy::kLenient);
  if (!tokenize_result.has_value()) {
    // This should not happen with kLenient mode, but we handle it anyway.
    return tokenize_result.error();
  }

  token_list_ = std::move(tokenize_result.value());

  // When constructing a pattern using structured input like
  // `new URLPattern({ pathname: 'foo' })` any missing components will be
  // defaulted to wildcards.
  //
  // Components which ordinarily appear "later" than those specified are instead
  // treated as wildcards, which avoids the need to explicitly wildcard each of
  // them. As a result, these values are not initialized to be empty until a
  // "later" component is seen.

  // Iterate through the list of tokens and update our state machine as we go.
  // The helpers called below may rewind or skip ahead; when they do, they set
  // `token_increment_` to zero so that the loop step leaves `token_index_`
  // exactly where the helper put it.
  for (; token_index_ < token_list_.size(); token_index_ += token_increment_) {
    // Reset back to our default `token_increment_` value.
    token_increment_ = 1;

    // All states must respect the end of the token list.  The liburlpattern
    // tokenizer guarantees that the last token will have the type `kEnd`.
    if (token_list_[token_index_].type == TokenType::kEnd) {
      // If we failed to find a protocol terminator then we are still in
      // relative mode.  We now need to determine the first component of the
      // relative URL.
      if (state_ == StringParseState::kInit) {
        // Reset back to the start of the input string.
        Rewind();

        // If the string begins with `?` then it's a relative search component.
        // If it starts with `#` then it's a relative hash component.  Otherwise
        // it's a relative pathname.
        //
        // In each case we initialize any components following the initial
        // component to be empty string.
        if (IsHashPrefix()) {
          ChangeState(StringParseState::kHash, Skip(1));
        } else if (IsSearchPrefix()) {
          ChangeState(StringParseState::kSearch, Skip(1));
        } else {
          ChangeState(StringParseState::kPathname, Skip(0));
        }
        continue;
      }

      // If we failed to find an `@`, then there is no username and password.
      // We should rewind and process the data as a hostname.
      else if (state_ == StringParseState::kAuthority) {
        RewindAndSetState(StringParseState::kHostname);
        continue;
      }

      // Any other state simply ends here: store the component that was being
      // scanned and stop iterating.
      ChangeState(StringParseState::kDone, Skip(0));
      break;
    }

    // In addition, all states must handle pattern groups.  We do not permit
    // a component to end in the middle of a pattern group.  Therefore we skip
    // past any tokens that are within `{` and `}`.  Note, the tokenizer
    // handles grouping `(` and `)` and `:foo` groups for us automatically, so
    // we don't need special code for them here.
    if (IsGroupOpen()) {
      group_depth_ += 1;
      continue;
    }

    if (group_depth_ > 0) {
      if (IsGroupClose()) {
        // The closing token itself falls through to the state handling below.
        group_depth_ -= 1;
      } else {
        continue;
      }
    }

    switch (state_) {
      case StringParseState::kInit:
        if (IsProtocolSuffix()) {
          // Update the state to expect the start of an absolute URL.
          RewindAndSetState(StringParseState::kProtocol);
        }
        break;

      case StringParseState::kProtocol:
        // If we find the end of the protocol component...
        if (IsProtocolSuffix()) {
          base::expected protocol_check_result =
              protocol_matches_special_scheme(MakeComponentString());
          if (!protocol_check_result.has_value()) {
            return protocol_check_result.error();
          }
          should_treat_as_standard_url_ = protocol_check_result.value();

          // By default we treat this as a "cannot-be-a-base-URL" or what chrome
          // calls a "path" URL.  In this case we go straight to the pathname
          // component.  The hostname and port are left with their default
          // empty string values.
          StringParseState next_state = StringParseState::kPathname;
          Skip skip = Skip(1);

          // If there are authority slashes, like `https://`, then
          // we must transition to the authority section of the URLPattern.
          if (NextIsAuthoritySlashes()) {
            next_state = StringParseState::kAuthority;
            skip = Skip(3);
          }

          // If there are no authority slashes, but the protocol is special
          // then we still go to the authority section as this is a "standard"
          // URL.  This differs from the above case since we don't need to skip
          // the extra slashes.
          else if (should_treat_as_standard_url_) {
            next_state = StringParseState::kAuthority;
          }

          ChangeState(next_state, skip);
        }
        break;

      case StringParseState::kAuthority:
        // Before going to the hostname state we must see if there is an
        // identity of the form:
        //
        //  <username>:<password>@<hostname>
        //
        // We check for this by looking for the `@` character.  The username
        // and password are themselves each optional, so the `:` may not be
        // present.  If we see the `@` we just go to the username state
        // and let it proceed until it hits either the password separator
        // or the `@` terminator.
        if (IsIdentityTerminator()) {
          RewindAndSetState(StringParseState::kUsername);
        }

        // Stop searching for the `@` character if we see the beginning
        // of the pathname, search, or hash components.
        else if (IsPathnameStart() || IsSearchPrefix() || IsHashPrefix()) {
          RewindAndSetState(StringParseState::kHostname);
        }
        break;

      case StringParseState::kUsername:
        // If we find a `:` then transition to the password component state.
        if (IsPasswordPrefix()) {
          ChangeState(StringParseState::kPassword, Skip(1));
        }

        // If we find a `@` then transition to the hostname component state.
        else if (IsIdentityTerminator()) {
          ChangeState(StringParseState::kHostname, Skip(1));
        }
        break;

      case StringParseState::kPassword:
        // If we find a `@` then transition to the hostname component state.
        if (IsIdentityTerminator()) {
          ChangeState(StringParseState::kHostname, Skip(1));
        }
        break;

      case StringParseState::kHostname:
        // Track whether we are inside ipv6 address brackets.
        if (IsIPv6Open()) {
          hostname_ipv6_bracket_depth_ += 1;
        } else if (IsIPv6Close()) {
          hostname_ipv6_bracket_depth_ -= 1;
        }

        // If we find a `:` then we transition to the port component state.
        // However, we ignore `:` when parsing an ipv6 address.
        else if (IsPortPrefix() && !hostname_ipv6_bracket_depth_) {
          ChangeState(StringParseState::kPort, Skip(1));
        }

        // If we find a `/` then we transition to the pathname component state.
        // The `/` is kept as part of the pathname, hence `Skip(0)`.
        else if (IsPathnameStart()) {
          ChangeState(StringParseState::kPathname, Skip(0));
        }

        // If we find a `?` then we transition to the search component state.
        else if (IsSearchPrefix()) {
          ChangeState(StringParseState::kSearch, Skip(1));
        }

        // If we find a `#` then we transition to the hash component state.
        else if (IsHashPrefix()) {
          ChangeState(StringParseState::kHash, Skip(1));
        }
        break;

      case StringParseState::kPort:
        // If we find a `/` then we transition to the pathname component state.
        if (IsPathnameStart()) {
          ChangeState(StringParseState::kPathname, Skip(0));
        }
        // If we find a `?` then we transition to the search component state.
        else if (IsSearchPrefix()) {
          ChangeState(StringParseState::kSearch, Skip(1));
        }
        // If we find a `#` then we transition to the hash component state.
        else if (IsHashPrefix()) {
          ChangeState(StringParseState::kHash, Skip(1));
        }
        break;
      case StringParseState::kPathname:
        // If we find a `?` then we transition to the search component state.
        if (IsSearchPrefix()) {
          ChangeState(StringParseState::kSearch, Skip(1));
        }
        // If we find a `#` then we transition to the hash component state.
        else if (IsHashPrefix()) {
          ChangeState(StringParseState::kHash, Skip(1));
        }
        break;
      case StringParseState::kSearch:
        // If we find a `#` then we transition to the hash component state.
        if (IsHashPrefix()) {
          ChangeState(StringParseState::kHash, Skip(1));
        }
        break;
      case StringParseState::kHash:
        // Nothing to do here as we are just looking for the end.
        break;
      case StringParseState::kDone:
        // The loop breaks as soon as kDone is entered, so this is unreachable.
        ABSL_ASSERT(false);
        break;
    };
  }

  // Special case: if you specify a hostname, it is assumed that you want the
  // default port if you didn't specify one. This ensures that
  // https://example.com/* does not match https://example.com:8443/, which is
  // another origin entirely.
  if (result_.hostname && !result_.port) {
    result_.port = "";
  }
  return absl::OkStatus();
}

// Commits the component that was being scanned and moves the state machine to
// `new_state`.
//
// The text between `component_start_` and `token_index_` is stored in the
// `result_` field that belongs to the state being left.  If the transition
// jumps over one or more components, every jumped-over component that is still
// unset receives its explicit default value.  Finally the token cursor is
// advanced by `skip` tokens via ChangeStateWithoutSettingComponent().
void ConstructorStringParser::ChangeState(StringParseState new_state,
                                          Skip skip) {
  switch (state_) {
    // These two states do not correspond to a component, so leaving them
    // stores nothing.
    case StringParseState::kInit:
    case StringParseState::kAuthority:
      break;

    case StringParseState::kProtocol:
      result_.protocol = MakeComponentString();
      break;

    case StringParseState::kUsername:
      result_.username = MakeComponentString();
      break;

    case StringParseState::kPassword:
      result_.password = MakeComponentString();
      break;

    case StringParseState::kHostname:
      result_.hostname = MakeComponentString();
      break;

    case StringParseState::kPort:
      result_.port = MakeComponentString();
      break;

    case StringParseState::kPathname:
      result_.pathname = MakeComponentString();
      break;

    case StringParseState::kSearch:
      result_.search = MakeComponentString();
      break;

    case StringParseState::kHash:
      result_.hash = MakeComponentString();
      break;

    // Nothing may transition out of the terminal state.
    case StringParseState::kDone:
      ABSL_ASSERT(false);
      break;
  }

  // Defaults are only filled in for a transition between two real components:
  // not when leaving the initial state and not when finishing the parse.
  const bool fill_skipped_defaults = state_ != StringParseState::kInit &&
                                     new_state != StringParseState::kDone;
  if (fill_skipped_defaults) {
    // The range checks below depend on the enumerators being declared in
    // component order (authority/username/password are handled specially and
    // are not part of this ordering requirement).
    static_assert(StringParseState::kHostname < StringParseState::kPort);
    static_assert(StringParseState::kPort < StringParseState::kPathname);
    static_assert(StringParseState::kPathname < StringParseState::kSearch);
    static_assert(StringParseState::kSearch < StringParseState::kHash);

    // True when `component` lies strictly between the state being left and the
    // state being entered, i.e. the transition skipped over it.
    const auto jumps_over = [this, new_state](StringParseState component) {
      return state_ < component && new_state > component;
    };

    if (jumps_over(StringParseState::kHostname) && !result_.hostname) {
      result_.hostname = "";
    }
    if (jumps_over(StringParseState::kPort) && !result_.port) {
      result_.port = "";
    }
    if (jumps_over(StringParseState::kPathname) && !result_.pathname) {
      // A standard URL always has at least the root path.
      result_.pathname = should_treat_as_standard_url_ ? "/" : "";
    }
    if (jumps_over(StringParseState::kSearch) && !result_.search) {
      result_.search = "";
    }
  }

  ChangeStateWithoutSettingComponent(new_state, skip);
}

// Switches to `new_state` and repositions the token cursor without storing
// anything in `result_`.
//
// `skip` is the number of tokens to step over (for example a separator) so
// that both `token_index_` and `component_start_` end up on the first token of
// the new component.
void ConstructorStringParser::ChangeStateWithoutSettingComponent(
    StringParseState new_state,
    Skip skip) {
  // Step over the skipped tokens; the token we land on begins the new
  // component.
  token_index_ += skip;
  component_start_ = token_index_;

  // The cursor has already been moved by hand, so the parse loop's own step
  // must add nothing for this iteration.
  token_increment_ = 0;

  state_ = new_state;
}

// Moves the token cursor back to the first token of the component currently
// being scanned.  The increment is zeroed so that the parse loop re-examines
// that token instead of stepping past it.
void ConstructorStringParser::Rewind() {
  token_increment_ = 0;
  token_index_ = component_start_;
}

// Re-scans the current component from its first token under a different
// state.  Unlike ChangeState(), nothing is written to `result_` and
// `component_start_` is left untouched.
void ConstructorStringParser::RewindAndSetState(StringParseState new_state) {
  state_ = new_state;
  Rewind();
}

// Bounds-tolerant token lookup: returns the token at `index`, or the final
// token of the list when `index` is past the end.  The final token is asserted
// to be the `kEnd` token, so out-of-range look-ahead (and wrapped-around
// look-behind) reads as "end of input".
const Token& ConstructorStringParser::SafeToken(size_t index) const {
  if (index >= token_list_.size()) {
    ABSL_ASSERT(!token_list_.empty());
    ABSL_ASSERT(token_list_.back().type == TokenType::kEnd);
    return token_list_.back();
  }
  return token_list_[index];
}

bool ConstructorStringParser::IsNonSpecialPatternChar(size_t index,
                                                      const char* value) const {
  const Token& token = SafeToken(index);
  return token.value == value && (token.type == TokenType::kChar ||
                                  token.type == TokenType::kEscapedChar ||
                                  token.type == TokenType::kInvalidChar);
}

// Returns true if the current token is a literal `:`, the character that
// terminates the protocol component.
bool ConstructorStringParser::IsProtocolSuffix() const {
  static constexpr char kProtocolSuffix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kProtocolSuffix);
}

// Returns true if the two tokens immediately after the current one are both a
// literal `/`, as in the `//` of `https://`.
bool ConstructorStringParser::NextIsAuthoritySlashes() const {
  static constexpr char kSlash[] = "/";
  if (!IsNonSpecialPatternChar(token_index_ + 1, kSlash)) {
    return false;
  }
  return IsNonSpecialPatternChar(token_index_ + 2, kSlash);
}

// Returns true if the current token is a literal `@`, which ends the
// username/password portion of the authority.
bool ConstructorStringParser::IsIdentityTerminator() const {
  static constexpr char kIdentityTerminator[] = "@";
  return IsNonSpecialPatternChar(token_index_, kIdentityTerminator);
}

// Returns true if the current token is a literal `:`, which separates the
// username from the password.
bool ConstructorStringParser::IsPasswordPrefix() const {
  static constexpr char kPasswordPrefix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kPasswordPrefix);
}

// Returns true if the current token is a literal `:`, which separates the
// hostname from the port.
bool ConstructorStringParser::IsPortPrefix() const {
  static constexpr char kPortPrefix[] = ":";
  return IsNonSpecialPatternChar(token_index_, kPortPrefix);
}

// Returns true if the current token is a literal `/`, which begins the
// pathname component.
bool ConstructorStringParser::IsPathnameStart() const {
  static constexpr char kPathnameStart[] = "/";
  return IsNonSpecialPatternChar(token_index_, kPathnameStart);
}

// Returns true if the current token is a `?` that should begin the search
// component, as opposed to a `?` acting as an "optional" pattern modifier.
bool ConstructorStringParser::IsSearchPrefix() const {
  // A `?` tokenized as an ordinary character is always the search prefix.
  if (IsNonSpecialPatternChar(token_index_, "?")) {
    return true;
  }

  // Anything whose text is not `?` cannot be the search prefix.
  const bool is_question_mark = token_list_[token_index_].value == "?";
  if (!is_question_mark) {
    return false;
  }

  // The only remaining way to see a `?` is as a modifier token.
  ABSL_ASSERT(SafeToken(token_index_).type == TokenType::kOtherModifier);

  // A modifier is only meaningful directly after a matching group, so the
  // token in front of the `?` decides how to read it:
  //
  //  https://example.com/foo?bar
  //    The `?` follows the plain character `o`, so it cannot be a modifier
  //    and is treated as the search prefix.
  //
  //  https://example.com/:name?bar
  //    The `?` follows the `:name` group (type kName), so it is a real
  //    modifier.  A developer who wants the search prefix here has to escape
  //    the question mark, like `:name\\?bar`.
  //
  // When `token_index_` is zero the subtraction wraps around and SafeToken()
  // hands back the kEnd token.  That yields true, which is the right answer
  // because a pattern cannot normally begin with an unescaped `?`.
  switch (SafeToken(token_index_ - 1).type) {
    case TokenType::kName:
    case TokenType::kRegex:
    case TokenType::kClose:
    case TokenType::kAsterisk:
      return false;
    default:
      return true;
  }
}

// Returns true if the current token is a literal `#`, which begins the hash
// component.
bool ConstructorStringParser::IsHashPrefix() const {
  static constexpr char kHashPrefix[] = "#";
  return IsNonSpecialPatternChar(token_index_, kHashPrefix);
}

bool ConstructorStringParser::IsGroupOpen() const {
  return token_list_[token_index_].type == TokenType::kOpen;
}

bool ConstructorStringParser::IsGroupClose() const {
  return token_list_[token_index_].type == TokenType::kClose;
}

// Returns true if the current token is a literal `[`, the opening bracket of
// an IPv6 address in the hostname.
bool ConstructorStringParser::IsIPv6Open() const {
  static constexpr char kIPv6Open[] = "[";
  return IsNonSpecialPatternChar(token_index_, kIPv6Open);
}

// Returns true if the current token is a literal `]`, the closing bracket of
// an IPv6 address in the hostname.
bool ConstructorStringParser::IsIPv6Close() const {
  static constexpr char kIPv6Close[] = "]";
  return IsNonSpecialPatternChar(token_index_, kIPv6Close);
}

std::string_view ConstructorStringParser::MakeComponentString() const {
  ABSL_ASSERT(token_index_ < token_list_.size());
  const auto& token = token_list_[token_index_];

  size_t component_char_start = SafeToken(component_start_).index;

  ABSL_ASSERT(component_char_start <= input_.size());
  ABSL_ASSERT(token.index >= component_char_start);
  ABSL_ASSERT(token.index < input_.size() ||
              (token.index == input_.size() && token.type == TokenType::kEnd));
  return input_.substr(component_char_start,
                       token.index - component_char_start);
}

}  // namespace liburlpattern