File: autofill_profile_comparator.cc

package info (click to toggle)
chromium-browser 57.0.2987.98-1~deb8u1
links: PTS, VCS
area: main
in suites: jessie
size: 2,637,852 kB
ctags: 2,544,394
sloc: cpp: 12,815,961; ansic: 3,676,222; python: 1,147,112; asm: 526,608; java: 523,212; xml: 286,794; perl: 92,654; sh: 86,408; objc: 73,271; makefile: 27,698; cs: 18,487; yacc: 13,031; tcl: 12,957; pascal: 4,875; ml: 4,716; lex: 3,904; sql: 3,862; ruby: 1,982; lisp: 1,508; php: 1,368; exp: 404; awk: 325; csh: 117; jsp: 39; sed: 37
file content (959 lines) | stat: -rw-r--r-- 36,925 bytes
// Copyright 2016 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/autofill/core/browser/autofill_profile_comparator.h"

#include <algorithm>
#include <vector>

#include "base/i18n/case_conversion.h"
#include "base/i18n/char_iterator.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_split.h"
#include "base/strings/string_util.h"
#include "base/strings/utf_string_conversion_utils.h"
#include "base/strings/utf_string_conversions.h"
#include "components/autofill/core/browser/address_rewriter.h"
#include "components/autofill/core/browser/autofill_country.h"
#include "components/autofill/core/browser/autofill_data_util.h"
#include "components/autofill/core/browser/state_names.h"
#include "third_party/libphonenumber/phonenumber_api.h"

using i18n::phonenumbers::PhoneNumberUtil;
using base::UTF16ToUTF8;
using base::UTF8ToUTF16;

namespace autofill {
namespace {

const base::char16 kSpace[] = {L' ', L'\0'};

bool ContainsNewline(base::StringPiece16 text) {
  return text.find('\n') != base::StringPiece16::npos;
}

std::ostream& operator<<(std::ostream& os,
                         const ::i18n::phonenumbers::PhoneNumber& n) {
  os << "country_code: " << n.country_code() << " "
     << "national_number: " << n.national_number();
  if (n.has_extension())
    os << " extension: \"" << n.extension() << "\"";
  if (n.has_italian_leading_zero())
    os << " italian_leading_zero: " << n.italian_leading_zero();
  if (n.has_number_of_leading_zeros())
    os << " number_of_leading_zeros: " << n.number_of_leading_zeros();
  if (n.has_raw_input())
    os << " raw_input: \"" << n.raw_input() << "\"";
  return os;
}

}  // namespace

AutofillProfileComparator::AutofillProfileComparator(
    const base::StringPiece& app_locale)
    : app_locale_(app_locale.data(), app_locale.size()) {
  // Use ICU transliteration to remove diacritics and fold case.
  // See http://userguide.icu-project.org/transforms/general
  UErrorCode status = U_ZERO_ERROR;
  std::unique_ptr<icu::Transliterator> transliterator(
      icu::Transliterator::createInstance(
          "NFD; [:Nonspacing Mark:] Remove; Lower; NFC", UTRANS_FORWARD,
          status));
  if (U_FAILURE(status) || transliterator == nullptr) {
    // TODO(rogerm): Add a histogram to count how often this happens.
    LOG(ERROR) << "Failed to create ICU Transliterator: "
               << u_errorName(status);
  }

  transliterator_ = std::move(transliterator);
}

AutofillProfileComparator::~AutofillProfileComparator() {}

base::string16 AutofillProfileComparator::NormalizeForComparison(
    base::StringPiece16 text,
    AutofillProfileComparator::WhitespaceSpec whitespace_spec) const {
  // This algorithm is not designed to be perfect, we could get arbitrarily
  // fancy here trying to canonicalize address lines. Instead, this is designed
  // to handle common cases for all types of data (addresses and names) without
  // the need of domain-specific logic.
  //
  // 1. Convert punctuation to spaces and normalize all whitespace to spaces.
  //    This will convert "Mid-Island Plz." -> "Mid Island Plz " (the trailing
  //    space will be trimmed off outside of the end of the loop).
  //
  // 2. Collapse consecutive punctuation/whitespace characters to a single
  //    space. We pretend the string has already started with whitespace in
  //    order to trim leading spaces.
  //
  // 3. Remove diacritics (accents and other non-spacing marks) and perform
  //    case folding to lower-case.
  base::string16 result;
  result.reserve(text.length());
  bool previous_was_whitespace = (whitespace_spec == RETAIN_WHITESPACE);
  for (base::i18n::UTF16CharIterator iter(text.data(), text.length());
       !iter.end(); iter.Advance()) {
    switch (u_charType(iter.get())) {
      // Punctuation
      case U_DASH_PUNCTUATION:
      case U_START_PUNCTUATION:
      case U_END_PUNCTUATION:
      case U_CONNECTOR_PUNCTUATION:
      case U_OTHER_PUNCTUATION:
      // Whitespace
      case U_CONTROL_CHAR:  // To escape the '\n' character.
      case U_SPACE_SEPARATOR:
      case U_LINE_SEPARATOR:
      case U_PARAGRAPH_SEPARATOR:
        if (!previous_was_whitespace && whitespace_spec == RETAIN_WHITESPACE) {
          result.push_back(' ');
          previous_was_whitespace = true;
        }
        break;

      default:
        previous_was_whitespace = false;
        base::WriteUnicodeCharacter(iter.get(), &result);
        break;
    }
  }

  // Trim off trailing whitespace if we left one.
  if (previous_was_whitespace && !result.empty())
    result.resize(result.size() - 1);

  if (transliterator_ == nullptr)
    return result;

  icu::UnicodeString value = icu::UnicodeString(result.data(), result.length());
  transliterator_->transliterate(value);
  return base::string16(value.getBuffer(), value.length());
}

bool AutofillProfileComparator::AreMergeable(const AutofillProfile& p1,
                                             const AutofillProfile& p2) const {
  // Sorted in order to relative expense of the tests to fail early and cheaply
  // if possible.
  DVLOG(1) << "Comparing profiles:\np1 = " << p1 << "\np2 = " << p2;

  if (!HaveMergeableEmailAddresses(p1, p2)) {
    DVLOG(1) << "Different email addresses.";
    return false;
  }

  if (!HaveMergeableCompanyNames(p1, p2)) {
    DVLOG(1) << "Different email company names.";
    return false;
  }

  if (!HaveMergeablePhoneNumbers(p1, p2)) {
    DVLOG(1) << "Different phone numbers.";
    return false;
  }

  if (!HaveMergeableNames(p1, p2)) {
    DVLOG(1) << "Different names.";
    return false;
  }

  if (!HaveMergeableAddresses(p1, p2)) {
    DVLOG(1) << "Different addresses.";
    return false;
  }

  DVLOG(1) << "Profiles are mergeable.";
  return true;
}

bool AutofillProfileComparator::MergeNames(const AutofillProfile& p1,
                                           const AutofillProfile& p2,
                                           NameInfo* name_info) const {
  DCHECK(HaveMergeableNames(p1, p2));

  const AutofillType kFullName(NAME_FULL);
  const base::string16& full_name_1 = p1.GetInfo(kFullName, app_locale_);
  const base::string16& full_name_2 = p2.GetInfo(kFullName, app_locale_);

  const base::string16& normalized_full_name_1 =
      NormalizeForComparison(full_name_1);
  const base::string16& normalized_full_name_2 =
      NormalizeForComparison(full_name_2);

  const base::string16* best_name = nullptr;
  if (normalized_full_name_1.empty()) {
    // p1 has no name, so use the name from p2.
    best_name = &full_name_2;
  } else if (normalized_full_name_2.empty()) {
    // p2 has no name, so use the name from p1.
    best_name = &full_name_1;
  } else if (data_util::IsCJKName(full_name_1) &&
             data_util::IsCJKName(full_name_2)) {
    // Use a separate logic for CJK names.
    return MergeCJKNames(p1, p2, name_info);
  } else if (IsNameVariantOf(normalized_full_name_1, normalized_full_name_2)) {
    // full_name_2 is a variant of full_name_1.
    best_name = &full_name_1;
  } else {
    // If the assertion that p1 and p2 have mergeable names is true, then
    // full_name_1 must be a name variant of full_name_2;
    best_name = &full_name_2;
  }

  name_info->SetInfo(AutofillType(NAME_FULL), *best_name, app_locale_);
  return true;
}

bool AutofillProfileComparator::MergeCJKNames(
    const AutofillProfile& p1,
    const AutofillProfile& p2,
    NameInfo* info) const {
  DCHECK(data_util::IsCJKName(p1.GetInfo(AutofillType(NAME_FULL),
                                         app_locale_)));
  DCHECK(data_util::IsCJKName(p2.GetInfo(AutofillType(NAME_FULL),
                                         app_locale_)));

  struct Name {
    base::string16 given;
    base::string16 surname;
    base::string16 full;
  };

  Name name1 = {
    p1.GetRawInfo(NAME_FIRST),
    p1.GetRawInfo(NAME_LAST),
    p1.GetRawInfo(NAME_FULL)
  };
  Name name2 = {
    p2.GetRawInfo(NAME_FIRST),
    p2.GetRawInfo(NAME_LAST),
    p2.GetRawInfo(NAME_FULL)
  };

  const Name* most_recent_name =
      p2.use_date() >= p1.use_date() ? &name2 : &name1;

  // The two |NameInfo| objects might disagree about what the full name looks
  // like. If only one of the two has an explicit (user-entered) full name, use
  // that as ground truth. Otherwise, use the most recent profile.
  const Name* full_name_candidate;
  if (name1.full.empty()) {
    full_name_candidate = &name2;
  } else if (name2.full.empty()) {
    full_name_candidate = &name1;
  } else {
    full_name_candidate = most_recent_name;
  }

  // The two |NameInfo| objects might disagree about how the name is split into
  // given/surname. If only one of the two has an explicit (user-entered)
  // given/surname pair, use that as ground truth. Otherwise, use the most
  // recent profile.
  const Name* name_parts_candidate;
  if (name1.given.empty() || name1.surname.empty()) {
    name_parts_candidate = &name2;
  } else if (name2.given.empty() || name2.surname.empty()) {
    name_parts_candidate = &name1;
  } else {
    name_parts_candidate = most_recent_name;
  }

  if (name_parts_candidate->given.empty() ||
      name_parts_candidate->surname.empty()) {
    // The name was not split correctly into a given/surname, so use the logic
    // from |SplitName()|.
    info->SetInfo(AutofillType(NAME_FULL), full_name_candidate->full,
                  app_locale_);
  } else {
    // The name was already split into a given/surname, so keep those intact.
    if (!full_name_candidate->full.empty()) {
      info->SetRawInfo(NAME_FULL, full_name_candidate->full);
    }
    info->SetRawInfo(NAME_FIRST, name_parts_candidate->given);
    info->SetRawInfo(NAME_LAST, name_parts_candidate->surname);
  }

  return true;
}

bool AutofillProfileComparator::MergeEmailAddresses(
    const AutofillProfile& p1,
    const AutofillProfile& p2,
    EmailInfo* email_info) const {
  DCHECK(HaveMergeableEmailAddresses(p1, p2));

  const AutofillType kEmailAddress(EMAIL_ADDRESS);
  const base::string16& e1 = p1.GetInfo(kEmailAddress, app_locale_);
  const base::string16& e2 = p2.GetInfo(kEmailAddress, app_locale_);
  const base::string16* best = nullptr;

  if (e1.empty()) {
    best = &e2;
  } else if (e2.empty()) {
    best = &e1;
  } else {
    best = p2.use_date() > p1.use_date() ? &e2 : &e1;
  }

  email_info->SetInfo(kEmailAddress, *best, app_locale_);
  return true;
}

bool AutofillProfileComparator::MergeCompanyNames(
    const AutofillProfile& p1,
    const AutofillProfile& p2,
    CompanyInfo* company_info) const {
  const AutofillType kCompanyName(COMPANY_NAME);
  const base::string16& c1 = p1.GetInfo(kCompanyName, app_locale_);
  const base::string16& c2 = p2.GetInfo(kCompanyName, app_locale_);
  const base::string16* best = nullptr;

  DCHECK(HaveMergeableCompanyNames(p1, p2))
      << "Company names are not mergeable: '" << c1 << "' vs '" << c2 << "'";

  CompareTokensResult result =
      CompareTokens(NormalizeForComparison(c1), NormalizeForComparison(c2));
  switch (result) {
    case DIFFERENT_TOKENS:
    default:
      NOTREACHED() << "Unexpected mismatch: '" << c1 << "' vs '" << c2 << "'";
      return false;
    case S1_CONTAINS_S2:
      best = &c1;
      break;
    case S2_CONTAINS_S1:
      best = &c2;
      break;
    case SAME_TOKENS:
      best = p2.use_date() > p1.use_date() ? &c2 : &c1;
      break;
  }

  company_info->SetInfo(kCompanyName, *best, app_locale_);
  return true;
}

bool AutofillProfileComparator::MergePhoneNumbers(
    const AutofillProfile& p1,
    const AutofillProfile& p2,
    PhoneNumber* phone_number) const {
  const ServerFieldType kWholePhoneNumber = PHONE_HOME_WHOLE_NUMBER;
  const base::string16& s1 = p1.GetRawInfo(kWholePhoneNumber);
  const base::string16& s2 = p2.GetRawInfo(kWholePhoneNumber);

  DCHECK(HaveMergeablePhoneNumbers(p1, p2))
      << "Phone numbers are not mergeable: '" << s1 << "' vs '" << s2 << "'";

  if (s1.empty()) {
    phone_number->SetRawInfo(kWholePhoneNumber, s2);
    return true;
  }

  if (s2.empty() || s1 == s2) {
    phone_number->SetRawInfo(kWholePhoneNumber, s1);
    return true;
  }

  // Figure out a country code hint.
  const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
  std::string region = UTF16ToUTF8(GetNonEmptyOf(p1, p2, kCountryCode));
  if (region.empty())
    region = AutofillCountry::CountryCodeForLocale(app_locale_);

  // Parse the phone numbers.
  PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();

  ::i18n::phonenumbers::PhoneNumber n1;
  if (phone_util->ParseAndKeepRawInput(UTF16ToUTF8(s1), region, &n1) !=
      PhoneNumberUtil::NO_PARSING_ERROR) {
    return false;
  }

  ::i18n::phonenumbers::PhoneNumber n2;
  if (phone_util->ParseAndKeepRawInput(UTF16ToUTF8(s2), region, &n2) !=
      PhoneNumberUtil::NO_PARSING_ERROR) {
    return false;
  }

  ::i18n::phonenumbers::PhoneNumber merged_number;
  DCHECK_EQ(n1.country_code(), n2.country_code());
  merged_number.set_country_code(n1.country_code());
  merged_number.set_national_number(
      std::max(n1.national_number(), n2.national_number()));
  if (n1.has_extension() && !n1.extension().empty()) {
    merged_number.set_extension(n1.extension());
  } else if (n2.has_extension() && !n2.extension().empty()) {
    merged_number.set_extension(n2.extension());
  }
  if (n1.has_italian_leading_zero() || n2.has_italian_leading_zero()) {
    merged_number.set_italian_leading_zero(n1.italian_leading_zero() ||
                                           n2.italian_leading_zero());
  }
  if (n1.has_number_of_leading_zeros() || n2.has_number_of_leading_zeros()) {
    merged_number.set_number_of_leading_zeros(
        std::max(n1.number_of_leading_zeros(), n2.number_of_leading_zeros()));
  }

  PhoneNumberUtil::PhoneNumberFormat format =
      region.empty() ? PhoneNumberUtil::NATIONAL
                     : PhoneNumberUtil::INTERNATIONAL;

  std::string new_number;
  phone_util->Format(merged_number, format, &new_number);

  DVLOG(2) << "n1 = {" << n1 << "}";
  DVLOG(2) << "n2 = {" << n2 << "}";
  DVLOG(2) << "merged_number = {" << merged_number << "}";
  DVLOG(2) << "new_number = \"" << new_number << "\"";

  // Check if it's a North American number that's missing the area code.
  // Libphonenumber doesn't know how to format short numbers; it will still
  // include the country code prefix.
  if (merged_number.country_code() == 1 &&
      merged_number.national_number() <= 9999999 &&
      base::StartsWith(new_number, "+1", base::CompareCase::SENSITIVE)) {
    size_t offset = 2;  // The char just after "+1".
    while (offset < new_number.size() &&
           base::IsAsciiWhitespace(new_number[offset])) {
      ++offset;
    }
    new_number = new_number.substr(offset);
  }

  phone_number->SetRawInfo(kWholePhoneNumber, UTF8ToUTF16(new_number));

  return true;
}

bool AutofillProfileComparator::MergeAddresses(const AutofillProfile& p1,
                                               const AutofillProfile& p2,
                                               Address* address) const {
  DCHECK(HaveMergeableAddresses(p1, p2));

  // One of the countries is empty or they are the same modulo case, so we just
  // have to find the non-empty one, if any.
  const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
  const base::string16& country_code =
      base::i18n::ToUpper(GetNonEmptyOf(p1, p2, kCountryCode));
  address->SetInfo(kCountryCode, country_code, app_locale_);

  // One of the zip codes is empty, they are the same, or one is a substring
  // of the other. We prefer the most recently used zip code.
  const AutofillType kZipCode(ADDRESS_HOME_ZIP);
  const base::string16& zip1 = p1.GetInfo(kZipCode, app_locale_);
  const base::string16& zip2 = p2.GetInfo(kZipCode, app_locale_);
  if (zip1.empty()) {
    address->SetInfo(kZipCode, zip2, app_locale_);
  } else if (zip2.empty()) {
    address->SetInfo(kZipCode, zip1, app_locale_);
  } else {
    address->SetInfo(kZipCode, (p2.use_date() > p1.use_date() ? zip2 : zip1),
                     app_locale_);
  }

  // One of the states is empty or one of the states has a subset of tokens from
  // the other. Pick the non-empty state that is shorter. This is usually the
  // abbreviated one.
  const AutofillType kState(ADDRESS_HOME_STATE);
  const base::string16& state1 = p1.GetInfo(kState, app_locale_);
  const base::string16& state2 = p2.GetInfo(kState, app_locale_);
  if (state1.empty()) {
    address->SetInfo(kState, state2, app_locale_);
  } else if (state2.empty()) {
    address->SetInfo(kState, state1, app_locale_);
  } else {
    address->SetInfo(kState, (state2.size() < state1.size() ? state2 : state1),
                     app_locale_);
  }

  AddressRewriter rewriter = AddressRewriter::ForCountryCode(country_code);

  // One of the cities is empty or one of the cities has a subset of tokens from
  // the other. Pick the city name with more tokens; this is usually the most
  // explicit one.
  const AutofillType kCity(ADDRESS_HOME_CITY);
  const base::string16& city1 = p1.GetInfo(kCity, app_locale_);
  const base::string16& city2 = p2.GetInfo(kCity, app_locale_);
  if (city1.empty()) {
    address->SetInfo(kCity, city2, app_locale_);
  } else if (city2.empty()) {
    address->SetInfo(kCity, city1, app_locale_);
  } else {
    // Prefer the one with more tokens, making sure to apply address
    // normalization and rewriting before doing the comparison.
    CompareTokensResult result =
        CompareTokens(rewriter.Rewrite(NormalizeForComparison(city1)),
                      rewriter.Rewrite(NormalizeForComparison(city2)));
    switch (result) {
      case SAME_TOKENS:
        // They have the same set of unique tokens. Let's pick the more recently
        // used one.
        address->SetInfo(kCity, (p2.use_date() > p1.use_date() ? city2 : city1),
                         app_locale_);
        break;
      case S1_CONTAINS_S2:
        // city1 has more unique tokens than city2.
        address->SetInfo(kCity, city1, app_locale_);
        break;
      case S2_CONTAINS_S1:
        // city2 has more unique tokens than city1.
        address->SetInfo(kCity, city2, app_locale_);
        break;
      case DIFFERENT_TOKENS:
      default:
        // The cities aren't mergeable and we shouldn't be doing any of
        // this.
        NOTREACHED() << "Unexpected mismatch: '" << city1 << "' vs '" << city2
                     << "'";
        return false;
    }
  }

  // One of the dependend localities is empty or one of the localities has a
  // subset of tokens from the other. Pick the locality name with more tokens;
  // this is usually the most explicit one.
  const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
  const base::string16& locality1 = p1.GetInfo(kDependentLocality, app_locale_);
  const base::string16& locality2 = p2.GetInfo(kDependentLocality, app_locale_);
  if (locality1.empty()) {
    address->SetInfo(kDependentLocality, locality2, app_locale_);
  } else if (locality2.empty()) {
    address->SetInfo(kDependentLocality, locality1, app_locale_);
  } else {
    // Prefer the one with more tokens, making sure to apply address
    // normalization and rewriting before doing the comparison.
    CompareTokensResult result =
        CompareTokens(rewriter.Rewrite(NormalizeForComparison(locality1)),
                      rewriter.Rewrite(NormalizeForComparison(locality2)));
    switch (result) {
      case SAME_TOKENS:
        // They have the same set of unique tokens. Let's pick the more recently
        // used one.
        address->SetInfo(
            kDependentLocality,
            (p2.use_date() > p1.use_date() ? locality2 : locality1),
            app_locale_);
        break;
      case S1_CONTAINS_S2:
        // locality1 has more unique tokens than locality2.
        address->SetInfo(kDependentLocality, locality1, app_locale_);
        break;
      case S2_CONTAINS_S1:
        // locality2 has more unique tokens than locality1.
        address->SetInfo(kDependentLocality, locality2, app_locale_);
        break;
      case DIFFERENT_TOKENS:
      default:
        // The localities aren't mergeable and we shouldn't be doing any of
        // this.
        NOTREACHED() << "Unexpected mismatch: '" << locality1 << "' vs '"
                     << locality2 << "'";
        return false;
    }
  }

  // One of the sorting codes is empty, they are the same, or one is a substring
  // of the other. We prefer the most recently used sorting code.
  const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
  const base::string16& sorting1 = p1.GetInfo(kSortingCode, app_locale_);
  const base::string16& sorting2 = p2.GetInfo(kSortingCode, app_locale_);
  if (sorting1.empty()) {
    address->SetInfo(kSortingCode, sorting2, app_locale_);
  } else if (sorting2.empty()) {
    address->SetInfo(kSortingCode, sorting1, app_locale_);
  } else {
    address->SetInfo(kSortingCode,
                     (p2.use_date() > p1.use_date() ? sorting2 : sorting1),
                     app_locale_);
  }

  // One of the addresses is empty or one of the addresses has a subset of
  // tokens from the other. Prefer the more verbosely expressed one.
  const AutofillType kStreetAddress(ADDRESS_HOME_STREET_ADDRESS);
  const base::string16& address1 = p1.GetInfo(kStreetAddress, app_locale_);
  const base::string16& address2 = p2.GetInfo(kStreetAddress, app_locale_);
  // If one of the addresses is empty then use the other.
  if (address1.empty()) {
    address->SetInfo(kStreetAddress, address2, app_locale_);
  } else if (address2.empty()) {
    address->SetInfo(kStreetAddress, address1, app_locale_);
  } else {
    // Prefer the multi-line address if one is multi-line and the other isn't.
    bool address1_multiline = ContainsNewline(address1);
    bool address2_multiline = ContainsNewline(address2);
    if (address1_multiline && !address2_multiline) {
      address->SetInfo(kStreetAddress, address1, app_locale_);
    } else if (address2_multiline && !address1_multiline) {
      address->SetInfo(kStreetAddress, address2, app_locale_);
    } else {
      // Prefer the one with more tokens if they're both single-line or both
      // multi-line addresses, making sure to apply address normalization and
      // rewriting before doing the comparison.
      CompareTokensResult result =
          CompareTokens(rewriter.Rewrite(NormalizeForComparison(address1)),
                        rewriter.Rewrite(NormalizeForComparison(address2)));
      switch (result) {
        case SAME_TOKENS:
          // They have the same set of unique tokens. Let's pick the one that's
          // longer.
          address->SetInfo(
              kStreetAddress,
              (p2.use_date() > p1.use_date() ? address2 : address1),
              app_locale_);
          break;
        case S1_CONTAINS_S2:
          // address1 has more unique tokens than address2.
          address->SetInfo(kStreetAddress, address1, app_locale_);
          break;
        case S2_CONTAINS_S1:
          // address2 has more unique tokens than address1.
          address->SetInfo(kStreetAddress, address2, app_locale_);
          break;
        case DIFFERENT_TOKENS:
        default:
          // The addresses aren't mergeable and we shouldn't be doing any of
          // this.
          NOTREACHED() << "Unexpected mismatch: '" << address1 << "' vs '"
                       << address2 << "'";
          return false;
      }
    }
  }
  return true;
}

// static
std::set<base::StringPiece16> AutofillProfileComparator::UniqueTokens(
    base::StringPiece16 s) {
  std::vector<base::StringPiece16> tokens = base::SplitStringPiece(
      s, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);
  return std::set<base::StringPiece16>(tokens.begin(), tokens.end());
}

// static
AutofillProfileComparator::CompareTokensResult
AutofillProfileComparator::CompareTokens(base::StringPiece16 s1,
                                         base::StringPiece16 s2) {
  // Note: std::include() expects the items in each range to be in sorted order,
  // hence the use of std::set<> instead of std::unordered_set<>.
  std::set<base::StringPiece16> t1 = UniqueTokens(s1);
  std::set<base::StringPiece16> t2 = UniqueTokens(s2);

  // Does s1 contains all of the tokens in s2? As a special case, return 0 if
  // the two sets are exactly the same.
  if (std::includes(t1.begin(), t1.end(), t2.begin(), t2.end()))
    return t1.size() == t2.size() ? SAME_TOKENS : S1_CONTAINS_S2;

  // Does s2 contain all of the tokens in s1?
  if (std::includes(t2.begin(), t2.end(), t1.begin(), t1.end()))
    return S2_CONTAINS_S1;

  // Neither string contains all of the tokens from the other.
  return DIFFERENT_TOKENS;
}

base::string16 AutofillProfileComparator::GetNonEmptyOf(
    const AutofillProfile& p1,
    const AutofillProfile& p2,
    AutofillType t) const {
  const base::string16& s1 = p1.GetInfo(t, app_locale_);
  if (!s1.empty())
    return s1;
  return p2.GetInfo(t, app_locale_);
}

// static
std::set<base::string16> AutofillProfileComparator::GetNamePartVariants(
    const base::string16& name_part) {
  const size_t kMaxSupportedSubNames = 8;

  std::vector<base::string16> sub_names = base::SplitString(
      name_part, kSpace, base::TRIM_WHITESPACE, base::SPLIT_WANT_NONEMPTY);

  // Limit the number of sub-names we support (to constrain memory usage);
  if (sub_names.size() > kMaxSupportedSubNames)
    return {name_part};

  // Start with the empty string as a variant.
  std::set<base::string16> variants = {base::EmptyString16()};

  // For each sub-name, add a variant of all the already existing variants that
  // appends this sub-name and one that appends the initial of this sub-name.
  // Duplicates will be discarded when they're added to the variants set.
  for (const base::string16& sub_name : sub_names) {
    if (sub_name.empty())
      continue;
    std::vector<base::string16> new_variants;
    for (const base::string16& variant : variants) {
      new_variants.push_back(base::CollapseWhitespace(
          base::JoinString({variant, sub_name}, kSpace), true));
      new_variants.push_back(base::CollapseWhitespace(
          base::JoinString({variant, sub_name.substr(0, 1)}, kSpace), true));
    }
    variants.insert(new_variants.begin(), new_variants.end());
  }

  // As a common case, also add the variant that just concatenates all of the
  // initials.
  base::string16 initials;
  for (const base::string16& sub_name : sub_names) {
    if (sub_name.empty())
      continue;
    initials.push_back(sub_name[0]);
  }
  variants.insert(initials);

  // And, we're done.
  return variants;
}

bool AutofillProfileComparator::IsNameVariantOf(
    const base::string16& full_name_1,
    const base::string16& full_name_2) const {
  data_util::NameParts name_1_parts = data_util::SplitName(full_name_1);

  // Build the variants of full_name_1`s given, middle and family names.
  //
  // TODO(rogerm): Figure out whether or not we should break apart a compound
  // family name into variants (crbug/619051)
  const std::set<base::string16> given_name_variants =
      GetNamePartVariants(name_1_parts.given);
  const std::set<base::string16> middle_name_variants =
      GetNamePartVariants(name_1_parts.middle);
  const base::string16& family_name = name_1_parts.family;

  // Iterate over all full name variants of profile 2 and see if any of them
  // match the full name from profile 1.
  for (const base::string16& given_name : given_name_variants) {
    for (const base::string16& middle_name : middle_name_variants) {
      base::string16 candidate = base::CollapseWhitespace(
          base::JoinString({given_name, middle_name, family_name}, kSpace),
          true);
      if (candidate == full_name_2)
        return true;
    }
  }

  // Also check if the name is just composed of the user's initials. For
  // example, "thomas jefferson miller" could be composed as "tj miller".
  if (!name_1_parts.given.empty() && !name_1_parts.middle.empty()) {
    base::string16 initials;
    initials.push_back(name_1_parts.given[0]);
    initials.push_back(name_1_parts.middle[0]);
    base::string16 candidate = base::CollapseWhitespace(
        base::JoinString({initials, family_name}, kSpace), true);
    if (candidate == full_name_2)
      return true;
  }

  // There was no match found.
  return false;
}

bool AutofillProfileComparator::HaveMergeableNames(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  base::string16 full_name_1 =
      NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_));
  base::string16 full_name_2 =
      NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_));

  if (full_name_1.empty() || full_name_2.empty() ||
      full_name_1 == full_name_2) {
    return true;
  }

  if (data_util::IsCJKName(full_name_1) && data_util::IsCJKName(full_name_2)) {
    return HaveMergeableCJKNames(p1, p2);
  }

  // Is it reasonable to merge the names from p1 and p2.
  return IsNameVariantOf(full_name_1, full_name_2) ||
         IsNameVariantOf(full_name_2, full_name_1);
}

bool AutofillProfileComparator::HaveMergeableCJKNames(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  base::string16 name_1 =
      NormalizeForComparison(p1.GetInfo(AutofillType(NAME_FULL), app_locale_),
                             DISCARD_WHITESPACE);
  base::string16 name_2 =
      NormalizeForComparison(p2.GetInfo(AutofillType(NAME_FULL), app_locale_),
                             DISCARD_WHITESPACE);
  return name_1 == name_2;
}

bool AutofillProfileComparator::HaveMergeableEmailAddresses(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  const base::string16& email_1 =
      p1.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);
  const base::string16& email_2 =
      p2.GetInfo(AutofillType(EMAIL_ADDRESS), app_locale_);
  return email_1.empty() || email_2.empty() ||
         case_insensitive_compare_.StringsEqual(email_1, email_2);
}

bool AutofillProfileComparator::HaveMergeableCompanyNames(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  const base::string16& company_name_1 = NormalizeForComparison(
      p1.GetInfo(AutofillType(COMPANY_NAME), app_locale_));
  const base::string16& company_name_2 = NormalizeForComparison(
      p2.GetInfo(AutofillType(COMPANY_NAME), app_locale_));
  return company_name_1.empty() || company_name_2.empty() ||
         CompareTokens(company_name_1, company_name_2) != DIFFERENT_TOKENS;
}

bool AutofillProfileComparator::HaveMergeablePhoneNumbers(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  // We work with the raw phone numbers to avoid losing any helpful information
  // as we parse.
  const base::string16& raw_phone_1 = p1.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);
  const base::string16& raw_phone_2 = p2.GetRawInfo(PHONE_HOME_WHOLE_NUMBER);

  // Are the two phone numbers trivially mergeable?
  if (raw_phone_1.empty() || raw_phone_2.empty() ||
      raw_phone_1 == raw_phone_2) {
    return true;
  }

  // TODO(rogerm): Modify ::autofill::i18n::PhoneNumbersMatch to support
  // SHORT_NSN_MATCH and just call that instead of accessing the underlying
  // utility library directly?

  // The phone number util library needs the numbers in utf8.
  const std::string phone_1 = base::UTF16ToUTF8(raw_phone_1);
  const std::string phone_2 = base::UTF16ToUTF8(raw_phone_2);

  // Parse and compare the phone numbers.
  PhoneNumberUtil* phone_util = PhoneNumberUtil::GetInstance();
  switch (phone_util->IsNumberMatchWithTwoStrings(phone_1, phone_2)) {
    case PhoneNumberUtil::SHORT_NSN_MATCH:
    case PhoneNumberUtil::NSN_MATCH:
    case PhoneNumberUtil::EXACT_MATCH:
      return true;
    case PhoneNumberUtil::INVALID_NUMBER:
    case PhoneNumberUtil::NO_MATCH:
      return false;
    default:
      NOTREACHED();
      return false;
  }
}

bool AutofillProfileComparator::HaveMergeableAddresses(
    const AutofillProfile& p1,
    const AutofillProfile& p2) const {
  // If the address are not in the same country, then they're not the same. If
  // one of the address countries is unknown/invalid the comparison continues.
  const AutofillType kCountryCode(HTML_TYPE_COUNTRY_CODE, HTML_MODE_NONE);
  const base::string16& country1 = p1.GetInfo(kCountryCode, app_locale_);
  const base::string16& country2 = p2.GetInfo(kCountryCode, app_locale_);
  if (!country1.empty() && !country2.empty() &&
      !case_insensitive_compare_.StringsEqual(country1, country2)) {
    return false;
  }

  // Zip
  // ----
  // If the addresses are definitely not in the same zip/area code then we're
  // done. Otherwise,the comparison continues.
  const AutofillType kZipCode(ADDRESS_HOME_ZIP);
  const base::string16& zip1 = NormalizeForComparison(
      p1.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
  const base::string16& zip2 = NormalizeForComparison(
      p2.GetInfo(kZipCode, app_locale_), DISCARD_WHITESPACE);
  if (!zip1.empty() && !zip2.empty() &&
      zip1.find(zip2) == base::string16::npos &&
      zip2.find(zip1) == base::string16::npos) {
    return false;
  }

  // Use the token rewrite rules for the (common) country of the address to
  // transform equivalent substrings to a representative token for comparison.
  AddressRewriter rewriter =
      AddressRewriter::ForCountryCode(country1.empty() ? country2 : country1);

  // State
  // ------
  // Heuristic: States are mergeable if one is a (possibly empty) bag of words
  // subset of the other.
  //
  // TODO(rogerm): If the match is between non-empty zip codes then we can infer
  // that the two state strings are intended to have the same meaning. This
  // handles the cases where we have invalid or poorly formed data in one of the
  // state values (like "Select one", or "CA - California").
  const AutofillType kState(ADDRESS_HOME_STATE);
  const base::string16& state1 =
      rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kState, app_locale_)));
  const base::string16& state2 =
      rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kState, app_locale_)));
  if (CompareTokens(state1, state2) == DIFFERENT_TOKENS) {
    return false;
  }

  // City
  // ------
  // Heuristic: Cities are mergeable if one is a (possibly empty) bag of words
  // subset of the other.
  //
  // TODO(rogerm): If the match is between non-empty zip codes then we can infer
  // that the two city strings are intended to have the same meaning. This
  // handles the cases where we have a city vs one of its suburbs.
  const AutofillType kCity(ADDRESS_HOME_CITY);
  const base::string16& city1 =
      rewriter.Rewrite(NormalizeForComparison(p1.GetInfo(kCity, app_locale_)));
  const base::string16& city2 =
      rewriter.Rewrite(NormalizeForComparison(p2.GetInfo(kCity, app_locale_)));
  if (CompareTokens(city1, city2) == DIFFERENT_TOKENS) {
    return false;
  }

  // Dependent Locality
  // -------------------
  // Heuristic: Dependent Localities are mergeable if one is a (possibly empty)
  // bag of words subset of the other.
  const AutofillType kDependentLocality(ADDRESS_HOME_DEPENDENT_LOCALITY);
  const base::string16& locality1 = rewriter.Rewrite(
      NormalizeForComparison(p1.GetInfo(kDependentLocality, app_locale_)));
  const base::string16& locality2 = rewriter.Rewrite(
      NormalizeForComparison(p2.GetInfo(kDependentLocality, app_locale_)));
  if (CompareTokens(locality1, locality2) == DIFFERENT_TOKENS) {
    return false;
  }

  // Sorting Code
  // -------------
  // Heuristic: Sorting codes are mergeable if one is empty or one is a
  // substring of the other, post normalization and whitespace removed. This
  // is similar to postal/zip codes.
  const AutofillType kSortingCode(ADDRESS_HOME_SORTING_CODE);
  const base::string16& sorting1 = NormalizeForComparison(
      p1.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
  const base::string16& sorting2 = NormalizeForComparison(
      p2.GetInfo(kSortingCode, app_locale_), DISCARD_WHITESPACE);
  if (!sorting1.empty() && !sorting2.empty() &&
      sorting1.find(sorting2) == base::string16::npos &&
      sorting2.find(sorting1) == base::string16::npos) {
    return false;
  }

  // Address
  // --------
  // Heuristic: Street addresses are mergeable if one is a (possibly empty) bag
  // of words subset of the other.
  const base::string16& address1 = rewriter.Rewrite(NormalizeForComparison(
      p1.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));
  const base::string16& address2 = rewriter.Rewrite(NormalizeForComparison(
      p2.GetInfo(AutofillType(ADDRESS_HOME_STREET_ADDRESS), app_locale_)));
  if (CompareTokens(address1, address2) == DIFFERENT_TOKENS) {
    return false;
  }

  return true;
}

}  // namespace autofill