| 12
 3
 4
 5
 6
 7
 8
 9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
 100
 101
 102
 103
 104
 105
 106
 107
 108
 109
 110
 111
 112
 113
 114
 115
 116
 117
 118
 119
 120
 121
 122
 123
 124
 125
 126
 127
 128
 129
 130
 131
 132
 133
 134
 135
 136
 137
 138
 
 | //===--- MisleadingBidirectional.cpp - clang-tidy -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "MisleadingBidirectional.h"
#include "clang/Frontend/CompilerInstance.h"
#include "clang/Lex/Preprocessor.h"
#include "llvm/Support/ConvertUTF.h"
#include <optional>
using namespace clang;
using namespace clang::tidy::misc;
/// Scans \p Buffer and reports whether a bidirectional formatting context is
/// still open when the end of the buffer is reached.
///
/// The buffer is decoded as UTF-8; bytes that do not form a valid sequence are
/// skipped one at a time. Paragraph and segment separators discard every
/// context opened so far, so only contexts opened after the last separator can
/// make this function return true.
///
/// \param Buffer the text to inspect.
/// \param HonorLineBreaks not consulted by the current implementation:
///        separators always reset the open contexts regardless of its value.
///        NOTE(review): the intended semantics are not visible here; the
///        parameter is kept so existing callers keep compiling.
/// \returns true if at least one context opened by RLO/RLE/LRO/LRE or
///          RLI/LRI/FSI has not been closed by the end of \p Buffer.
static bool containsMisleadingBidi(StringRef Buffer,
                                   bool HonorLineBreaks = true) {
  (void)HonorLineBreaks; // See the parameter note above.

  const char *CurPtr = Buffer.begin();

  enum BidiChar {
    NEL = 0x85,
    PS = 0x2029,
    RLO = 0x202E,
    RLE = 0x202B,
    LRO = 0x202D,
    LRE = 0x202A,
    PDF = 0x202C,
    RLI = 0x2067,
    LRI = 0x2066,
    FSI = 0x2068,
    PDI = 0x2069
  };

  // Stack of the terminators (PDF or PDI) expected by the contexts that are
  // currently open, innermost last.
  SmallVector<BidiChar> BidiContexts;

  // Scan each character while maintaining a stack of opened bidi contexts.
  // RLO/RLE/LRO/LRE are all closed by PDF, while RLI, LRI and FSI are closed
  // by PDI. Paragraph and segment separators reset the stack. Extra PDF / PDI
  // are ignored.
  //
  // Warn if we end up with an unclosed context.
  while (CurPtr < Buffer.end()) {
    unsigned char C = *CurPtr;
    if (isASCII(C)) {
      ++CurPtr;
      // Only the single-byte separators can be recognised here; U+0085 is
      // encoded on two bytes in UTF-8 and is handled after decoding below.
      bool IsParagraphSep =
          (C == 0xA || C == 0xD || (0x1C <= C && C <= 0x1E));
      bool IsSegmentSep = (C == 0x9 || C == 0xB || C == 0x1F);
      if (IsParagraphSep || IsSegmentSep)
        BidiContexts.clear();
      continue;
    }

    llvm::UTF32 CodePoint;
    llvm::ConversionResult Result = llvm::convertUTF8Sequence(
        reinterpret_cast<const llvm::UTF8 **>(&CurPtr),
        reinterpret_cast<const llvm::UTF8 *>(Buffer.end()), &CodePoint,
        llvm::strictConversion);

    // If conversion fails, UTF-8 is designed so that we can just try the next
    // byte.
    if (Result != llvm::conversionOK) {
      ++CurPtr;
      continue;
    }

    // Open a PDF context.
    if (CodePoint == RLO || CodePoint == RLE || CodePoint == LRO ||
        CodePoint == LRE)
      BidiContexts.push_back(PDF);
    // Close a PDF context, but only if it is the innermost open one.
    else if (CodePoint == PDF) {
      if (!BidiContexts.empty() && BidiContexts.back() == PDF)
        BidiContexts.pop_back();
    }
    // Open a PDI context.
    else if (CodePoint == RLI || CodePoint == LRI || CodePoint == FSI)
      BidiContexts.push_back(PDI);
    // Close the innermost PDI context, dropping every context opened after it.
    else if (CodePoint == PDI) {
      auto R = llvm::find(llvm::reverse(BidiContexts), PDI);
      if (R != BidiContexts.rend())
        BidiContexts.resize(BidiContexts.rend() - R - 1);
    }
    // Multi-byte paragraph separators: line break or equivalent.
    else if (CodePoint == PS || CodePoint == NEL)
      BidiContexts.clear();
  }
  return !BidiContexts.empty();
}
/// Comment handler that forwards the text of every comment it is given to
/// containsMisleadingBidi() and emits a diagnostic through the owning check
/// when that scan reports a problem.
class MisleadingBidirectionalCheck::MisleadingBidirectionalHandler
    : public CommentHandler {
public:
  /// Remembers \p Check so diagnostics can be reported through it.
  MisleadingBidirectionalHandler(MisleadingBidirectionalCheck &Check)
      : Check(Check) {}

  /// Inspects the comment covering \p Range and diagnoses it at its start
  /// location if it contains misleading bidirectional characters.
  ///
  /// \returns false in every case.
  bool HandleComment(Preprocessor &PP, SourceRange Range) override {
    // FIXME: check that we are in a /* */ comment
    const auto &SrcMgr = PP.getSourceManager();
    const auto CommentChars = CharSourceRange::getCharRange(Range);
    const StringRef CommentText =
        Lexer::getSourceText(CommentChars, SrcMgr, PP.getLangOpts());

    const bool IsMisleading = containsMisleadingBidi(CommentText, true);
    if (!IsMisleading)
      return false;

    Check.diag(
        Range.getBegin(),
        "comment contains misleading bidirectional Unicode characters");
    return false;
  }

private:
  /// The check used to emit diagnostics.
  MisleadingBidirectionalCheck &Check;
};
// Forwards Name and Context to the ClangTidyCheck base and creates the comment
// handler bound to this check. The handler is only attached to a preprocessor
// later, in registerPPCallbacks().
MisleadingBidirectionalCheck::MisleadingBidirectionalCheck(
    StringRef Name, ClangTidyContext *Context)
    : ClangTidyCheck(Name, Context),
      Handler(std::make_unique<MisleadingBidirectionalHandler>(*this)) {}
// Defaulted out of line, after the handler class is fully defined above.
// NOTE(review): presumably required because the header only forward-declares
// the handler type owned by Handler -- confirm against
// MisleadingBidirectional.h.
MisleadingBidirectionalCheck::~MisleadingBidirectionalCheck() = default;
/// Attaches this check's comment handler to \p PP so that the handler is
/// given the comments seen by that preprocessor. \p SM and
/// \p ModuleExpanderPP are not used.
void MisleadingBidirectionalCheck::registerPPCallbacks(
    const SourceManager &SM, Preprocessor *PP, Preprocessor *ModuleExpanderPP) {
  // The check keeps ownership of the handler; only a raw pointer is handed
  // over here.
  auto *BidiCommentHandler = Handler.get();
  PP->addCommentHandler(BidiCommentHandler);
}
/// Match callback: diagnoses the string literal bound as "strlit" when its
/// bytes contain misleading bidirectional characters.
void MisleadingBidirectionalCheck::check(
    const ast_matchers::MatchFinder::MatchResult &Result) {
  const auto *MatchedLiteral =
      Result.Nodes.getNodeAs<StringLiteral>("strlit");
  // Nothing to do when the match carries no string literal.
  if (!MatchedLiteral)
    return;

  StringRef LiteralBytes = MatchedLiteral->getBytes();
  if (!containsMisleadingBidi(LiteralBytes, false))
    return;

  diag(MatchedLiteral->getBeginLoc(), "string literal contains misleading "
                                      "bidirectional Unicode characters");
}
/// Registers a matcher for every string literal, bound as "strlit", with this
/// check as the callback.
void MisleadingBidirectionalCheck::registerMatchers(
    ast_matchers::MatchFinder *Finder) {
  const auto AnyStringLiteral = ast_matchers::stringLiteral().bind("strlit");
  Finder->addMatcher(AnyStringLiteral, this);
}
 |