File: Lexer.cpp

package info (click to toggle)
swiftlang 6.2.3-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 2,856,264 kB
sloc: cpp: 9,995,718; ansic: 2,234,019; asm: 1,092,167; python: 313,940; objc: 82,726; f90: 80,126; lisp: 38,373; pascal: 25,580; sh: 20,378; ml: 5,058; perl: 4,751; makefile: 4,725; awk: 3,535; javascript: 3,018; xml: 918; fortran: 664; cs: 573; ruby: 396
file content (315 lines) | stat: -rw-r--r-- 8,234 bytes
parent folder | download | duplicates (2)
//===-- Lexer.cpp ---------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#include "llbuild/Ninja/Lexer.h"

#include "llbuild/Basic/LLVM.h"

#include <cstring>
#include <string>
#include <iostream>
#include <iomanip>

using namespace llbuild;
using namespace llbuild::ninja;

///

/// Get a human readable name for this token's kind.
///
/// \returns A string literal spelling the enumerator name of \c tokenKind, or
/// "<invalid token kind>" if the stored value matches no known enumerator.
const char* Token::getKindName() const {
  // The switch deliberately has no default case, every enumerator is listed
  // explicitly and anything else falls through to the trailing return.
  switch (tokenKind) {
  case Kind::Colon:       return "Colon";
  case Kind::Comment:     return "Comment";
  case Kind::EndOfFile:   return "EndOfFile";
  case Kind::Equals:      return "Equals";
  case Kind::Identifier:  return "Identifier";
  case Kind::Indentation: return "Indentation";
  case Kind::KWBuild:     return "KWBuild";
  case Kind::KWDefault:   return "KWDefault";
  case Kind::KWInclude:   return "KWInclude";
  case Kind::KWPool:      return "KWPool";
  case Kind::KWRule:      return "KWRule";
  case Kind::KWSubninja:  return "KWSubninja";
  case Kind::Newline:     return "Newline";
  case Kind::Pipe:        return "Pipe";
  case Kind::PipePipe:    return "PipePipe";
  case Kind::String:      return "String";
  case Kind::Unknown:     return "Unknown";
  }

  return "<invalid token kind>";
}

#ifndef NDEBUG
/// Write a one line debugging description of this token to standard error.
///
/// The output has the form
///   (Token "<kind name>" <start address> <length> <line> <column>)
/// and is only compiled in when NDEBUG is not defined.
void Token::dump() {
  // Print the start pointer as an address rather than as a C string.
  const void* startAddress = static_cast<const void*>(start);

  std::cerr << "(Token \"" << getKindName() << "\" ";
  std::cerr << startAddress << " " << length << " ";
  std::cerr << line << " " << column << ")\n";
}
#endif

///

/// Create a lexer over \p buffer.
///
/// Lexing starts at the first byte of the buffer, at line 1 and column 0, with
/// the lexing mode set to LexingMode::None.
Lexer::Lexer(StringRef buffer)
  : buffer(buffer),
    bufferPos(buffer.data()),
    lineNumber(1),
    columnNumber(0),
    mode(LexingMode::None)
{
}

/// The destructor has no work of its own to do.
Lexer::~Lexer() = default;

/// Look at the next byte of the buffer without consuming it.
///
/// \returns The byte at the current position as a value in the range of
/// unsigned char, or -1 if the current position is the end of the buffer.
int Lexer::peekNextChar() {
  if (bufferPos == buffer.end())
    return -1;

  // Widen through unsigned char. Where plain char is signed, returning the
  // byte directly would sign-extend it, so a 0xFF byte in the input would be
  // reported as -1 and be indistinguishable from the end-of-buffer sentinel.
  return static_cast<unsigned char>(*bufferPos);
}

/// Consume the next byte of the buffer and update the line/column position.
///
/// A "\n", "\r", "\r\n" or "\n\r" sequence is consumed as a single unit and
/// reported as '\n'.
///
/// \returns The consumed byte as a value in the range of unsigned char (with
/// newlines normalized to '\n'), or -1 if the current position is the end of
/// the buffer, in which case nothing is consumed.
int Lexer::getNextChar() {
  if (bufferPos == buffer.end())
    return -1;

  // Widen through unsigned char so that a 0xFF byte cannot be mistaken for the
  // -1 end-of-buffer sentinel where plain char is signed.
  int result = static_cast<unsigned char>(*bufferPos++);

  // Handle DOS/Mac newlines here, by stripping duplicates and by returning '\n'
  // for both.
  if (result == '\n' || result == '\r') {
    // '\n' + '\r' - result is "the other" newline character, so this only
    // swallows a second byte when it completes a two byte newline pair.
    if (bufferPos != buffer.end() && *bufferPos == ('\n' + '\r' - result))
      ++bufferPos;
    result = '\n';
  }

  if (result == '\n') {
    ++lineNumber;
    columnNumber = 0;
  } else {
    ++columnNumber;
  }

  return result;
}

/// Finish \p result as a token of the given \p kind.
///
/// The token length is computed as the distance from the token's recorded
/// start to the current buffer position.
///
/// \returns \p result, to allow use directly in a return statement.
Token& Lexer::setTokenKind(Token& result, Token::Kind kind) const {
  result.length = bufferPos - result.start;
  result.tokenKind = kind;
  return result;
}

/// Consume characters up to, but not including, the next newline character.
///
/// The newline itself is left in the buffer so that it can subsequently be
/// lexed as a Newline token. Stops early if the end of the buffer is reached.
void Lexer::skipToEndOfLine() {
  int next = peekNextChar();
  while (next != -1 && next != '\n' && next != '\r') {
    getNextChar();
    next = peekNextChar();
  }
}

/// Finish \p result as either a keyword token or a plain identifier token.
///
/// The token text is the range from \c result.start to the current buffer
/// position. If that text is exactly one of "rule", "pool", "build",
/// "default", "include" or "subninja" the matching keyword kind is used,
/// otherwise the token becomes an Identifier.
///
/// \returns \p result.
Token& Lexer::setIdentifierTokenKind(Token& result) const {
  // Dispatch on the length first, so each candidate only needs a single
  // fixed-size comparison and no keyword can match a longer identifier.
  unsigned length = bufferPos - result.start;
  switch (length) {
  case 4:
    if (memcmp("rule", result.start, 4) == 0)
      return setTokenKind(result, Token::Kind::KWRule);
    if (memcmp("pool", result.start, 4) == 0)
      return setTokenKind(result, Token::Kind::KWPool);
    break;

  case 5:
    if (memcmp("build", result.start, 5) == 0)
      return setTokenKind(result, Token::Kind::KWBuild);
    break;

  case 7:
    if (memcmp("default", result.start, 7) == 0)
      return setTokenKind(result, Token::Kind::KWDefault);
    if (memcmp("include", result.start, 7) == 0)
      return setTokenKind(result, Token::Kind::KWInclude);
    break;

  case 8:
    // All eight bytes must be compared: checking only the first seven would
    // also classify identifiers such as "subninjx" as the keyword.
    if (memcmp("subninja", result.start, 8) == 0)
      return setTokenKind(result, Token::Kind::KWSubninja);
    break;
  }

  return setTokenKind(result, Token::Kind::Identifier);
}

/// Lex the remainder of an identifier whose first character has already been
/// consumed.
///
/// \returns \p result, finished as an Identifier or, unless the lexer is in
/// IdentifierSpecific mode, as the keyword the text spells.
Token& Lexer::lexIdentifier(Token& result) {
  // Extend the token for as long as the upcoming character may appear in an
  // identifier.
  for (int next = peekNextChar(); Lexer::isIdentifierChar(next);
       next = peekNextChar()) {
    getNextChar();
  }

  // Keywords are only recognized outside of identifier specific mode.
  const bool recognizeKeywords = mode != Lexer::LexingMode::IdentifierSpecific;
  if (recognizeKeywords)
    return setIdentifierTokenKind(result);

  return setTokenKind(result, Token::Kind::Identifier);
}

/// Check whether \p c is a whitespace character other than '\n' or '\r'.
///
/// \param c A character value, or -1 for end of input. Any value that is not
/// representable as unsigned char (including -1) is reported as not being
/// whitespace.
static bool isNonNewlineSpace(int c) {
  // isspace() has undefined behavior for arguments that are neither EOF nor
  // representable as unsigned char, so reject those before calling it. This
  // matters because callers may hand in sign-extended bytes or the -1
  // end-of-input sentinel.
  if (c < 0 || c > 0xFF)
    return false;

  return isspace(c) && c != '\n' && c != '\r';
}

/// Lex a string token in a path context.
///
/// The token extends until an unescaped whitespace character (including a
/// newline), ':' or '|' is seen, or until the end of the buffer. A '$' escapes
/// the character following it; when that character is a newline, any
/// non-newline whitespace at the start of the following line is consumed as
/// part of the token as well.
///
/// \returns \p result, finished as a String token.
Token &Lexer::lexPathString(Token &result) {
  while (true) {
    int c = peekNextChar();

    // If this is an escape character, skip the next character.
    if (c == '$') {
      getNextChar(); // Consume the actual '$'.

      // Consume the next character.
      c = getNextChar();

      // If the character was a newline, consume any leading spaces.
      if (c == '\n') {
        while (isNonNewlineSpace(peekNextChar()))
          getNextChar();
      }

      continue;
    }

    // Stop at the end of the buffer or at a character special to paths.
    if (c == -1 || c == ':' || c == '|')
      break;

    // Stop at any whitespace. The value is converted to unsigned char first
    // because isspace() has undefined behavior for negative arguments other
    // than EOF, which has already been handled above.
    if (isspace(static_cast<unsigned char>(c)))
      break;

    getNextChar();
  }

  return setTokenKind(result, Token::Kind::String);
}

/// Lex a string token in a variable assignment context.
///
/// The token runs up to, but does not include, the next unescaped newline
/// character, or to the end of the buffer. A '$' escapes the character that
/// follows it, so an escaped newline becomes part of the token.
///
/// \returns \p result, finished as a String token.
Token& Lexer::lexVariableString(Token& result) {
  for (;;) {
    const int next = peekNextChar();

    // An unescaped end of line, or the end of the buffer, ends the string.
    if (next == -1 || next == '\n' || next == '\r')
      break;

    // Take the character itself ...
    getNextChar();

    // ... and, if it was the escape character, whatever it escapes.
    if (next == '$')
      getNextChar();
  }

  return setTokenKind(result, Token::Kind::String);
}

/// Lex the next token from the buffer into \p result.
///
/// Whitespace at the start of a line is returned as an Indentation token. All
/// other non-newline whitespace, as well as "$" followed by a newline when not
/// at the start of a line, is skipped before the token begins. Newline and
/// EndOfFile tokens are produced regardless of the lexing mode; otherwise the
/// VariableString and PathString modes delegate to the matching string lexer
/// (in PathString mode ':' and '|' are still lexed as regular tokens).
///
/// \returns \p result, with its start, line, column, length and kind set.
Token& Lexer::lex(Token& result) {
  // Check if we need to emit an indentation token.
  int c = peekNextChar();
  if (isNonNewlineSpace(c) && columnNumber == 0) {
    // We are at the start of a line, so any leading whitespace should be
    // parsed as an indentation token.
    //
    // We do not need to handle "$\n" sequences here because they will be
    // consumed next, and the exact length of the indentation token is never
    // used.
    result.start = bufferPos;
    result.line = lineNumber;
    result.column = columnNumber;

    do {
      getNextChar();
    } while (isNonNewlineSpace(peekNextChar()));

    return setTokenKind(result, Token::Kind::Indentation);
  }

  // Otherwise, consume any leading whitespace or "$\n" escape sequences (except
  // at the start of lines, which Ninja does not recognize).
  while (true) {
    // Check for escape sequences.
    if (c == '$' && columnNumber != 0) {
      // Look ahead using the number of bytes actually left in the buffer, so
      // that a '$' at (or one byte before) the end of the buffer never causes
      // a read past the end.
      const auto remaining = buffer.end() - bufferPos;
      const bool isNewlineEscape =
        remaining > 1 && bufferPos[1] == '\n';
      const bool isDOSNewlineEscape =
        remaining > 2 && bufferPos[1] == '\r' && bufferPos[2] == '\n';

      // If this is not a newline escape, break out and lex normally.
      if (!isNewlineEscape && !isDOSNewlineEscape)
        break;

      // Otherwise consume it: the '$' first, then the newline, which
      // getNextChar() consumes as a unit even when it is "\r\n".
      getNextChar();
      getNextChar();
    } else if (isNonNewlineSpace(c)) {
      getNextChar();
    } else {
      break;
    }

    c = peekNextChar();
  }

  // Initialize the token position.
  result.start = bufferPos;
  result.line = lineNumber;
  result.column = columnNumber;

  // Check if we are at a string mode independent token.
  if (c == '\n' || c == '\r') {
    getNextChar();
    return setTokenKind(result, Token::Kind::Newline);
  }
  if (c == -1)
    return setTokenKind(result, Token::Kind::EndOfFile);

  // If we are in string lexing mode, delegate immediately if appropriate.
  if (mode == LexingMode::VariableString)
    return lexVariableString(result);
  if (mode == LexingMode::PathString) {
    // Only delegate for characters not special to path lexing.
    if (c != ':' && c != '|')
      return lexPathString(result);
  }

  // Otherwise, consume the character and lex from the regular token set.
  getNextChar();
  switch (c) {
  case ':': return setTokenKind(result, Token::Kind::Colon);
  case '=': return setTokenKind(result, Token::Kind::Equals);

  case '#': {
    // A comment token extends to the end of the line, excluding the newline.
    skipToEndOfLine();
    return setTokenKind(result, Token::Kind::Comment);
  }

  case '|': {
    // Distinguish "||" from a single "|".
    if (peekNextChar() == '|') {
      (void) getNextChar();
      return setTokenKind(result, Token::Kind::PipePipe);
    }
    return setTokenKind(result, Token::Kind::Pipe);
  }

  default:
    if (Lexer::isIdentifierChar(c))
      return lexIdentifier(result);

    return setTokenKind(result, Token::Kind::Unknown);
  }
}