368 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			368 lines
		
	
	
		
			10 KiB
		
	
	
	
		
			C++
		
	
	
	
//===-- ResourceScriptToken.cpp ---------------------------------*- C++-*-===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===---------------------------------------------------------------------===//
//
// This file implements an interface defined in ResourceScriptToken.h.
// In particular, it defines an .rc script tokenizer.
//
//===---------------------------------------------------------------------===//
 | 
						|
#include "ResourceScriptToken.h"
 | 
						|
#include "llvm/Support/raw_ostream.h"
 | 
						|
 | 
						|
#include <algorithm>
 | 
						|
#include <cassert>
 | 
						|
#include <cctype>
 | 
						|
#include <cstdlib>
 | 
						|
#include <utility>
 | 
						|
 | 
						|
using namespace llvm;
 | 
						|
 | 
						|
using Kind = RCToken::Kind;
 | 
						|
 | 
						|
// Checks if Representation is a correct description of an RC integer.
 | 
						|
// It should be a 32-bit unsigned integer, either decimal, octal (0[0-7]+),
 | 
						|
// or hexadecimal (0x[0-9a-f]+). It might be followed by a single 'L'
 | 
						|
// character (that is the difference between our representation and
 | 
						|
// StringRef's one). If Representation is correct, 'true' is returned and
 | 
						|
// the return value is put back in Num.
 | 
						|
static bool rcGetAsInteger(StringRef Representation, uint32_t &Num) {
 | 
						|
  size_t Length = Representation.size();
 | 
						|
  if (Length == 0)
 | 
						|
    return false;
 | 
						|
  // Strip the last 'L' if unnecessary.
 | 
						|
  if (std::toupper(Representation.back()) == 'L')
 | 
						|
    Representation = Representation.drop_back(1);
 | 
						|
 | 
						|
  return !Representation.getAsInteger<uint32_t>(0, Num);
 | 
						|
}
 | 
						|
 | 
						|
RCToken::RCToken(RCToken::Kind RCTokenKind, StringRef Value)
 | 
						|
    : TokenKind(RCTokenKind), TokenValue(Value) {}
 | 
						|
 | 
						|
uint32_t RCToken::intValue() const {
 | 
						|
  assert(TokenKind == Kind::Int);
 | 
						|
  // We assume that the token already is a correct integer (checked by
 | 
						|
  // rcGetAsInteger).
 | 
						|
  uint32_t Result;
 | 
						|
  bool IsSuccess = rcGetAsInteger(TokenValue, Result);
 | 
						|
  assert(IsSuccess);
 | 
						|
  (void)IsSuccess;  // Silence the compiler warning when -DNDEBUG flag is on.
 | 
						|
  return Result;
 | 
						|
}
 | 
						|
 | 
						|
bool RCToken::isLongInt() const {
 | 
						|
  return TokenKind == Kind::Int && std::toupper(TokenValue.back()) == 'L';
 | 
						|
}
 | 
						|
 | 
						|
// Returns the text stored in this token.
StringRef RCToken::value() const {
  return TokenValue;
}
 | 
						|
 | 
						|
// Returns the kind this token was classified as.
Kind RCToken::kind() const {
  return TokenKind;
}
 | 
						|
 | 
						|
bool RCToken::isBinaryOp() const {
 | 
						|
  switch (TokenKind) {
 | 
						|
  case Kind::Plus:
 | 
						|
  case Kind::Minus:
 | 
						|
  case Kind::Pipe:
 | 
						|
  case Kind::Amp:
 | 
						|
    return true;
 | 
						|
  default:
 | 
						|
    return false;
 | 
						|
  }
 | 
						|
}
 | 
						|
 | 
						|
// Wraps message into a StringError, prefixed with "Error parsing file: " and
// carrying the inconvertible error code.
static Error getStringError(const Twine &message) {
  return make_error<StringError>(Twine("Error parsing file: ") + message,
                                 inconvertibleErrorCode());
}
 | 
						|
 | 
						|
namespace {
 | 
						|
 | 
						|
class Tokenizer {
 | 
						|
public:
 | 
						|
  Tokenizer(StringRef Input) : Data(Input), DataLength(Input.size()) {}
 | 
						|
 | 
						|
  Expected<std::vector<RCToken>> run();
 | 
						|
 | 
						|
private:
 | 
						|
  // All 'advancing' methods return boolean values; if they're equal to false,
 | 
						|
  // the stream has ended or failed.
 | 
						|
  bool advance(size_t Amount = 1);
 | 
						|
  bool skipWhitespaces();
 | 
						|
 | 
						|
  // Consumes a token. If any problem occurred, a non-empty Error is returned.
 | 
						|
  Error consumeToken(const Kind TokenKind);
 | 
						|
 | 
						|
  // Check if tokenizer is about to read FollowingChars.
 | 
						|
  bool willNowRead(StringRef FollowingChars) const;
 | 
						|
 | 
						|
  // Check if tokenizer can start reading an identifier at current position.
 | 
						|
  // The original tool did non specify the rules to determine what is a correct
 | 
						|
  // identifier. We assume they should follow the C convention:
 | 
						|
  // [a-zA-Z_][a-zA-Z0-9_]*.
 | 
						|
  bool canStartIdentifier() const;
 | 
						|
  // Check if tokenizer can continue reading an identifier.
 | 
						|
  bool canContinueIdentifier() const;
 | 
						|
 | 
						|
  // Check if tokenizer can start reading an integer.
 | 
						|
  // A correct integer always starts with a 0-9 digit,
 | 
						|
  // can contain characters 0-9A-Fa-f (digits),
 | 
						|
  // Ll (marking the integer is 32-bit), Xx (marking the representation
 | 
						|
  // is hexadecimal). As some kind of separator should come after the
 | 
						|
  // integer, we can consume the integer until a non-alphanumeric
 | 
						|
  // character.
 | 
						|
  bool canStartInt() const;
 | 
						|
  bool canContinueInt() const;
 | 
						|
 | 
						|
  bool canStartString() const;
 | 
						|
 | 
						|
  // Check if tokenizer can start reading a single line comment (e.g. a comment
 | 
						|
  // that begins with '//')
 | 
						|
  bool canStartLineComment() const;
 | 
						|
 | 
						|
  // Check if tokenizer can start or finish reading a block comment (e.g. a
 | 
						|
  // comment that begins with '/*' and ends with '*/')
 | 
						|
  bool canStartBlockComment() const;
 | 
						|
 | 
						|
  // Throw away all remaining characters on the current line.
 | 
						|
  void skipCurrentLine();
 | 
						|
 | 
						|
  bool streamEof() const;
 | 
						|
 | 
						|
  // Classify the token that is about to be read from the current position.
 | 
						|
  Kind classifyCurrentToken() const;
 | 
						|
 | 
						|
  // Process the Kind::Identifier token - check if it is
 | 
						|
  // an identifier describing a block start or end.
 | 
						|
  void processIdentifier(RCToken &token) const;
 | 
						|
 | 
						|
  StringRef Data;
 | 
						|
  size_t DataLength, Pos;
 | 
						|
};
 | 
						|
 | 
						|
void Tokenizer::skipCurrentLine() {
 | 
						|
  Pos = Data.find_first_of("\r\n", Pos);
 | 
						|
  Pos = Data.find_first_not_of("\r\n", Pos);
 | 
						|
 | 
						|
  if (Pos == StringRef::npos)
 | 
						|
    Pos = DataLength;
 | 
						|
}
 | 
						|
 | 
						|
// Tokenizes the whole input from the beginning.
// Returns the tokens in order of appearance; comments are consumed but not
// recorded. Returns an Error if an unrecognized token is met, if a token
// cannot be consumed, or if an integer token does not pass rcGetAsInteger.
Expected<std::vector<RCToken>> Tokenizer::run() {
  Pos = 0;
  std::vector<RCToken> Tokens;

  // Consume an optional UTF-8 Byte Order Mark.
  if (willNowRead("\xef\xbb\xbf"))
    advance(3);

  // skipWhitespaces() yields false as soon as no input is left, so it doubles
  // as the loop condition.
  while (skipWhitespaces()) {
    const Kind CurrentKind = classifyCurrentToken();
    if (CurrentKind == Kind::Invalid)
      return getStringError("Invalid token found at position " + Twine(Pos));

    const size_t Begin = Pos;
    if (Error ConsumeError = consumeToken(CurrentKind))
      return std::move(ConsumeError);

    // Comments are just deleted, don't bother saving them.
    const bool IsComment =
        CurrentKind == Kind::LineComment || CurrentKind == Kind::StartComment;
    if (IsComment)
      continue;

    // The token text is the part of the input consumed by consumeToken().
    RCToken Token(CurrentKind, Data.slice(Begin, Pos));

    if (CurrentKind == Kind::Int) {
      // The integer must have a correct format and be representable in
      // a 32-bit integer.
      uint32_t Unused;
      if (!rcGetAsInteger(Token.value(), Unused))
        return getStringError("Integer invalid or too large: " +
                              Token.value().str());
    } else if (CurrentKind == Kind::Identifier) {
      processIdentifier(Token);
    }

    Tokens.push_back(Token);
  }

  return Tokens;
}
 | 
						|
 | 
						|
bool Tokenizer::advance(size_t Amount) {
 | 
						|
  Pos += Amount;
 | 
						|
  return !streamEof();
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::skipWhitespaces() {
 | 
						|
  while (!streamEof() && std::isspace(Data[Pos]))
 | 
						|
    advance();
 | 
						|
  return !streamEof();
 | 
						|
}
 | 
						|
 | 
						|
// Consumes the token of kind TokenKind that starts at the current position,
// leaving Pos just past it. TokenKind is expected to be the result of
// classifyCurrentToken() for the current position and must not be
// Kind::Invalid.
// Returns Error::success() on success, or an Error for an unclosed block
// comment or an unterminated string literal.
Error Tokenizer::consumeToken(const Kind TokenKind) {
  switch (TokenKind) {
  // One-character token consumption. Every SHORT_TOKEN entry of the list
  // expands to a case label; TOKEN entries expand to nothing.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch) case Kind::Name:
#include "ResourceScriptTokenList.def"
    advance();
    return Error::success();

  case Kind::LineComment:
    // Skip the leading "//" and then the rest of the line.
    advance(2);
    skipCurrentLine();
    return Error::success();

  case Kind::StartComment: {
    // Remember where the comment opens so that the diagnostic really points
    // at its beginning rather than past the "/*".
    const size_t CommentStart = Pos;
    advance(2);
    const size_t EndPos = Data.find("*/", Pos);
    if (EndPos == StringRef::npos)
      return getStringError(
          "Unclosed multi-line comment beginning at position " +
          Twine(CommentStart));
    // Skip the comment body, then the closing "*/".
    advance(EndPos - Pos);
    advance(2);
    return Error::success();
  }

  case Kind::Identifier:
    while (!streamEof() && canContinueIdentifier())
      advance();
    return Error::success();

  case Kind::Int:
    while (!streamEof() && canContinueInt())
      advance();
    return Error::success();

  case Kind::String:
    // Consume the preceding 'L', if there is any. Compared directly instead
    // of through std::toupper, which must not receive a negative char.
    if (Data[Pos] == 'L' || Data[Pos] == 'l')
      advance();
    // Consume the double-quote.
    advance();

    // Consume the characters until the end of the file, line or string.
    while (true) {
      if (streamEof()) {
        return getStringError("Unterminated string literal.");
      } else if (Data[Pos] == '"') {
        // Consume the ending double-quote.
        advance();
        // However, if another '"' follows this double-quote, the string didn't
        // end and we just included '"' into the string.
        if (!willNowRead("\""))
          return Error::success();
      } else if (Data[Pos] == '\n') {
        return getStringError("String literal not terminated in the line.");
      }

      advance();
    }

  case Kind::Invalid:
    assert(false && "Cannot consume an invalid token.");
  }

  llvm_unreachable("Unknown RCToken::Kind");
}
 | 
						|
 | 
						|
// Returns true if the not-yet-consumed part of the input begins with
// FollowingChars.
bool Tokenizer::willNowRead(StringRef FollowingChars) const {
  const StringRef Remaining = Data.drop_front(Pos);
  return Remaining.startswith(FollowingChars);
}
 | 
						|
 | 
						|
bool Tokenizer::canStartIdentifier() const {
 | 
						|
  assert(!streamEof());
 | 
						|
 | 
						|
  const char CurChar = Data[Pos];
 | 
						|
  return std::isalpha(CurChar) || CurChar == '_' || CurChar == '.';
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canContinueIdentifier() const {
 | 
						|
  assert(!streamEof());
 | 
						|
  const char CurChar = Data[Pos];
 | 
						|
  return std::isalnum(CurChar) || CurChar == '_' || CurChar == '.' ||
 | 
						|
         CurChar == '/' || CurChar == '\\';
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canStartInt() const {
 | 
						|
  assert(!streamEof());
 | 
						|
  return std::isdigit(Data[Pos]);
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canStartBlockComment() const {
 | 
						|
  assert(!streamEof());
 | 
						|
  return Data.drop_front(Pos).startswith("/*");
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canStartLineComment() const {
 | 
						|
  assert(!streamEof());
 | 
						|
  return Data.drop_front(Pos).startswith("//");
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canContinueInt() const {
 | 
						|
  assert(!streamEof());
 | 
						|
  return std::isalnum(Data[Pos]);
 | 
						|
}
 | 
						|
 | 
						|
bool Tokenizer::canStartString() const {
 | 
						|
  return willNowRead("\"") || willNowRead("L\"") || willNowRead("l\"");
 | 
						|
}
 | 
						|
 | 
						|
// Returns true once the reading position has reached the end of the input.
// '>=' rather than '==' is used defensively: should the position ever move
// past the end, the stream is still reported as finished instead of letting
// the callers index out of bounds.
bool Tokenizer::streamEof() const { return Pos >= DataLength; }
 | 
						|
 | 
						|
// Determines the kind of the token starting at the current position without
// consuming anything. Returns Kind::Invalid if nothing matches.
// The checks are ordered: comments first, then integers, strings and
// identifiers (so that L"..." is a string rather than an identifier), and
// finally the one-character tokens.
Kind Tokenizer::classifyCurrentToken() const {
  if (canStartBlockComment())
    return Kind::StartComment;
  if (canStartLineComment())
    return Kind::LineComment;

  if (canStartInt())
    return Kind::Int;
  if (canStartString())
    return Kind::String;
  // BEGIN and END are at this point of lexing recognized as identifiers.
  if (canStartIdentifier())
    return Kind::Identifier;

  switch (Data[Pos]) {
  // One-character token classification. Every SHORT_TOKEN entry of the list
  // maps its character to the corresponding kind; TOKEN entries expand to
  // nothing.
#define TOKEN(Name)
#define SHORT_TOKEN(Name, Ch)                                                  \
  case Ch:                                                                     \
    return Kind::Name;
#include "ResourceScriptTokenList.def"

  default:
    return Kind::Invalid;
  }
}
 | 
						|
 | 
						|
// Reclassifies an identifier token: the words "begin" and "end" (compared
// case-insensitively) become Kind::BlockBegin and Kind::BlockEnd tokens with
// the same text. Any other identifier is left untouched.
void Tokenizer::processIdentifier(RCToken &Token) const {
  assert(Token.kind() == Kind::Identifier);
  const StringRef Text = Token.value();

  if (Text.equals_lower("begin")) {
    Token = RCToken(Kind::BlockBegin, Text);
    return;
  }

  if (Text.equals_lower("end"))
    Token = RCToken(Kind::BlockEnd, Text);
}
 | 
						|
 | 
						|
} // anonymous namespace
 | 
						|
 | 
						|
namespace llvm {
 | 
						|
 | 
						|
// Tokenizes the given .rc script contents. Returns the resulting tokens or
// the Error produced by the tokenizer.
Expected<std::vector<RCToken>> tokenizeRC(StringRef Input) {
  Tokenizer Lexer(Input);
  return Lexer.run();
}
 | 
						|
 | 
						|
} // namespace llvm
 |