[AST] Store the string data in StringLiteral in a trailing array of chars

Use the newly available space in the bit-fields of Stmt and store the string data in a trailing array of chars after the trailing array of SourceLocation. This cuts the size of StringLiteral by 2 pointers. Also slightly refactor StringLiteral::Create and StringLiteral::CreateEmpty so that StringLiteral::Create is only responsible for the allocation, and the constructor is responsible for doing all the initialization. This matches what is generally done for the other classes. This patch should have no other functional changes apart from this. A concern was raised during review about the interaction between this patch and serialization abbreviations. However, I believe that there is currently no abbreviation defined for StringLiteral: the only statements/expressions which currently have abbreviations are DeclRefExpr, IntegerLiteral, CharacterLiteral and ImplicitCastExpr. Differential Revision: https://reviews.llvm.org/D54166 Reviewed By: dblaikie, rjmccall llvm-svn: 346969
2018-11-15 17:31:16 +00:00 · 2018-11-15 17:31:16 +00:00 · b94ad1e1d3
parent bc56b2432d
commit b94ad1e1d3
5 changed files with 235 additions and 152 deletions
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@ -1568,98 +1568,131 @@ public:
 ///   char X[2] = "foobar";
 /// In this case, getByteLength() will return 6, but the string literal will
 /// have type "char[2]".
-class StringLiteral : public Expr {
+class StringLiteral final
    : public Expr,
      private llvm::TrailingObjects<StringLiteral, unsigned, SourceLocation,
                                    char> {
  friend class ASTStmtReader;
  friend TrailingObjects;
  /// StringLiteral is followed by several trailing objects. They are in order:
  ///
  /// * A single unsigned storing the length in characters of this string. The
  ///   length in bytes is this length times the width of a single character.
  ///   Always present and stored as a trailing object because storing it in
  ///   StringLiteral would increase the size of StringLiteral by sizeof(void *)
  ///   due to alignment requirements. If you add some data to StringLiteral,
  ///   consider moving it inside StringLiteral.
  ///
  /// * An array of getNumConcatenated() SourceLocation, one for each of the
  ///   tokens this string is made of.
  ///
  /// * An array of getByteLength() char used to store the string data.
 public:
  enum StringKind { Ascii, Wide, UTF8, UTF16, UTF32 };
 private:
-  friend class ASTStmtReader;
+  unsigned numTrailingObjects(OverloadToken<unsigned>) const { return 1; }
  /// Size of the trailing SourceLocation array: getNumConcatenated() entries.
  unsigned numTrailingObjects(OverloadToken<SourceLocation>) const {
    const unsigned NumTokLocs = getNumConcatenated();
    return NumTokLocs;
  }
-  union {
+  unsigned numTrailingObjects(OverloadToken<char>) const {
-    const char *asChar;
+    return getByteLength();
-    const uint16_t *asUInt16;
+  }
    const uint32_t *asUInt32;
  } StrData;
  unsigned Length;
  unsigned CharByteWidth : 4;
  unsigned Kind : 3;
  unsigned IsPascal : 1;
  unsigned NumConcatenated;
  SourceLocation TokLocs[1];
-  StringLiteral(QualType Ty) :
+  char *getStrDataAsChar() { return getTrailingObjects<char>(); }
-    Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false,
+  const char *getStrDataAsChar() const { return getTrailingObjects<char>(); }
-         false) {}
+
  /// Reinterpret the trailing array of char as an array of uint16_t.
  const uint16_t *getStrDataAsUInt16() const {
    const char *RawBytes = getTrailingObjects<char>();
    return reinterpret_cast<const uint16_t *>(RawBytes);
  }
  /// Reinterpret the trailing array of char as an array of uint32_t.
  const uint32_t *getStrDataAsUInt32() const {
    const char *RawBytes = getTrailingObjects<char>();
    return reinterpret_cast<const uint32_t *>(RawBytes);
  }
  /// Build a string literal.
  StringLiteral(const ASTContext &Ctx, StringRef Str, StringKind Kind,
                bool Pascal, QualType Ty, const SourceLocation *Loc,
                unsigned NumConcatenated);
  /// Build an empty string literal.
  StringLiteral(EmptyShell Empty, unsigned NumConcatenated, unsigned Length,
                unsigned CharByteWidth);
  /// Map a target and string kind to the appropriate character width.
  static unsigned mapCharByteWidth(TargetInfo const &Target, StringKind SK);
  /// Set one of the string literal token.
  void setStrTokenLoc(unsigned TokNum, SourceLocation L) {
    assert(TokNum < getNumConcatenated() && "Invalid tok number");
    getTrailingObjects<SourceLocation>()[TokNum] = L;
  }
 public:
  /// This is the "fully general" constructor that allows representation of
  /// strings formed from multiple concatenated tokens.
-  static StringLiteral *Create(const ASTContext &C, StringRef Str,
+  static StringLiteral *Create(const ASTContext &Ctx, StringRef Str,
                               StringKind Kind, bool Pascal, QualType Ty,
-                               const SourceLocation *Loc, unsigned NumStrs);
+                               const SourceLocation *Loc,
                               unsigned NumConcatenated);
  /// Simple constructor for string literals made from one token.
-  static StringLiteral *Create(const ASTContext &C, StringRef Str,
+  static StringLiteral *Create(const ASTContext &Ctx, StringRef Str,
                               StringKind Kind, bool Pascal, QualType Ty,
                               SourceLocation Loc) {
-    return Create(C, Str, Kind, Pascal, Ty, &Loc, 1);
+    return Create(Ctx, Str, Kind, Pascal, Ty, &Loc, 1);
  }
  /// Construct an empty string literal.
-  static StringLiteral *CreateEmpty(const ASTContext &C, unsigned NumStrs);
+  static StringLiteral *CreateEmpty(const ASTContext &Ctx,
                                    unsigned NumConcatenated, unsigned Length,
                                    unsigned CharByteWidth);
  StringRef getString() const {
-    assert(CharByteWidth==1
+    assert(getCharByteWidth() == 1 &&
-           && "This function is used in places that assume strings use char");
+           "This function is used in places that assume strings use char");
-    return StringRef(StrData.asChar, getByteLength());
+    return StringRef(getStrDataAsChar(), getByteLength());
  }
  /// Allow access to clients that need the byte representation, such as
  /// ASTWriterStmt::VisitStringLiteral().
  StringRef getBytes() const {
    // FIXME: StringRef may not be the right type to use as a result for this.
-    if (CharByteWidth == 1)
+    return StringRef(getStrDataAsChar(), getByteLength());
      return StringRef(StrData.asChar, getByteLength());
    if (CharByteWidth == 4)
      return StringRef(reinterpret_cast<const char*>(StrData.asUInt32),
                       getByteLength());
    assert(CharByteWidth == 2 && "unsupported CharByteWidth");
    return StringRef(reinterpret_cast<const char*>(StrData.asUInt16),
                     getByteLength());
  }
  void outputString(raw_ostream &OS) const;
  uint32_t getCodeUnit(size_t i) const {
-    assert(i < Length && "out of bounds access");
+    assert(i < getLength() && "out of bounds access");
-    if (CharByteWidth == 1)
+    switch (getCharByteWidth()) {
-      return static_cast<unsigned char>(StrData.asChar[i]);
+    case 1:
-    if (CharByteWidth == 4)
+      return static_cast<unsigned char>(getStrDataAsChar()[i]);
-      return StrData.asUInt32[i];
+    case 2:
-    assert(CharByteWidth == 2 && "unsupported CharByteWidth");
+      return getStrDataAsUInt16()[i];
-    return StrData.asUInt16[i];
+    case 4:
      return getStrDataAsUInt32()[i];
    }
    llvm_unreachable("Unsupported character width!");
  }
-  unsigned getByteLength() const { return CharByteWidth*Length; }
+  unsigned getByteLength() const { return getCharByteWidth() * getLength(); }
-  unsigned getLength() const { return Length; }
+  unsigned getLength() const { return *getTrailingObjects<unsigned>(); }
-  unsigned getCharByteWidth() const { return CharByteWidth; }
+  unsigned getCharByteWidth() const { return StringLiteralBits.CharByteWidth; }
-  /// Sets the string data to the given string data.
+  StringKind getKind() const {
-  void setString(const ASTContext &C, StringRef Str,
+    return static_cast<StringKind>(StringLiteralBits.Kind);
-                 StringKind Kind, bool IsPascal);
+  }
-  StringKind getKind() const { return static_cast<StringKind>(Kind); }
+  bool isAscii() const { return getKind() == Ascii; }
-
+  bool isWide() const { return getKind() == Wide; }
-
+  bool isUTF8() const { return getKind() == UTF8; }
-  bool isAscii() const { return Kind == Ascii; }
+  bool isUTF16() const { return getKind() == UTF16; }
-  bool isWide() const { return Kind == Wide; }
+  bool isUTF32() const { return getKind() == UTF32; }
-  bool isUTF8() const { return Kind == UTF8; }
+  bool isPascal() const { return StringLiteralBits.IsPascal; }
  bool isUTF16() const { return Kind == UTF16; }
  bool isUTF32() const { return Kind == UTF32; }
  bool isPascal() const { return IsPascal; }
  bool containsNonAscii() const {
    for (auto c : getString())
@ -1677,15 +1710,14 @@ public:
  /// getNumConcatenated - Get the number of string literal tokens that were
  /// concatenated in translation phase #6 to form this string literal.
-  unsigned getNumConcatenated() const { return NumConcatenated; }
+  unsigned getNumConcatenated() const {
-
+    return StringLiteralBits.NumConcatenated;
  SourceLocation getStrTokenLoc(unsigned TokNum) const {
    assert(TokNum < NumConcatenated && "Invalid tok number");
    return TokLocs[TokNum];
  }
-  void setStrTokenLoc(unsigned TokNum, SourceLocation L) {
+
-    assert(TokNum < NumConcatenated && "Invalid tok number");
+  /// Get the location of one of the string literal tokens.
-    TokLocs[TokNum] = L;
+  SourceLocation getStrTokenLoc(unsigned TokNum) const {
    assert(TokNum < getNumConcatenated() && "Invalid tok number");
    return getTrailingObjects<SourceLocation>()[TokNum];
  }
  /// getLocationOfByte - Return a source location that points to the specified
@ -1702,14 +1734,18 @@ public:
                    unsigned *StartTokenByteOffset = nullptr) const;
  typedef const SourceLocation *tokloc_iterator;
  tokloc_iterator tokloc_begin() const { return TokLocs; }
  tokloc_iterator tokloc_end() const { return TokLocs + NumConcatenated; }
-  SourceLocation getBeginLoc() const LLVM_READONLY { return TokLocs[0]; }
+  tokloc_iterator tokloc_begin() const {
-  SourceLocation getEndLoc() const LLVM_READONLY {
+    return getTrailingObjects<SourceLocation>();
    return TokLocs[NumConcatenated - 1];
  }
  /// Iterator one past the last entry of the trailing SourceLocation array.
  tokloc_iterator tokloc_end() const {
    const SourceLocation *FirstLoc = getTrailingObjects<SourceLocation>();
    return FirstLoc + getNumConcatenated();
  }
  /// Location stored first in the token location array.
  SourceLocation getBeginLoc() const LLVM_READONLY {
    tokloc_iterator FirstTok = tokloc_begin();
    return *FirstTok;
  }
  /// Location stored last in the token location array.
  SourceLocation getEndLoc() const LLVM_READONLY {
    tokloc_iterator LastTok = tokloc_end() - 1;
    return *LastTok;
  }
  /// Returns true when the statement class of \p T is StringLiteralClass.
  static bool classof(const Stmt *T) {
    const bool IsStringLiteral = T->getStmtClass() == StringLiteralClass;
    return IsStringLiteral;
  }
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@ -366,6 +366,28 @@ protected:
    unsigned IsExact : 1;
  };
  /// Bit-fields used by StringLiteral. The members are private; only the
  /// befriended ASTStmtReader and StringLiteral can access them.
  class StringLiteralBitfields {
    friend class ASTStmtReader;
    friend class StringLiteral;
    // Unnamed bit-field of NumExprBits bits.
    // NOTE(review): presumably skips over the bits already used by the Expr
    // bit-fields sharing this storage -- confirm against ExprBitfields.
    unsigned : NumExprBits;
    /// The kind of this string literal.
    /// One of the enumeration values of StringLiteral::StringKind.
    unsigned Kind : 3;
    /// The width of a single character in bytes. Only values of 1, 2,
    /// and 4 bytes are supported. StringLiteral::mapCharByteWidth maps
    /// the target + string kind to the appropriate CharByteWidth.
    unsigned CharByteWidth : 3;
    /// The Pascal flag of this string literal, as returned by
    /// StringLiteral::isPascal().
    unsigned IsPascal : 1;
    /// The number of concatenated tokens this string is made of.
    /// This is the number of trailing SourceLocation.
    unsigned NumConcatenated;
  };
  class CharacterLiteralBitfields {
    friend class CharacterLiteral;
@ -566,6 +588,7 @@ protected:
    PredefinedExprBitfields PredefinedExprBits;
    DeclRefExprBitfields DeclRefExprBits;
    FloatingLiteralBitfields FloatingLiteralBits;
    StringLiteralBitfields StringLiteralBits;
    CharacterLiteralBitfields CharacterLiteralBits;
    UnaryOperatorBitfields UnaryOperatorBits;
    UnaryExprOrTypeTraitExprBitfields UnaryExprOrTypeTraitExprBits;
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@ -912,42 +912,80 @@ unsigned StringLiteral::mapCharByteWidth(TargetInfo const &Target,
  return CharByteWidth;
 }
-StringLiteral *StringLiteral::Create(const ASTContext &C, StringRef Str,
+StringLiteral::StringLiteral(const ASTContext &Ctx, StringRef Str,
                             StringKind Kind, bool Pascal, QualType Ty,
                             const SourceLocation *Loc,
-                                     unsigned NumStrs) {
+                             unsigned NumConcatenated)
-  assert(C.getAsConstantArrayType(Ty) &&
+    : Expr(StringLiteralClass, Ty, VK_LValue, OK_Ordinary, false, false, false,
           false) {
  assert(Ctx.getAsConstantArrayType(Ty) &&
         "StringLiteral must be of constant array type!");
  unsigned CharByteWidth = mapCharByteWidth(Ctx.getTargetInfo(), Kind);
  unsigned ByteLength = Str.size();
  assert((ByteLength % CharByteWidth == 0) &&
         "The size of the data must be a multiple of CharByteWidth!");
-  // Allocate enough space for the StringLiteral plus an array of locations for
+  // Avoid the expensive division. The compiler should be able to figure it
-  // any concatenated string tokens.
+  // out by itself. However as of clang 7, even with the appropriate
-  void *Mem =
+  // llvm_unreachable added just here, it is not able to do so.
-      C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1),
+  unsigned Length;
-                 alignof(StringLiteral));
+  switch (CharByteWidth) {
-  StringLiteral *SL = new (Mem) StringLiteral(Ty);
+  case 1:
-
+    Length = ByteLength;
-  // OPTIMIZE: could allocate this appended to the StringLiteral.
+    break;
-  SL->setString(C,Str,Kind,Pascal);
+  case 2:
-
+    Length = ByteLength / 2;
-  SL->TokLocs[0] = Loc[0];
+    break;
-  SL->NumConcatenated = NumStrs;
+  case 4:
-
+    Length = ByteLength / 4;
-  if (NumStrs != 1)
+    break;
-    memcpy(&SL->TokLocs[1], Loc+1, sizeof(SourceLocation)*(NumStrs-1));
+  default:
-  return SL;
+    llvm_unreachable("Unsupported character width!");
  }
-StringLiteral *StringLiteral::CreateEmpty(const ASTContext &C,
+  StringLiteralBits.Kind = Kind;
-                                          unsigned NumStrs) {
+  StringLiteralBits.CharByteWidth = CharByteWidth;
-  void *Mem =
+  StringLiteralBits.IsPascal = Pascal;
-      C.Allocate(sizeof(StringLiteral) + sizeof(SourceLocation) * (NumStrs - 1),
+  StringLiteralBits.NumConcatenated = NumConcatenated;
  *getTrailingObjects<unsigned>() = Length;
  // Initialize the trailing array of SourceLocation.
  // This is safe since SourceLocation is POD-like.
  std::memcpy(getTrailingObjects<SourceLocation>(), Loc,
              NumConcatenated * sizeof(SourceLocation));
  // Initialize the trailing array of char holding the string data.
  std::memcpy(getTrailingObjects<char>(), Str.data(), ByteLength);
 }
 /// Build an empty string literal.
 ///
 /// Only CharByteWidth, NumConcatenated and the trailing length are recorded
 /// here. Kind, IsPascal, the token locations and the string data are not
 /// touched by this constructor; ASTStmtReader::VisitStringLiteral fills them
 /// in afterwards.
 StringLiteral::StringLiteral(EmptyShell Empty, unsigned NumConcatenated,
                             unsigned Length, unsigned CharByteWidth)
    : Expr(StringLiteralClass, Empty) {
  StringLiteralBits.CharByteWidth = CharByteWidth;
  StringLiteralBits.NumConcatenated = NumConcatenated;
  // The length in characters lives in the single trailing unsigned.
  *getTrailingObjects<unsigned>() = Length;
 }
 StringLiteral *StringLiteral::Create(const ASTContext &Ctx, StringRef Str,
                                     StringKind Kind, bool Pascal, QualType Ty,
                                     const SourceLocation *Loc,
                                     unsigned NumConcatenated) {
  void *Mem = Ctx.Allocate(totalSizeToAlloc<unsigned, SourceLocation, char>(
                               1, NumConcatenated, Str.size()),
                           alignof(StringLiteral));
-  StringLiteral *SL =
+  return new (Mem)
-      new (Mem) StringLiteral(C.adjustStringLiteralBaseType(QualType()));
+      StringLiteral(Ctx, Str, Kind, Pascal, Ty, Loc, NumConcatenated);
-  SL->CharByteWidth = 0;
+}
-  SL->Length = 0;
+
-  SL->NumConcatenated = NumStrs;
+StringLiteral *StringLiteral::CreateEmpty(const ASTContext &Ctx,
-  return SL;
+                                          unsigned NumConcatenated,
                                          unsigned Length,
                                          unsigned CharByteWidth) {
  void *Mem = Ctx.Allocate(totalSizeToAlloc<unsigned, SourceLocation, char>(
                               1, NumConcatenated, Length * CharByteWidth),
                           alignof(StringLiteral));
  return new (Mem)
      StringLiteral(EmptyShell(), NumConcatenated, Length, CharByteWidth);
 }
 void StringLiteral::outputString(raw_ostream &OS) const {
@ -1046,42 +1084,6 @@ void StringLiteral::outputString(raw_ostream &OS) const {
  OS << '"';
 }
 /// Sets the string data to the given string data.
 ///
 /// Records \p Kind and \p IsPascal, derives CharByteWidth from the target via
 /// mapCharByteWidth, and stores a copy of \p Str in a freshly allocated array
 /// whose element type matches CharByteWidth (char, uint16_t or uint32_t).
 /// Str.size() must be a multiple of CharByteWidth.
 void StringLiteral::setString(const ASTContext &C, StringRef Str,
                              StringKind Kind, bool IsPascal) {
  //FIXME: we assume that the string data comes from a target that uses the same
  // code unit size and endianness for the type of string.
  this->Kind = Kind;
  this->IsPascal = IsPascal;
  CharByteWidth = mapCharByteWidth(C.getTargetInfo(),Kind);
  assert((Str.size()%CharByteWidth == 0)
         && "size of data must be multiple of CharByteWidth");
  // Length counts code units, not bytes.
  Length = Str.size()/CharByteWidth;
  // Each case allocates Length code units with placement-new on C, copies the
  // bytes of Str into them and points the matching StrData member at the copy.
  switch(CharByteWidth) {
    case 1: {
      char *AStrData = new (C) char[Length];
      std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData));
      StrData.asChar = AStrData;
      break;
    }
    case 2: {
      uint16_t *AStrData = new (C) uint16_t[Length];
      std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData));
      StrData.asUInt16 = AStrData;
      break;
    }
    case 4: {
      uint32_t *AStrData = new (C) uint32_t[Length];
      std::memcpy(AStrData,Str.data(),Length*sizeof(*AStrData));
      StrData.asUInt32 = AStrData;
      break;
    }
    default:
      llvm_unreachable("unsupported CharByteWidth");
  }
 }
 /// getLocationOfByte - Return a source location that points to the specified
 /// byte of this string literal.
 ///
--- a/clang/lib/Serialization/ASTReaderStmt.cpp
+++ b/clang/lib/Serialization/ASTReaderStmt.cpp
@ -595,22 +595,35 @@ void ASTStmtReader::VisitImaginaryLiteral(ImaginaryLiteral *E) {
 void ASTStmtReader::VisitStringLiteral(StringLiteral *E) {
  VisitExpr(E);
-  unsigned Len = Record.readInt();
+
-  assert(Record.peekInt() == E->getNumConcatenated() &&
+  // NumConcatenated, Length and CharByteWidth are set by the empty
  // ctor since they are needed to allocate storage for the trailing objects.
  unsigned NumConcatenated = Record.readInt();
  unsigned Length = Record.readInt();
  unsigned CharByteWidth = Record.readInt();
  assert((NumConcatenated == E->getNumConcatenated()) &&
         "Wrong number of concatenated tokens!");
-  Record.skipInts(1);
+  assert((Length == E->getLength()) && "Wrong Length!");
-  auto kind = static_cast<StringLiteral::StringKind>(Record.readInt());
+  assert((CharByteWidth == E->getCharByteWidth()) && "Wrong character width!");
-  bool isPascal = Record.readInt();
+  E->StringLiteralBits.Kind = Record.readInt();
  E->StringLiteralBits.IsPascal = Record.readInt();
-  // Read string data
+  // The character width is originally computed via mapCharByteWidth.
-  auto B = &Record.peekInt();
+  // Check that the deserialized character width is consistent with the result
-  SmallString<16> Str(B, B + Len);
+  // of calling mapCharByteWidth.
-  E->setString(Record.getContext(), Str, kind, isPascal);
+  assert((CharByteWidth ==
-  Record.skipInts(Len);
+          StringLiteral::mapCharByteWidth(Record.getContext().getTargetInfo(),
                                          E->getKind())) &&
         "Wrong character width!");
-  // Read source locations
+  // Deserialize the trailing array of SourceLocation.
-  for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I)
+  for (unsigned I = 0; I < NumConcatenated; ++I)
    E->setStrTokenLoc(I, ReadSourceLocation());
  // Deserialize the trailing array of char holding the string data.
  char *StrData = E->getStrDataAsChar();
  for (unsigned I = 0; I < Length * CharByteWidth; ++I)
    StrData[I] = Record.readInt();
 }
 void ASTStmtReader::VisitCharacterLiteral(CharacterLiteral *E) {
@ -2423,8 +2436,11 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) {
      break;
    case EXPR_STRING_LITERAL:
-      S = StringLiteral::CreateEmpty(Context,
+      S = StringLiteral::CreateEmpty(
-                                     Record[ASTStmtReader::NumExprFields + 1]);
+          Context,
          /* NumConcatenated=*/Record[ASTStmtReader::NumExprFields + 0],
          /* Length=*/Record[ASTStmtReader::NumExprFields + 1],
          /* CharByteWidth=*/Record[ASTStmtReader::NumExprFields + 2]);
      break;
    case EXPR_CHARACTER_LITERAL:
--- a/clang/lib/Serialization/ASTWriterStmt.cpp
+++ b/clang/lib/Serialization/ASTWriterStmt.cpp
@ -518,17 +518,23 @@ void ASTStmtWriter::VisitImaginaryLiteral(ImaginaryLiteral *E) {
 void ASTStmtWriter::VisitStringLiteral(StringLiteral *E) {
  VisitExpr(E);
-  Record.push_back(E->getByteLength());
+
  // Store the various bits of data of StringLiteral.
  Record.push_back(E->getNumConcatenated());
  Record.push_back(E->getLength());
  Record.push_back(E->getCharByteWidth());
  Record.push_back(E->getKind());
  Record.push_back(E->isPascal());
-  // FIXME: String data should be stored as a blob at the end of the
+
-  // StringLiteral. However, we can't do so now because we have no
+  // Store the trailing array of SourceLocation.
  // provision for coping with abbreviations when we're jumping around
  // the AST file during deserialization.
  Record.append(E->getBytes().begin(), E->getBytes().end());
  for (unsigned I = 0, N = E->getNumConcatenated(); I != N; ++I)
    Record.AddSourceLocation(E->getStrTokenLoc(I));
  // Store the trailing array of char holding the string data.
  StringRef StrData = E->getBytes();
  for (unsigned I = 0, N = E->getByteLength(); I != N; ++I)
    Record.push_back(StrData[I]);
  Code = serialization::EXPR_STRING_LITERAL;
 }