Parallelize string merging.
String merging is one of the most time-consuming functions in lld. This patch parallelizes it to speed it up. On my 2-socket, 20-core, 40-thread Xeon E5-2680 @ 2.8 GHz machine, this patch shortens the clang debug build link time from 7.11s to 5.16s. That is a 27% improvement and quite noticeable in practice. Under this test condition, lld is now 4x faster than gold.

Differential Revision: https://reviews.llvm.org/D38266

llvm-svn: 314588
This commit is contained in:
parent 4db732a7db
commit c97a70c6f5
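
To make the idea behind the change concrete, here is a minimal, self-contained sketch (illustration only, not code from this patch; the helper shardOf, the shard count, and the input values are made up): two strings whose hash values differ can never be equal, so the inputs can be partitioned into hash-based shards and each shard can be deduplicated by its own thread without locking.

// Explanatory sketch only (not lld's code): hash-sharded, lock-free
// parallel string deduplication. Each worker owns one shard and only
// touches strings whose hash maps to that shard.
#include <cstddef>
#include <functional>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

static constexpr size_t NumShards = 8; // made-up value for the example

// Strings with different hashes cannot be equal, so shards are independent.
static size_t shardOf(size_t Hash) { return Hash % NumShards; }

int main() {
  std::vector<std::string> Inputs = {"foo", "bar", "foo", "baz", "bar"};

  // One deduplication table per shard; no locking is needed because each
  // table is written by exactly one thread.
  std::vector<std::unordered_map<std::string, size_t>> Shards(NumShards);

  std::vector<std::thread> Workers;
  for (size_t Shard = 0; Shard < NumShards; ++Shard)
    Workers.emplace_back([&, Shard] {
      for (const std::string &S : Inputs) {
        size_t H = std::hash<std::string>{}(S);
        if (shardOf(H) != Shard)
          continue; // some other worker owns this string
        // First insertion wins; duplicates are merged away.
        Shards[Shard].emplace(S, Shards[Shard].size());
      }
    });
  for (std::thread &T : Workers)
    T.join();
  return 0;
}
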
@@ -37,6 +37,7 @@
 #include "llvm/Support/SHA1.h"
 #include "llvm/Support/xxhash.h"
 #include <cstdlib>
+#include <thread>

 using namespace llvm;
 using namespace llvm::dwarf;

@@ -48,6 +49,8 @@ using namespace llvm::support::endian;
 using namespace lld;
 using namespace lld::elf;

+const size_t MergeNoTailSection::NumShards;
+
 uint64_t SyntheticSection::getVA() const {
   if (OutputSection *Sec = getParent())
     return Sec->Addr + OutSecOff;

@@ -2181,19 +2184,19 @@ template <class ELFT> bool VersionNeedSection<ELFT>::empty() const {
   return getNeedNum() == 0;
 }

-MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type,
-                                             uint64_t Flags, uint32_t Alignment)
-    : SyntheticSection(Flags, Type, Alignment, Name),
-      Builder(StringTableBuilder::RAW, Alignment) {}
-
 void MergeSyntheticSection::addSection(MergeInputSection *MS) {
   MS->Parent = this;
   Sections.push_back(MS);
 }

-size_t MergeSyntheticSection::getSize() const { return Builder.getSize(); }
+MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type,
+                                   uint64_t Flags, uint32_t Alignment)
+    : MergeSyntheticSection(Name, Type, Flags, Alignment),
+      Builder(StringTableBuilder::RAW, Alignment) {}

-void MergeSyntheticSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
+size_t MergeTailSection::getSize() const { return Builder.getSize(); }
+
+void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }

 void MergeTailSection::finalizeContents() {
   // Add all string pieces to the string table builder to create section

@@ -2215,17 +2218,63 @@ void MergeTailSection::finalizeContents()
         Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
 }

+void MergeNoTailSection::writeTo(uint8_t *Buf) {
+  for (size_t I = 0; I < NumShards; ++I)
+    Shards[I].write(Buf + ShardOffsets[I]);
+}
+
+// This function is very hot (i.e. it can take several seconds to finish)
+// because sometimes the number of inputs is in an order of magnitude of
+// millions. So, we use multi-threading.
+//
+// For any strings S and T, we know S is not mergeable with T if S's hash
+// value is different from T's. If that's the case, we can safely put S and
+// T into different string builders without worrying about merge misses.
+// We do it in parallel.
 void MergeNoTailSection::finalizeContents() {
-  // Add all string pieces to the string table builder to create section
-  // contents. Because we are not tail-optimizing, offsets of strings are
-  // fixed when they are added to the builder (string table builder contains
-  // a hash table from strings to offsets).
-  for (MergeInputSection *Sec : Sections)
+  // Initializes string table builders.
+  for (size_t I = 0; I < NumShards; ++I)
+    Shards.emplace_back(StringTableBuilder::RAW, Alignment);
+
+  // Concurrency level. Must be a power of 2.
+  size_t Concurrency = 1;
+  if (Config->Threads)
+    if (int N = std::thread::hardware_concurrency())
+      Concurrency = std::min(PowerOf2Floor(N), NumShards);
+
+  // Add section pieces to the builders.
+  parallelForEachN(0, Concurrency, [&](size_t ThreadId) {
+    for (MergeInputSection *Sec : Sections) {
+      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+        if (!Sec->Pieces[I].Live)
+          continue;
+        CachedHashStringRef Str = Sec->getData(I);
+        size_t ShardId = getShardId(Str.hash());
+        if ((ShardId & (Concurrency - 1)) == ThreadId)
+          Sec->Pieces[I].OutputOff = Shards[ShardId].add(Str);
+      }
+    }
+  });
+
+  // Compute an in-section offset for each shard.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumShards; ++I) {
+    Shards[I].finalizeInOrder();
+    if (Shards[I].getSize() > 0)
+      Off = alignTo(Off, Alignment);
+    ShardOffsets[I] = Off;
+    Off += Shards[I].getSize();
+  }
+  Size = Off;
+
+  // So far, section pieces have offsets from beginning of shards, but
+  // we want offsets from beginning of the whole section. Fix them.
+  parallelForEach(Sections, [&](MergeInputSection *Sec) {
     for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
       if (Sec->Pieces[I].Live)
-        Sec->Pieces[I].OutputOff = Builder.add(Sec->getData(I));
-
-  Builder.finalizeInOrder();
+        Sec->Pieces[I].OutputOff +=
+            ShardOffsets[getShardId(Sec->getData(I).hash())];
+  });
 }

 static MergeSyntheticSection *createMergeSynthetic(StringRef Name,

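
A side note on the offset bookkeeping in the hunk above (again just a sketch with invented shard sizes, not the patch's code): each shard's builder assigns offsets relative to the start of its own shard, so a piece's final, section-relative offset is its shard-local offset plus the accumulated, alignment-padded sizes of the preceding shards.

// Sketch: converting shard-local offsets to section-relative offsets.
// Shard sizes and the alignment are invented example values.
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  const uint64_t Alignment = 4;
  const uint64_t ShardSizes[] = {10, 0, 7, 13};
  uint64_t ShardOffsets[4];

  uint64_t Off = 0;
  for (int I = 0; I < 4; ++I) {
    if (ShardSizes[I] > 0)
      Off = alignTo(Off, Alignment); // pad only before non-empty shards
    ShardOffsets[I] = Off;
    Off += ShardSizes[I];
  }
  // A piece at shard-local offset L in shard I lands at ShardOffsets[I] + L.
  // For the sizes above this prints: offsets 0 10 12 20, section size 33.
  std::printf("offsets: %llu %llu %llu %llu, section size: %llu\n",
              (unsigned long long)ShardOffsets[0],
              (unsigned long long)ShardOffsets[1],
              (unsigned long long)ShardOffsets[2],
              (unsigned long long)ShardOffsets[3],
              (unsigned long long)Off);
  return 0;
}
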
@@ -668,24 +668,26 @@ public:
 class MergeSyntheticSection : public SyntheticSection {
 public:
   void addSection(MergeInputSection *MS);
-  size_t getSize() const override;
-  void writeTo(uint8_t *Buf) override;

 protected:
   MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                        uint32_t Alignment);
+                        uint32_t Alignment)
+      : SyntheticSection(Flags, Type, Alignment, Name) {}

   std::vector<MergeInputSection *> Sections;
-  llvm::StringTableBuilder Builder;
 };

 class MergeTailSection final : public MergeSyntheticSection {
 public:
   MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                   uint32_t Alignment)
-      : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
+                   uint32_t Alignment);

+  size_t getSize() const override;
+  void writeTo(uint8_t *Buf) override;
   void finalizeContents() override;
+
+private:
+  llvm::StringTableBuilder Builder;
 };

 class MergeNoTailSection final : public MergeSyntheticSection {

@@ -694,7 +696,27 @@ public:
                      uint32_t Alignment)
       : MergeSyntheticSection(Name, Type, Flags, Alignment) {}

+  size_t getSize() const override { return Size; }
+  void writeTo(uint8_t *Buf) override;
   void finalizeContents() override;
+
+private:
+  // We use the most significant bits of a hash as a shard ID.
+  // The reason why we don't want to use the least significant bits is
+  // because DenseMap also uses lower bits to determine a bucket ID.
+  // If we use lower bits, it significantly increases the probability of
+  // hash collisions.
+  size_t getShardId(uint32_t Hash) {
+    return Hash >> (32 - llvm::countTrailingZeros(NumShards));
+  }
+
+  // Section size
+  size_t Size;
+
+  // String table contents
+  constexpr static size_t NumShards = 32;
+  std::vector<llvm::StringTableBuilder> Shards;
+  size_t ShardOffsets[NumShards];
 };

 // .MIPS.abiflags section.

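
For a quick numeric illustration of getShardId above (standalone sketch; example values only, and a GCC/Clang builtin stands in for llvm::countTrailingZeros): with NumShards = 32, countTrailingZeros(32) is 5, so the shift amount is 32 - 5 = 27 and the top five bits of the 32-bit hash select the shard. As the comment explains, taking the high bits keeps the shard choice independent of DenseMap's bucket index, which is derived from the low bits.

// Sketch of the shard-ID calculation.
#include <cstdint>
#include <cstdio>

static constexpr uint32_t NumShards = 32; // must be a power of two

static uint32_t getShardId(uint32_t Hash) {
  // __builtin_ctz(32) == 5, so this keeps the top 5 bits of the hash.
  return Hash >> (32 - __builtin_ctz(NumShards));
}

int main() {
  std::printf("%u\n", getShardId(0xDEADBEEFu)); // top 5 bits of 0xDEADBEEF are 27
  return 0;
}
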
@@ -5,8 +5,7 @@
 # RUN: llvm-objdump -s %t1 | FileCheck %s

 # CHECK: Contents of section .comment:
-# CHECK-NEXT: 0000 00666f6f 00626172 004c4c44 20312e30 .foo.bar.LLD 1.0
-# CHECK-NEXT: 0010 00 .
+# CHECK-NEXT: foo.LLD 1.0..bar

 .ident "foo"

@@ -61,11 +61,11 @@
 # DATA-NEXT: AddressAlignment: 1
 # DATA-NEXT: EntrySize: 0
 # DATA-NEXT: SectionData (
-# DATA-NEXT: 0000: 73686F72 7420756E 7369676E 65642069 |short unsigned i|
-# DATA-NEXT: 0010: 6E740075 6E736967 6E656420 696E7400 |nt.unsigned int.|
+# DATA-NEXT: 0000: 756E7369 676E6564 20696E74 00636861 |unsigned int.cha|
+# DATA-NEXT: 0010: 7200756E 7369676E 65642063 68617200 |r.unsigned char.|
 # DATA-NEXT: 0020: 6C6F6E67 20756E73 69676E65 6420696E |long unsigned in|
-# DATA-NEXT: 0030: 74006368 61720075 6E736967 6E656420 |t.char.unsigned |
-# DATA-NEXT: 0040: 63686172 00 |char.|
+# DATA-NEXT: 0030: 74007368 6F727420 756E7369 676E6564 |t.short unsigned|
+# DATA-NEXT: 0040: 20696E74 00 | int.|
 # DATA-NEXT: )
 # DATA-NEXT: }

@@ -54,7 +54,7 @@ zed:
 // NOTAIL-NEXT: AddressAlignment: 1
 // NOTAIL-NEXT: EntrySize: 0
 // NOTAIL-NEXT: SectionData (
-// NOTAIL-NEXT: 0000: 61626300 626300 |abc.bc.|
+// NOTAIL-NEXT: 0000: 62630061 626300 |bc.abc.|
 // NOTAIL-NEXT: )

 // NOMERGE: Name: .rodata1

@@ -14,7 +14,7 @@
 // CHECK-NEXT: }
 // CHECK-NEXT: Symbol {
 // CHECK-NEXT: Name: s3
-// CHECK-NEXT: Value: 0x200125
+// CHECK-NEXT: Value: 0x200120
 // CHECK-NEXT: Size: 0
 // CHECK-NEXT: Binding: Local (0x0)
 // CHECK-NEXT: Type: Object (0x1)

@@ -23,7 +23,7 @@
 // CHECK-NEXT: }
 // CHECK-NEXT: Symbol {
 // CHECK-NEXT: Name: s1
-// CHECK-NEXT: Value: 0x200120
+// CHECK-NEXT: Value: 0x200125
 // CHECK-NEXT: Size: 0
 // CHECK-NEXT: Binding: Local (0x0)
 // CHECK-NEXT: Type: Object (0x1)