forked from OSchip/llvm-project
				
			
		
			
				
	
	
		
			281 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			C++
		
	
	
	
			
		
		
	
	
			281 lines
		
	
	
		
			9.9 KiB
		
	
	
	
		
			C++
		
	
	
	
| //===-- working_set.cpp ---------------------------------------------------===//
 | |
| //
 | |
| //                     The LLVM Compiler Infrastructure
 | |
| //
 | |
| // This file is distributed under the University of Illinois Open Source
 | |
| // License. See LICENSE.TXT for details.
 | |
| //
 | |
| //===----------------------------------------------------------------------===//
 | |
| //
 | |
| // This file is a part of EfficiencySanitizer, a family of performance tuners.
 | |
| //
 | |
| // This file contains working-set-specific code.
 | |
| //===----------------------------------------------------------------------===//
 | |
| 
 | |
| #include "working_set.h"
 | |
| #include "esan.h"
 | |
| #include "esan_circular_buffer.h"
 | |
| #include "esan_flags.h"
 | |
| #include "esan_shadow.h"
 | |
| #include "esan_sideline.h"
 | |
| #include "sanitizer_common/sanitizer_procmaps.h"
 | |
| 
 | |
| // We shadow every cache line of app memory with one shadow byte.
 | |
| // - The highest bit of each shadow byte indicates whether the corresponding
 | |
| //   cache line has ever been accessed.
 | |
| // - The lowest bit of each shadow byte indicates whether the corresponding
 | |
| //   cache line was accessed since the last sample.
 | |
| // - The other bits are used for working set snapshots at successively
 | |
| //   lower frequencies, each bit to the left from the lowest bit stepping
 | |
| //   down the frequency by 2 to the power of getFlags()->snapshot_step.
 | |
| // Thus we have something like this:
 | |
| //   Bit 0: Since last sample
 | |
| //   Bit 1: Since last 2^2 samples
 | |
| //   Bit 2: Since last 2^4 samples
 | |
| //   Bit 3: ...
 | |
| //   Bit 7: Ever accessed.
 | |
| // We live with races in accessing each shadow byte.
 | |
| typedef unsigned char byte;
 | |
| 
 | |
| namespace __esan {
 | |
| 
 | |
| // Our shadow memory assumes that the line size is 64.
 | |
| static const u32 CacheLineSize = 64;
 | |
| 
 | |
| // See the shadow byte layout description above.
 | |
| static const u32 TotalWorkingSetBitIdx = 7;
 | |
| // We accumulate to the left until we hit this bit.
 | |
| // We don't need to accumulate to the final bit as it's set on each ref
 | |
| // by the compiler instrumentation.
 | |
| static const u32 MaxAccumBitIdx = 6;
 | |
| static const u32 CurWorkingSetBitIdx = 0;
 | |
| static const byte ShadowAccessedVal =
 | |
|   (1 << TotalWorkingSetBitIdx) | (1 << CurWorkingSetBitIdx);
 | |
| 
 | |
| static SidelineThread Thread;
 | |
| // If we use real-time-based timer samples this won't overflow in any realistic
 | |
| // scenario, but if we switch to some other unit (such as memory accesses) we
 | |
| // may want to consider a 64-bit int.
 | |
| static u32 SnapshotNum;
 | |
| 
 | |
| // We store the wset size for each of 8 different sampling frequencies.
 | |
| static const u32 NumFreq = 8; // One for each bit of our shadow bytes.
 | |
| // We cannot use static objects as the global destructor is called
 | |
| // prior to our finalize routine.
 | |
| // These are each circular buffers, sized up front.
 | |
| CircularBuffer<u32> SizePerFreq[NumFreq];
 | |
| // We cannot rely on static initializers (they may run too late) but
 | |
| // we record the size here for clarity:
 | |
| u32 CircularBufferSizes[NumFreq] = {
 | |
|   // These are each mmap-ed so our minimum is one page.
 | |
|   32*1024,
 | |
|   16*1024,
 | |
|   8*1024,
 | |
|   4*1024,
 | |
|   4*1024,
 | |
|   4*1024,
 | |
|   4*1024,
 | |
|   4*1024,
 | |
| };
 | |
| 
 | |
| void processRangeAccessWorkingSet(uptr PC, uptr Addr, SIZE_T Size,
 | |
|                                   bool IsWrite) {
 | |
|   if (Size == 0)
 | |
|     return;
 | |
|   SIZE_T I = 0;
 | |
|   uptr LineSize = getFlags()->cache_line_size;
 | |
|   // As Addr+Size could overflow at the top of a 32-bit address space,
 | |
|   // we avoid the simpler formula that rounds the start and end.
 | |
|   SIZE_T NumLines = Size / LineSize +
 | |
|     // Add any extra at the start or end adding on an extra line:
 | |
|     (LineSize - 1 + Addr % LineSize + Size % LineSize) / LineSize;
 | |
|   byte *Shadow = (byte *)appToShadow(Addr);
 | |
|   // Write shadow bytes until we're word-aligned.
 | |
|   while (I < NumLines && (uptr)Shadow % 4 != 0) {
 | |
|     if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
 | |
|       *Shadow |= ShadowAccessedVal;
 | |
|     ++Shadow;
 | |
|     ++I;
 | |
|   }
 | |
|   // Write whole shadow words at a time.
 | |
|   // Using a word-stride loop improves the runtime of a microbenchmark of
 | |
|   // memset calls by 10%.
 | |
|   u32 WordValue = ShadowAccessedVal | ShadowAccessedVal << 8 |
 | |
|     ShadowAccessedVal << 16 | ShadowAccessedVal << 24;
 | |
|   while (I + 4 <= NumLines) {
 | |
|     if ((*(u32*)Shadow & WordValue) != WordValue)
 | |
|       *(u32*)Shadow |= WordValue;
 | |
|     Shadow += 4;
 | |
|     I += 4;
 | |
|   }
 | |
|   // Write any trailing shadow bytes.
 | |
|   while (I < NumLines) {
 | |
|     if ((*Shadow & ShadowAccessedVal) != ShadowAccessedVal)
 | |
|       *Shadow |= ShadowAccessedVal;
 | |
|     ++Shadow;
 | |
|     ++I;
 | |
|   }
 | |
| }
 | |
| 
 | |
| // This routine will word-align ShadowStart and ShadowEnd prior to scanning.
 | |
| // It does *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
 | |
| // measures the access during the entire execution and should never be cleared.
 | |
| static u32 countAndClearShadowValues(u32 BitIdx, uptr ShadowStart,
 | |
|                                      uptr ShadowEnd) {
 | |
|   u32 WorkingSetSize = 0;
 | |
|   u32 ByteValue = 0x1 << BitIdx;
 | |
|   u32 WordValue = ByteValue | ByteValue << 8 | ByteValue << 16 |
 | |
|     ByteValue << 24;
 | |
|   // Get word aligned start.
 | |
|   ShadowStart = RoundDownTo(ShadowStart, sizeof(u32));
 | |
|   bool Accum = getFlags()->record_snapshots && BitIdx < MaxAccumBitIdx;
 | |
|   // Do not clear the bit that measures access during the entire execution.
 | |
|   bool Clear = BitIdx < TotalWorkingSetBitIdx;
 | |
|   for (u32 *Ptr = (u32 *)ShadowStart; Ptr < (u32 *)ShadowEnd; ++Ptr) {
 | |
|     if ((*Ptr & WordValue) != 0) {
 | |
|       byte *BytePtr = (byte *)Ptr;
 | |
|       for (u32 j = 0; j < sizeof(u32); ++j) {
 | |
|         if (BytePtr[j] & ByteValue) {
 | |
|           ++WorkingSetSize;
 | |
|           if (Accum) {
 | |
|             // Accumulate to the lower-frequency bit to the left.
 | |
|             BytePtr[j] |= (ByteValue << 1);
 | |
|           }
 | |
|         }
 | |
|       }
 | |
|       if (Clear) {
 | |
|         // Clear this bit from every shadow byte.
 | |
|         *Ptr &= ~WordValue;
 | |
|       }
 | |
|     }
 | |
|   }
 | |
|   return WorkingSetSize;
 | |
| }
 | |
| 
 | |
| // Scan shadow memory to calculate the number of cache lines being accessed,
 | |
| // i.e., the number of non-zero bits indexed by BitIdx in each shadow byte.
 | |
| // We also clear the lowest bits (most recent working set snapshot).
 | |
| // We do *not* clear for BitIdx==TotalWorkingSetBitIdx, as that top bit
 | |
| // measures the access during the entire execution and should never be cleared.
 | |
| static u32 computeWorkingSizeAndReset(u32 BitIdx) {
 | |
|   u32 WorkingSetSize = 0;
 | |
|   MemoryMappingLayout MemIter(true/*cache*/);
 | |
|   MemoryMappedSegment Segment;
 | |
|   while (MemIter.Next(&Segment)) {
 | |
|     VPrintf(4, "%s: considering %p-%p app=%d shadow=%d prot=%u\n", __FUNCTION__,
 | |
|             Segment.start, Segment.end, Segment.protection,
 | |
|             isAppMem(Segment.start), isShadowMem(Segment.start));
 | |
|     if (isShadowMem(Segment.start) && Segment.IsWritable()) {
 | |
|       VPrintf(3, "%s: walking %p-%p\n", __FUNCTION__, Segment.start,
 | |
|               Segment.end);
 | |
|       WorkingSetSize +=
 | |
|           countAndClearShadowValues(BitIdx, Segment.start, Segment.end);
 | |
|     }
 | |
|   }
 | |
|   return WorkingSetSize;
 | |
| }
 | |
| 
 | |
| // This is invoked from a signal handler but in a sideline thread doing nothing
 | |
| // else so it is a little less fragile than a typical signal handler.
 | |
| static void takeSample(void *Arg) {
 | |
|   u32 BitIdx = CurWorkingSetBitIdx;
 | |
|   u32 Freq = 1;
 | |
|   ++SnapshotNum; // Simpler to skip 0 whose mod matches everything.
 | |
|   while (BitIdx <= MaxAccumBitIdx && (SnapshotNum % Freq) == 0) {
 | |
|     u32 NumLines = computeWorkingSizeAndReset(BitIdx);
 | |
|     VReport(1, "%s: snapshot #%5d bit %d freq %4d: %8u\n", SanitizerToolName,
 | |
|             SnapshotNum, BitIdx, Freq, NumLines);
 | |
|     SizePerFreq[BitIdx].push_back(NumLines);
 | |
|     Freq = Freq << getFlags()->snapshot_step;
 | |
|     BitIdx++;
 | |
|   }
 | |
| }
 | |
| 
 | |
| unsigned int getSampleCountWorkingSet()
 | |
| {
 | |
|   return SnapshotNum;
 | |
| }
 | |
| 
 | |
| // Initialization that must be done before any instrumented code is executed.
 | |
| void initializeShadowWorkingSet() {
 | |
|   CHECK(getFlags()->cache_line_size == CacheLineSize);
 | |
|   registerMemoryFaultHandler();
 | |
| }
 | |
| 
 | |
| void initializeWorkingSet() {
 | |
|   if (getFlags()->record_snapshots) {
 | |
|     for (u32 i = 0; i < NumFreq; ++i)
 | |
|       SizePerFreq[i].initialize(CircularBufferSizes[i]);
 | |
|     Thread.launchThread(takeSample, nullptr, getFlags()->sample_freq);
 | |
|   }
 | |
| }
 | |
| 
 | |
| static u32 getPeriodForPrinting(u32 MilliSec, const char *&Unit) {
 | |
|   if (MilliSec > 600000) {
 | |
|     Unit = "min";
 | |
|     return MilliSec / 60000;
 | |
|   } else if (MilliSec > 10000) {
 | |
|     Unit = "sec";
 | |
|     return MilliSec / 1000;
 | |
|   } else {
 | |
|     Unit = "ms";
 | |
|     return MilliSec;
 | |
|   }
 | |
| }
 | |
| 
 | |
| static u32 getSizeForPrinting(u32 NumOfCachelines, const char *&Unit) {
 | |
|   // We need a constant to avoid software divide support:
 | |
|   static const u32 KilobyteCachelines = (0x1 << 10) / CacheLineSize;
 | |
|   static const u32 MegabyteCachelines = KilobyteCachelines << 10;
 | |
| 
 | |
|   if (NumOfCachelines > 10 * MegabyteCachelines) {
 | |
|     Unit = "MB";
 | |
|     return NumOfCachelines / MegabyteCachelines;
 | |
|   } else if (NumOfCachelines > 10 * KilobyteCachelines) {
 | |
|     Unit = "KB";
 | |
|     return NumOfCachelines / KilobyteCachelines;
 | |
|   } else {
 | |
|     Unit = "Bytes";
 | |
|     return NumOfCachelines * CacheLineSize;
 | |
|   }
 | |
| }
 | |
| 
 | |
| void reportWorkingSet() {
 | |
|   const char *Unit;
 | |
|   if (getFlags()->record_snapshots) {
 | |
|     u32 Freq = 1;
 | |
|     Report(" Total number of samples: %u\n", SnapshotNum);
 | |
|     for (u32 i = 0; i < NumFreq; ++i) {
 | |
|       u32 Time = getPeriodForPrinting(getFlags()->sample_freq*Freq, Unit);
 | |
|       Report(" Samples array #%d at period %u %s\n", i, Time, Unit);
 | |
|       // FIXME: report whether we wrapped around and thus whether we
 | |
|       // have data on the whole run or just the last N samples.
 | |
|       for (u32 j = 0; j < SizePerFreq[i].size(); ++j) {
 | |
|         u32 Size = getSizeForPrinting(SizePerFreq[i][j], Unit);
 | |
|         Report("#%4d: %8u %s (%9u cache lines)\n", j, Size, Unit,
 | |
|                SizePerFreq[i][j]);
 | |
|       }
 | |
|       Freq = Freq << getFlags()->snapshot_step;
 | |
|     }
 | |
|   }
 | |
| 
 | |
|   // Get the working set size for the entire execution.
 | |
|   u32 NumOfCachelines = computeWorkingSizeAndReset(TotalWorkingSetBitIdx);
 | |
|   u32 Size = getSizeForPrinting(NumOfCachelines, Unit);
 | |
|   Report(" %s: the total working set size: %u %s (%u cache lines)\n",
 | |
|          SanitizerToolName, Size, Unit, NumOfCachelines);
 | |
| }
 | |
| 
 | |
| int finalizeWorkingSet() {
 | |
|   if (getFlags()->record_snapshots)
 | |
|     Thread.joinThread();
 | |
|   reportWorkingSet();
 | |
|   if (getFlags()->record_snapshots) {
 | |
|     for (u32 i = 0; i < NumFreq; ++i)
 | |
|       SizePerFreq[i].free();
 | |
|   }
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| } // namespace __esan
 |