Simplify and generalize the SROA "convert to scalar" transformation to
be able to handle *ANY* alloca that is poked by loads and stores of
bitcasts and GEPs with constant offsets. Before, the code had a number
of annoying limitations that caused it to miss cases such as storing into
holes in structs and complex casts (as in bitfield-sroa) where we had
unions of bitfields, etc. This also handles a number of important cases
that are exposed by the ABI lowering we do to pass aggregates by value.
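For a concrete example of the "storing into holes in structs / complex
casts" pattern this now handles, here is the new regression test added at
the bottom of this commit (an i64 store over a struct of bitfield-like
fields, read back through bitcasts at constant offsets):

%t = type { { i32, i16, i8, i8 } }

define i8 @foo(i64 %A) {
  %ALL = alloca %t, align 8
  %tmp59172 = bitcast %t* %ALL to i64*
  store i64 %A, i64* %tmp59172, align 8
  %C = getelementptr %t* %ALL, i32 0, i32 0, i32 1
  %D = bitcast i16* %C to i32*
  %E = load i32* %D, align 4
  %F = bitcast %t* %ALL to i8*
  %G = load i8* %F, align 8
  ret i8 %G
}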
One case that is pretty great is that we compile
2006-11-07-InvalidArrayPromote.ll into:
define i32 @func(<4 x float> %v0, <4 x float> %v1) nounwind {
%tmp10 = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %v1)
%tmp105 = bitcast <4 x i32> %tmp10 to i128
%tmp1056 = zext i128 %tmp105 to i256
%tmp.upgrd.43 = lshr i256 %tmp1056, 96
%tmp.upgrd.44 = trunc i256 %tmp.upgrd.43 to i32
ret i32 %tmp.upgrd.44
}
which turns into:
_func:
subl $28, %esp
cvttps2dq %xmm1, %xmm0
movaps %xmm0, (%esp)
movl 12(%esp), %eax
addl $28, %esp
ret
Which is pretty good code, all things considered :).
One effect of this is that SROA will start generating arbitrary-bitwidth
integers that are a multiple of 8 bits. In the case above, we got a
256-bit integer, but the codegen guys assure me that it can handle the
simple and/or/shift/zext stuff that we're doing on these operations.
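To make the and/or/shift/zext claim concrete, here is a hand-written
sketch of what the store path (ConvertUsesOfStoreToScalar below) produces
for an i16 stored at byte offset 4 of an alloca converted to i64, on a
little-endian target; the value names are illustrative, not compiler
output:

%old = load i64* %A
%v64 = zext i16 %v to i64
%shl = shl i64 %v64, 32
%msk = and i64 %old, -281470681743361    ; clear bits 32..47, i.e. ~(0xFFFF << 32)
%ins = or i64 %msk, %shl
store i64 %ins, i64* %A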
This addresses rdar://6532315
llvm-svn: 63469
parent db7c5f6a7b / commit ec99c46d44
@@ -125,13 +125,14 @@ namespace {
     void RewriteLoadUserOfWholeAlloca(LoadInst *LI, AllocationInst *AI,
                                       SmallVector<AllocaInst*, 32> &NewElts);
 
-    const Type *CanConvertToScalar(Value *V, bool &IsNotTrivial);
+    bool CanConvertToScalar(Value *V, bool &IsNotTrivial, const Type *&ResTy,
+                            uint64_t Offset);
     void ConvertToScalar(AllocationInst *AI, const Type *Ty);
-    void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset);
+    void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset);
     Value *ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
-                                     unsigned Offset);
+                                     uint64_t Offset);
     Value *ConvertUsesOfStoreToScalar(StoreInst *SI, AllocaInst *NewAI,
-                                      unsigned Offset);
+                                      uint64_t Offset);
     static Instruction *isOnlyCopiedFromConstantGlobal(AllocationInst *AI);
   };
 }
@@ -271,9 +272,15 @@ bool SROA::performScalarRepl(Function &F) {
 
     // If we can turn this aggregate value (potentially with casts) into a
     // simple scalar value that can be mem2reg'd into a register value.
+    // IsNotTrivial tracks whether this is something that mem2reg could have
+    // promoted itself.  If so, we don't want to transform it needlessly.  Note
+    // that we can't just check based on the type: the alloca may be of an i32
+    // but that has pointer arithmetic to set byte 3 of it or something.
     bool IsNotTrivial = false;
-    if (const Type *ActualType = CanConvertToScalar(AI, IsNotTrivial))
-      if (IsNotTrivial && ActualType != Type::VoidTy) {
+    const Type *ActualType = 0;
+    if (CanConvertToScalar(AI, IsNotTrivial, ActualType, 0))
+      if (IsNotTrivial && ActualType &&
+          TD->getTypeSizeInBits(ActualType) < SRThreshold*8) {
         ConvertToScalar(AI, ActualType);
         Changed = true;
         continue;
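A quick sanity check of the new size guard, assuming the 128-byte default
for SRThreshold that I recall from this era of the pass (treat the default
as an assumption):

// guard: TD->getTypeSizeInBits(ActualType) < SRThreshold*8
//   i256 from the example above:  256 < 128*8 = 1024  -> converted
//   an i8192 (1 KiB blob):       8192 >= 1024         -> not converted to a scalar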
@@ -1145,229 +1152,124 @@ void SROA::CanonicalizeAllocaUsers(AllocationInst *AI) {
   }
 }
 
-/// MergeInType - Add the 'In' type to the accumulated type so far.  If the
-/// types are incompatible, return true, otherwise update Accum and return
-/// false.
+/// MergeInType - Add the 'In' type to the accumulated type (Accum) so far at
+/// the offset specified by Offset (which is specified in bytes).
 ///
-/// There are three cases we handle here:
-///   1) An effectively-integer union, where the pieces are stored into as
-///      smaller integers (common with byte swap and other idioms).
-///   2) A union of vector types of the same size and potentially its elements.
+/// There are two cases we handle here:
+///   1) A union of vector types of the same size and potentially its elements.
 ///      Here we turn element accesses into insert/extract element operations.
-///   3) A union of scalar types, such as int/float or int/pointer.  Here we
-///      merge together into integers, allowing the xform to work with #1 as
-///      well.
-static bool MergeInType(const Type *In, const Type *&Accum,
+///      This promotes a <4 x float> with a store of float to the third element
+///      into a <4 x float> that uses insert element.
+///   2) A fully general blob of memory, which we turn into some (potentially
+///      large) integer type with extract and insert operations where the loads
+///      and stores would mutate the memory.
+static void MergeInType(const Type *In, uint64_t Offset, const Type *&Accum,
                         const TargetData &TD) {
   // If this is our first type, just use it.
-  const VectorType *PTy;
-  if (Accum == Type::VoidTy || In == Accum) {
+  if (Accum == 0 || In == Type::VoidTy ||
+      // Or if this is a same type, keep it.
+      (In == Accum && Offset == 0)) {
     Accum = In;
-  } else if (In == Type::VoidTy) {
-    // Noop.
-  } else if (In->isInteger() && Accum->isInteger()) {   // integer union.
-    // Otherwise pick whichever type is larger.
-    if (cast<IntegerType>(In)->getBitWidth() > 
-        cast<IntegerType>(Accum)->getBitWidth())
-      Accum = In;
-  } else if (isa<PointerType>(In) && isa<PointerType>(Accum)) {
-    // Pointer unions just stay as one of the pointers.
-  } else if (isa<VectorType>(In) || isa<VectorType>(Accum)) {
-    if ((PTy = dyn_cast<VectorType>(Accum)) && 
-        PTy->getElementType() == In) {
-      // Accum is a vector, and we are accessing an element: ok.
-    } else if ((PTy = dyn_cast<VectorType>(In)) && 
-               PTy->getElementType() == Accum) {
-      // In is a vector, and accum is an element: ok, remember In.
-      Accum = In;
-    } else if ((PTy = dyn_cast<VectorType>(In)) && isa<VectorType>(Accum) &&
-               PTy->getBitWidth() == cast<VectorType>(Accum)->getBitWidth()) {
-      // Two vectors of the same size: keep Accum.
-    } else {
-      // Cannot insert an short into a <4 x int> or handle
-      // <2 x int> -> <4 x int>
-      return true;
-    }
-  } else {
-    // Pointer/FP/Integer unions merge together as integers.
-    switch (Accum->getTypeID()) {
-    case Type::PointerTyID: Accum = TD.getIntPtrType(); break;
-    case Type::FloatTyID:   Accum = Type::Int32Ty; break;
-    case Type::DoubleTyID:  Accum = Type::Int64Ty; break;
-    case Type::X86_FP80TyID:  return true;
-    case Type::FP128TyID: return true;
-    case Type::PPC_FP128TyID: return true;
-    default:
-      assert(Accum->isInteger() && "Unknown FP type!");
-      break;
-    }
-    
-    switch (In->getTypeID()) {
-    case Type::PointerTyID: In = TD.getIntPtrType(); break;
-    case Type::FloatTyID:   In = Type::Int32Ty; break;
-    case Type::DoubleTyID:  In = Type::Int64Ty; break;
-    case Type::X86_FP80TyID:  return true;
-    case Type::FP128TyID: return true;
-    case Type::PPC_FP128TyID: return true;
-    default:
-      assert(In->isInteger() && "Unknown FP type!");
-      break;
-    }
-    return MergeInType(In, Accum, TD);
+    return;
   }
-  return false;
+  
+  if (const VectorType *VATy = dyn_cast<VectorType>(Accum)) {
+    if (VATy->getElementType() == In &&
+        Offset % TD.getTypePaddedSize(In) == 0 &&
+        Offset < TD.getTypePaddedSize(VATy))
+      return;  // Accum is a vector, and we are accessing an element: ok.
+    if (const VectorType *VInTy = dyn_cast<VectorType>(In))
+      if (VInTy->getBitWidth() == VATy->getBitWidth() && Offset == 0)
+        return; // Two vectors of the same size: keep either one of them.
+  }
+
+  if (const VectorType *VInTy = dyn_cast<VectorType>(In)) {
+    // In is a vector, and we are accessing an element: keep V.
+    if (VInTy->getElementType() == Accum &&
+        Offset % TD.getTypePaddedSize(Accum) == 0 &&
+        Offset < TD.getTypePaddedSize(VInTy)) {
+      Accum = VInTy;
+      return;
+    }
+  }
+
+  // Otherwise, we have a case that we can't handle with an optimized form.
+  // Convert the alloca to an integer that is as large as the largest store size
+  // of the value values.
+  uint64_t InSize = TD.getTypeStoreSizeInBits(In)+8*Offset;
+  uint64_t ASize  = TD.getTypeStoreSizeInBits(Accum);
+  if (InSize > ASize) ASize = InSize;
+  Accum = IntegerType::get(ASize);
 }
 
-/// getIntAtLeastAsBigAs - Return an integer type that is at least as big as the
-/// specified type.  If there is no suitable type, this returns null.
-const Type *getIntAtLeastAsBigAs(unsigned NumBits) {
-  if (NumBits > 64) return 0;
-  if (NumBits > 32) return Type::Int64Ty;
-  if (NumBits > 16) return Type::Int32Ty;
-  if (NumBits > 8) return Type::Int16Ty;
-  return Type::Int8Ty;
-}
-
-/// CanConvertToScalar - V is a pointer.  If we can convert the pointee to a
-/// single scalar integer type, return that type.  Further, if the use is not
-/// a completely trivial use that mem2reg could promote, set IsNotTrivial.  If
-/// there are no uses of this pointer, return Type::VoidTy to differentiate from
-/// failure.
+/// CanConvertToScalar - V is a pointer.  If we can convert the pointee and all
+/// its accesses to use a to single scalar type, return true, and set ResTy to
+/// the new type.  Further, if the use is not a completely trivial use that
+/// mem2reg could promote, set IsNotTrivial.  Offset is the current offset from
+/// the base of the alloca being analyzed.
 ///
-const Type *SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial) {
-  const Type *UsedType = Type::VoidTy; // No uses, no forced type.
-  const PointerType *PTy = cast<PointerType>(V->getType());
-
+bool SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial,
+                              const Type *&ResTy, uint64_t Offset) {
   for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
     Instruction *User = cast<Instruction>(*UI);
     
     if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+      // Don't break volatile loads.
       if (LI->isVolatile())
-        return 0;
-
-      // FIXME: Loads of a first class aggregrate value could be converted to a
-      // series of loads and insertvalues
-      if (!LI->getType()->isSingleValueType())
-        return 0;
-
-      if (MergeInType(LI->getType(), UsedType, *TD))
-        return 0;
+        return false;
+      MergeInType(LI->getType(), Offset, ResTy, *TD);
       continue;
     }
     
     if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
       // Storing the pointer, not into the value?
       if (SI->getOperand(0) == V || SI->isVolatile()) return 0;
-
-      // FIXME: Stores of a first class aggregrate value could be converted to a
-      // series of extractvalues and stores
-      if (!SI->getOperand(0)->getType()->isSingleValueType())
-        return 0;
-      
-      // NOTE: We could handle storing of FP imms into integers here!
-      
-      if (MergeInType(SI->getOperand(0)->getType(), UsedType, *TD))
-        return 0;
+      MergeInType(SI->getOperand(0)->getType(), Offset, ResTy, *TD);
       continue;
     }
-    if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
+
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+      if (!CanConvertToScalar(BCI, IsNotTrivial, ResTy, Offset))
+        return false;
       IsNotTrivial = true;
-      const Type *SubTy = CanConvertToScalar(CI, IsNotTrivial);
-      if (!SubTy || MergeInType(SubTy, UsedType, *TD)) return 0;
       continue;
     }
 
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
-      // Check to see if this is stepping over an element: GEP Ptr, int C
-      if (GEP->getNumOperands() == 2 && isa<ConstantInt>(GEP->getOperand(1))) {
-        unsigned Idx = cast<ConstantInt>(GEP->getOperand(1))->getZExtValue();
-        unsigned ElSize = TD->getTypePaddedSize(PTy->getElementType());
-        unsigned BitOffset = Idx*ElSize*8;
-        if (BitOffset > 64 || !isPowerOf2_32(ElSize)) return 0;
-        
-        IsNotTrivial = true;
-        const Type *SubElt = CanConvertToScalar(GEP, IsNotTrivial);
-        if (SubElt == 0) return 0;
-        if (SubElt != Type::VoidTy && SubElt->isInteger()) {
-          const Type *NewTy = 
-            getIntAtLeastAsBigAs(TD->getTypePaddedSizeInBits(SubElt)+BitOffset);
-          if (NewTy == 0 || MergeInType(NewTy, UsedType, *TD)) return 0;
-          continue;
-        }
-        // Cannot handle this!
-        return 0;
-      }
-      
-      if (GEP->getNumOperands() == 3 && 
-          isa<ConstantInt>(GEP->getOperand(1)) &&
-          isa<ConstantInt>(GEP->getOperand(2)) &&
-          cast<ConstantInt>(GEP->getOperand(1))->isZero()) {
-        // We are stepping into an element, e.g. a structure or an array:
-        // GEP Ptr, i32 0, i32 Cst
-        const Type *AggTy = PTy->getElementType();
-        unsigned Idx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
-        
-        if (const ArrayType *ATy = dyn_cast<ArrayType>(AggTy)) {
-          if (Idx >= ATy->getNumElements()) return 0;  // Out of range.
-        } else if (const VectorType *VectorTy = dyn_cast<VectorType>(AggTy)) {
-          // Getting an element of the vector.
-          if (Idx >= VectorTy->getNumElements()) return 0;  // Out of range.
-
-          // Merge in the vector type.
-          if (MergeInType(VectorTy, UsedType, *TD)) return 0;
-          
-          const Type *SubTy = CanConvertToScalar(GEP, IsNotTrivial);
-          if (SubTy == 0) return 0;
-
-          if (SubTy != Type::VoidTy && MergeInType(SubTy, UsedType, *TD))
-            return 0;
-
-          // We'll need to change this to an insert/extract element operation.
-          IsNotTrivial = true;
-          continue;    // Everything looks ok
-
-        } else if (isa<StructType>(AggTy)) {
-          // Structs are always ok.
-        } else {
-          return 0;
-        }
-        const Type *NTy =
-          getIntAtLeastAsBigAs(TD->getTypePaddedSizeInBits(AggTy));
-        if (NTy == 0 || MergeInType(NTy, UsedType, *TD)) return 0;
-        const Type *SubTy = CanConvertToScalar(GEP, IsNotTrivial);
-        if (SubTy == 0) return 0;
-        if (SubTy != Type::VoidTy && MergeInType(SubTy, UsedType, *TD))
-          return 0;
-        continue;    // Everything looks ok
-      }
-      return 0;
+      // If this is a GEP with a variable indices, we can't handle it.
+      if (!GEP->hasAllConstantIndices())
+        return false;
+      
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+                                                &Indices[0], Indices.size());
+      
+      // See if all uses can be converted.
+      if (!CanConvertToScalar(GEP, IsNotTrivial, ResTy, Offset+GEPOffset))
+        return false;
+      IsNotTrivial = true;
+      continue;
     }
     
-    // Cannot handle this!
-    return 0;
+    // Otherwise, we cannot handle this!
+    return false;
   }
   
-  return UsedType;
+  return true;
 }
 
 /// ConvertToScalar - The specified alloca passes the CanConvertToScalar
 /// predicate and is non-trivial.  Convert it to something that can be trivially
 /// promoted into a register by mem2reg.
 void SROA::ConvertToScalar(AllocationInst *AI, const Type *ActualTy) {
-  DOUT << "CONVERT TO SCALAR: " << *AI << "  TYPE = "
-       << *ActualTy << "\n";
+  DOUT << "CONVERT TO SCALAR: " << *AI << "  TYPE = " << *ActualTy << "\n";
   ++NumConverted;
   
-  BasicBlock *EntryBlock = AI->getParent();
-  assert(EntryBlock == &EntryBlock->getParent()->getEntryBlock() &&
-         "Not in the entry block!");
-  EntryBlock->getInstList().remove(AI);  // Take the alloca out of the program.
-  
   // Create and insert the alloca.
   AllocaInst *NewAI = new AllocaInst(ActualTy, 0, AI->getName(),
-                                     EntryBlock->begin());
+                                     AI->getParent()->begin());
   ConvertUsesToScalar(AI, NewAI, 0);
-  delete AI;
+  AI->eraseFromParent();
 }
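As a worked example of the integer fallback at the end of the new
MergeInType: merging an i32 access at byte offset 8 into an accumulated
i64 gives InSize = 32 + 8*8 = 96 and ASize = 64, so the alloca is retyped
as i96. In IR terms (a hypothetical sketch, not taken from this commit's
tests):

%p = alloca { i64, i32 }        ; accesses seen: i64 at offset 0, i32 at offset 8
;   MergeInType(i64, Offset=0): Accum = i64
;   MergeInType(i32, Offset=8): InSize = 32 + 64 = 96 > 64, so Accum = i96
%p2 = alloca i96                ; what ConvertToScalar will create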
@@ -1378,22 +1280,19 @@ void SROA::ConvertToScalar(AllocationInst *AI, const Type *ActualTy) {
 ///
 /// Offset is an offset from the original alloca, in bits that need to be
 /// shifted to the right.  By the end of this, there should be no uses of Ptr.
-void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset) {
+void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, uint64_t Offset) {
   while (!Ptr->use_empty()) {
     Instruction *User = cast<Instruction>(Ptr->use_back());
     
     if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
-      Value *NV = ConvertUsesOfLoadToScalar(LI, NewAI, Offset);
-      LI->replaceAllUsesWith(NV);
+      LI->replaceAllUsesWith(ConvertUsesOfLoadToScalar(LI, NewAI, Offset));
       LI->eraseFromParent();
       continue;
     }
 
     if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
       assert(SI->getOperand(0) != Ptr && "Consistency error!");
-      Value *SV = ConvertUsesOfStoreToScalar(SI, NewAI, Offset);
-      new StoreInst(SV, NewAI, SI);
+      new StoreInst(ConvertUsesOfStoreToScalar(SI, NewAI, Offset), NewAI, SI);
       SI->eraseFromParent();
       continue;
     }
@@ -1405,45 +1304,14 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset) {
     }
 
     if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
-      const PointerType *AggPtrTy = 
-        cast<PointerType>(GEP->getOperand(0)->getType());
-      unsigned AggSizeInBits =
-        TD->getTypePaddedSizeInBits(AggPtrTy->getElementType());
-
-      // Check to see if this is stepping over an element: GEP Ptr, int C
-      unsigned NewOffset = Offset;
-      if (GEP->getNumOperands() == 2) {
-        unsigned Idx = cast<ConstantInt>(GEP->getOperand(1))->getZExtValue();
-        unsigned BitOffset = Idx*AggSizeInBits;
-        
-        NewOffset += BitOffset;
-        ConvertUsesToScalar(GEP, NewAI, NewOffset);
-        GEP->eraseFromParent();
-        continue;
-      }
-      
-      assert(GEP->getNumOperands() == 3 && "Unsupported operation");
-      
-      // We know that operand #2 is zero.
-      unsigned Idx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
-      const Type *AggTy = AggPtrTy->getElementType();
-      if (const SequentialType *SeqTy = dyn_cast<SequentialType>(AggTy)) {
-        unsigned ElSizeBits =
-          TD->getTypePaddedSizeInBits(SeqTy->getElementType());
-
-        NewOffset += ElSizeBits*Idx;
-      } else {
-        const StructType *STy = cast<StructType>(AggTy);
-        unsigned EltBitOffset =
-          TD->getStructLayout(STy)->getElementOffsetInBits(Idx);
-        
-        NewOffset += EltBitOffset;
-      }
-      ConvertUsesToScalar(GEP, NewAI, NewOffset);
+      // Compute the offset that this GEP adds to the pointer.
+      SmallVector<Value*, 8> Indices(GEP->op_begin()+1, GEP->op_end());
+      uint64_t GEPOffset = TD->getIndexedOffset(GEP->getOperand(0)->getType(),
+                                                &Indices[0], Indices.size());
+      ConvertUsesToScalar(GEP, NewAI, Offset+GEPOffset*8);
       GEP->eraseFromParent();
       continue;
     }
     
    assert(0 && "Unsupported operation!");
    abort();
  }
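To trace the new offset computation on the struct type from the test added
at the bottom of this commit: the GEP's constant indices resolve to a byte
offset via getIndexedOffset, and the recursion then continues in bits.

; %t = type { { i32, i16, i8, i8 } }
; getelementptr %t* %ALL, i32 0, i32 0, i32 1
;   getIndexedOffset(indices 0, 0, 1) = 4 bytes (past the leading i32)
;   recursive call: ConvertUsesToScalar(GEP, NewAI, Offset + 4*8)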
@@ -1455,28 +1323,20 @@ void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset) {
 /// single integer scalar, or when we are converting a "vector union" to a
 /// vector with insert/extractelement instructions.
 ///
-/// Offset is an offset from the original alloca, in bits that need to be
+/// Offset is an offset from the original alloca, in bytes that need to be
 /// shifted to the right.  By the end of this, there should be no uses of Ptr.
 Value *SROA::ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
-                                       unsigned Offset) {
+                                       uint64_t Offset) {
   // The load is a bit extract from NewAI shifted right by Offset bits.
   Value *NV = new LoadInst(NewAI, LI->getName(), LI);
   
-  if (NV->getType() == LI->getType() && Offset == 0) {
-    // We win, no conversion needed.
+  // If the load is of the whole new alloca, no conversion is needed.
+  if (NV->getType() == LI->getType() && Offset == 0)
     return NV;
-  }
-
-  // If the result type of the 'union' is a pointer, then this must be ptr->ptr
-  // cast.  Anything else would result in NV being an integer.
-  if (isa<PointerType>(NV->getType())) {
-    assert(isa<PointerType>(LI->getType()));
-    return new BitCastInst(NV, LI->getType(), LI->getName(), LI);
-  }
   
+  // If the result alloca is a vector type, this is either an element
+  // access or a bitcast to another vector type of the same size.
   if (const VectorType *VTy = dyn_cast<VectorType>(NV->getType())) {
-    // If the result alloca is a vector type, this is either an element
-    // access or a bitcast to another vector type.
     if (isa<VectorType>(LI->getType()))
       return new BitCastInst(NV, LI->getType(), LI->getName(), LI);
@@ -1485,16 +1345,14 @@ Value *SROA::ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
     if (Offset) {
       unsigned EltSize = TD->getTypePaddedSizeInBits(VTy->getElementType());
       Elt = Offset/EltSize;
-      Offset -= EltSize*Elt;
+      assert(EltSize*Elt == Offset && "Invalid modulus in validity checking");
     }
-    NV = new ExtractElementInst(NV, ConstantInt::get(Type::Int32Ty, Elt),
-                                "tmp", LI);
-    
-    // If we're done, return this element.
-    if (NV->getType() == LI->getType() && Offset == 0)
-      return NV;
+    // Return the element extracted out of it.
+    return new ExtractElementInst(NV, ConstantInt::get(Type::Int32Ty, Elt),
+                                  "tmp", LI);
   }
 
+  // Otherwise, this must be a union that was converted to an integer value.
   const IntegerType *NTy = cast<IntegerType>(NV->getType());
 
   // If this is a big-endian system and the load is narrower than the
@@ -1514,12 +1372,12 @@ Value *SROA::ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
   // We do this to support (f.e.) loads off the end of a structure where
   // only some bits are used.
   if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
     NV = BinaryOperator::CreateLShr(NV, 
-                                    ConstantInt::get(NV->getType(),ShAmt),
+                                    ConstantInt::get(NV->getType(), ShAmt),
                                     LI->getName(), LI);
   else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
     NV = BinaryOperator::CreateShl(NV, 
-                                   ConstantInt::get(NV->getType(),-ShAmt),
+                                   ConstantInt::get(NV->getType(), -ShAmt),
                                    LI->getName(), LI);
 
   // Finally, unconditionally truncate the integer to the right width.
@@ -1531,7 +1389,8 @@ Value *SROA::ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
   // If the result is an integer, this is a trunc or bitcast.
   if (isa<IntegerType>(LI->getType())) {
     // Should be done.
-  } else if (LI->getType()->isFloatingPoint()) {
+  } else if (LI->getType()->isFloatingPoint() ||
+             isa<VectorType>(LI->getType())) {
     // Just do a bitcast, we know the sizes match up.
     NV = new BitCastInst(NV, LI->getType(), LI->getName(), LI);
   } else {
@@ -1552,15 +1411,17 @@ Value *SROA::ConvertUsesOfLoadToScalar(LoadInst *LI, AllocaInst *NewAI,
 /// Offset is an offset from the original alloca, in bits that need to be
 /// shifted to the right.  By the end of this, there should be no uses of Ptr.
 Value *SROA::ConvertUsesOfStoreToScalar(StoreInst *SI, AllocaInst *NewAI,
-                                        unsigned Offset) {
+                                        uint64_t Offset) {
   
   // Convert the stored type to the actual type, shift it left to insert
   // then 'or' into place.
   Value *SV = SI->getOperand(0);
   const Type *AllocaType = NewAI->getType()->getElementType();
   if (SV->getType() == AllocaType && Offset == 0) {
-    // All is well.
-  } else if (const VectorType *PTy = dyn_cast<VectorType>(AllocaType)) {
+    return SV;
+  }
+  
+  if (const VectorType *VTy = dyn_cast<VectorType>(AllocaType)) {
     Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI);
 
     // If the result alloca is a vector type, this is either an element
@@ -1569,72 +1430,68 @@ Value *SROA::ConvertUsesOfStoreToScalar(StoreInst *SI, AllocaInst *NewAI,
       SV = new BitCastInst(SV, AllocaType, SV->getName(), SI);
     } else {
       // Must be an element insertion.
-      unsigned Elt = Offset/TD->getTypePaddedSizeInBits(PTy->getElementType());
+      unsigned Elt = Offset/TD->getTypePaddedSizeInBits(VTy->getElementType());
       SV = InsertElementInst::Create(Old, SV,
                                      ConstantInt::get(Type::Int32Ty, Elt),
                                      "tmp", SI);
     }
-  } else if (isa<PointerType>(AllocaType)) {
-    // If the alloca type is a pointer, then all the elements must be
-    // pointers.
-    if (SV->getType() != AllocaType)
-      SV = new BitCastInst(SV, AllocaType, SV->getName(), SI);
-  } else {
-    Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI);
-
-    // If SV is a float, convert it to the appropriate integer type.
-    // If it is a pointer, do the same, and also handle ptr->ptr casts
-    // here.
-    unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType());
-    unsigned DestWidth = TD->getTypeSizeInBits(AllocaType);
-    unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType());
-    unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType);
-    if (SV->getType()->isFloatingPoint())
-      SV = new BitCastInst(SV, IntegerType::get(SrcWidth),
-                           SV->getName(), SI);
-    else if (isa<PointerType>(SV->getType()))
-      SV = new PtrToIntInst(SV, TD->getIntPtrType(), SV->getName(), SI);
-
-    // Always zero extend the value if needed.
-    if (SV->getType() != AllocaType)
-      SV = new ZExtInst(SV, AllocaType, SV->getName(), SI);
-
-    // If this is a big-endian system and the store is narrower than the
-    // full alloca type, we need to do a shift to get the right bits.
-    int ShAmt = 0;
-    if (TD->isBigEndian()) {
-      // On big-endian machines, the lowest bit is stored at the bit offset
-      // from the pointer given by getTypeStoreSizeInBits.  This matters for
-      // integers with a bitwidth that is not a multiple of 8.
-      ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
-    } else {
-      ShAmt = Offset;
-    }
-
-    // Note: we support negative bitwidths (with shr) which are not defined.
-    // We do this to support (f.e.) stores off the end of a structure where
-    // only some bits in the structure are set.
-    APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
-    if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
-      SV = BinaryOperator::CreateShl(SV, 
-                                     ConstantInt::get(SV->getType(), ShAmt),
-                                     SV->getName(), SI);
-      Mask <<= ShAmt;
-    } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
-      SV = BinaryOperator::CreateLShr(SV,
-                                      ConstantInt::get(SV->getType(),-ShAmt),
-                                      SV->getName(), SI);
-      Mask = Mask.lshr(ShAmt);
-    }
-
-    // Mask out the bits we are about to insert from the old value, and or
-    // in the new bits.
-    if (SrcWidth != DestWidth) {
-      assert(DestWidth > SrcWidth);
-      Old = BinaryOperator::CreateAnd(Old, ConstantInt::get(~Mask),
-                                      Old->getName()+".mask", SI);
-      SV = BinaryOperator::CreateOr(Old, SV, SV->getName()+".ins", SI);
-    }
-  }
+    return SV;
+  }
+  
+  Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI);
+  
+  // If SV is a float, convert it to the appropriate integer type.
+  // If it is a pointer, do the same, and also handle ptr->ptr casts
+  // here.
+  unsigned SrcWidth = TD->getTypeSizeInBits(SV->getType());
+  unsigned DestWidth = TD->getTypeSizeInBits(AllocaType);
+  unsigned SrcStoreWidth = TD->getTypeStoreSizeInBits(SV->getType());
+  unsigned DestStoreWidth = TD->getTypeStoreSizeInBits(AllocaType);
+  if (SV->getType()->isFloatingPoint() || isa<VectorType>(SV->getType()))
+    SV = new BitCastInst(SV, IntegerType::get(SrcWidth), SV->getName(), SI);
+  else if (isa<PointerType>(SV->getType()))
+    SV = new PtrToIntInst(SV, TD->getIntPtrType(), SV->getName(), SI);
+  
+  // Always zero extend the value if needed.
+  if (SV->getType() != AllocaType)
+    SV = new ZExtInst(SV, AllocaType, SV->getName(), SI);
+  
+  // If this is a big-endian system and the store is narrower than the
+  // full alloca type, we need to do a shift to get the right bits.
+  int ShAmt = 0;
+  if (TD->isBigEndian()) {
+    // On big-endian machines, the lowest bit is stored at the bit offset
+    // from the pointer given by getTypeStoreSizeInBits.  This matters for
+    // integers with a bitwidth that is not a multiple of 8.
+    ShAmt = DestStoreWidth - SrcStoreWidth - Offset;
+  } else {
+    ShAmt = Offset;
+  }
+  
+  // Note: we support negative bitwidths (with shr) which are not defined.
+  // We do this to support (f.e.) stores off the end of a structure where
+  // only some bits in the structure are set.
+  APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
+  if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
+    SV = BinaryOperator::CreateShl(SV, 
+                                   ConstantInt::get(SV->getType(), ShAmt),
+                                   SV->getName(), SI);
+    Mask <<= ShAmt;
+  } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
+    SV = BinaryOperator::CreateLShr(SV,
+                                    ConstantInt::get(SV->getType(),-ShAmt),
+                                    SV->getName(), SI);
+    Mask = Mask.lshr(ShAmt);
+  }
+  
+  // Mask out the bits we are about to insert from the old value, and or
+  // in the new bits.
+  if (SrcWidth != DestWidth) {
+    assert(DestWidth > SrcWidth);
+    Old = BinaryOperator::CreateAnd(Old, ConstantInt::get(~Mask),
+                                    Old->getName()+".mask", SI);
+    SV = BinaryOperator::CreateOr(Old, SV, SV->getName()+".ins", SI);
+  }
   return SV;
 }
@@ -1,9 +1,8 @@
-; RUN: llvm-as < %s | opt -scalarrepl | llvm-dis | \
-; RUN:   grep alloca | grep {4 x}
+; RUN: llvm-as < %s | opt -scalarrepl | llvm-dis | grep {ret i32 undef}
 
-; Test that an array is not incorrectly deconstructed...
+; Test that an array is not incorrectly deconstructed.
 
-define i32 @test() {
+define i32 @test() nounwind {
 	%X = alloca [4 x i32]		; <[4 x i32]*> [#uses=1]
 	%Y = getelementptr [4 x i32]* %X, i64 0, i64 0		; <i32*> [#uses=1]
 	; Must preserve arrayness!

@@ -1,7 +1,6 @@
-; RUN: llvm-as < %s | opt -scalarrepl | llvm-dis | \
-; RUN:   grep -F {alloca \[2 x <4 x i32>\]}
+; RUN: llvm-as < %s | opt -scalarrepl | llvm-dis | not grep alloca
 
-define i32 @func(<4 x float> %v0, <4 x float> %v1) {
+define i32 @func(<4 x float> %v0, <4 x float> %v1) nounwind {
 	%vsiidx = alloca [2 x <4 x i32>], align 16		; <[2 x <4 x i32>]*> [#uses=3]
 	%tmp = call <4 x i32> @llvm.x86.sse2.cvttps2dq( <4 x float> %v0 )		; <<4 x i32>> [#uses=2]
 	%tmp.upgrd.1 = bitcast <4 x i32> %tmp to <2 x i64>		; <<2 x i64>> [#uses=0]

@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | opt -scalarrepl -mem2reg | llvm-dis | grep alloca
+; RUN: llvm-as < %s | opt -scalarrepl -instcombine | llvm-dis | grep {ret i32 0}
 
 define i32 @test() {
 	%X = alloca [4 x i32]		; <[4 x i32]*> [#uses=1]

@@ -0,0 +1,16 @@
+; RUN: llvm-as < %s | opt -scalarrepl | llvm-dis | not grep alloca
+; rdar://6532315
+%t = type { { i32, i16, i8, i8 } }
+
+define i8 @foo(i64 %A) {
+	%ALL = alloca %t, align 8
+	%tmp59172 = bitcast %t* %ALL to i64*
+	store i64 %A, i64* %tmp59172, align 8
+	%C = getelementptr %t* %ALL, i32 0, i32 0, i32 1
+	%D = bitcast i16* %C to i32*
+	%E = load i32* %D, align 4
+	%F = bitcast %t* %ALL to i8*
+	%G = load i8* %F, align 8
+	ret i8 %G
+}
+