diff --git a/llvm/.circleci/config.yml b/llvm/.circleci/config.yml new file mode 100644 index 00000000000000..ad6dfc714d34e1 --- /dev/null +++ b/llvm/.circleci/config.yml @@ -0,0 +1,30 @@ +version: 2 +jobs: + build: + resource_class: xlarge + docker: + - image: wsmoses/tapir:latest + + steps: + - checkout + - run: + name: submodules + command: | + git submodule sync + git submodule update --init --recursive + - run: + name: cmake + command: | + mkdir build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 + - run: + name: make + command: | + cd build + make -j2 + - run: + name: test + command: | + cd build + make check-all diff --git a/llvm/.gitignore b/llvm/.gitignore index be58944c9b3a1a..ba3de277d5e2f7 100644 --- a/llvm/.gitignore +++ b/llvm/.gitignore @@ -80,3 +80,8 @@ docs/_build #==============================================================================# bindings/go/llvm/llvm_config.go bindings/go/llvm/workdir + +build/* +build +build-debug/* +build-debug diff --git a/llvm/898/sync_elimination_pfor_mb/main.c b/llvm/898/sync_elimination_pfor_mb/main.c new file mode 100644 index 00000000000000..b5e0ebc4281ad5 --- /dev/null +++ b/llvm/898/sync_elimination_pfor_mb/main.c @@ -0,0 +1,35 @@ +#include + +#include + +#define N 100000000 + +__attribute__((always_inline)) +int f(int x) { + return x * x; +} + +__attribute__((always_inline)) +int g(int x) { + return x + 3; +} + +int r1[N]; +int r2[N]; + +int main(void) +{ + int sum = 0; + + cilk_for (int i=0; i class detachaccess_def_iterator_base; +using detachaccess_def_iterator = detachaccess_def_iterator_base; +using const_detachaccess_def_iterator = + detachaccess_def_iterator_base; + +// \brief The base for all detach accesses, i.e., detaches (defs) and syncs +// (uses). +class DetachAccess + : public DerivedUser, + public ilist_node>, + public ilist_node> { +public: + using AllAccessType = + ilist_node>; + using DefsOnlyType = + ilist_node>; + + // Methods for support type inquiry through isa, cast, and + // dyn_cast + static inline bool classof(const Value *V) { + unsigned ID = V->getValueID(); + return ID == DetachUseVal || ID == DetachPhiVal || ID == DetachDefVal; + } + + DetachAccess(const DetachAccess &) = delete; + DetachAccess &operator=(const DetachAccess &) = delete; + + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + BasicBlock *getBlock() const { return Block; } + + void print(raw_ostream &OS) const; + void dump() const; + + /// \brief The user iterators for a detach access + typedef user_iterator iterator; + typedef const_user_iterator const_iterator; + + /// \brief This iterator walks over all of the defs in a given + /// DetachAccess. For DetachPhi nodes, this walks arguments. For + /// DetachUse/DetachDef, this walks the defining access. + detachaccess_def_iterator defs_begin(); + const_detachaccess_def_iterator defs_begin() const; + detachaccess_def_iterator defs_end(); + const_detachaccess_def_iterator defs_end() const; + + /// \brief Get the iterators for the all access list and the defs only list + /// We default to the all access list. 
+ AllAccessType::self_iterator getIterator() { + return this->AllAccessType::getIterator(); + } + AllAccessType::const_self_iterator getIterator() const { + return this->AllAccessType::getIterator(); + } + AllAccessType::reverse_self_iterator getReverseIterator() { + return this->AllAccessType::getReverseIterator(); + } + AllAccessType::const_reverse_self_iterator getReverseIterator() const { + return this->AllAccessType::getReverseIterator(); + } + DefsOnlyType::self_iterator getDefsIterator() { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::const_self_iterator getDefsIterator() const { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::reverse_self_iterator getReverseDefsIterator() { + return this->DefsOnlyType::getReverseIterator(); + } + DefsOnlyType::const_reverse_self_iterator getReverseDefsIterator() const { + return this->DefsOnlyType::getReverseIterator(); + } + +protected: + friend class DetachSSA; + friend class DetachUseOrDef; + friend class DetachUse; + friend class DetachDef; + friend class DetachPhi; + + /// \brief Used by DetachSSA to change the block of a DetachAccess when it is + /// moved. + void setBlock(BasicBlock *BB) { Block = BB; } + + /// \brief Used for debugging and tracking things about DetachAccesses. + /// Guaranteed unique among DetachAccesses, no guarantees otherwise. + inline unsigned getID() const; + + DetachAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue, + BasicBlock *BB, unsigned NumOperands) + : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue), + Block(BB) {} + +private: + BasicBlock *Block; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DetachAccess &DA) { + DA.print(OS); + return OS; +} + +/// \brief Class that has the common methods + fields of detach uses/defs. It's +/// a little awkward to have, but there are many cases where we want either a +/// use or def, and there are many cases where uses are needed (defs aren't +/// acceptable), and vice-versa. +/// +/// This class should never be instantiated directly; make a DetachUse or +/// DetachDef instead. +class DetachUseOrDef : public DetachAccess { +public: + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + /// \brief Get the instruction that this DetachAccess represents. + Instruction *getDAInst() const { return DAInst; } + + /// \brief Get the access that produces the detach state used by this Use. + DetachAccess *getDefiningAccess() const { return getOperand(0); } + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal || DA->getValueID() == DetachDefVal; + } + + // Sadly, these have to be public because they are needed in some of the + // iterators. + inline bool isOptimized() const; + inline DetachAccess *getOptimized() const; + inline void setOptimized(DetachAccess *); + + /// \brief Reset the ID of what this DetachUse was optimized to, causing it to + /// be rewalked by the walker if necessary. + /// This really should only be called by tests. 
+ inline void resetOptimized(); + +protected: + friend class DetachSSA; + DetachUseOrDef(LLVMContext &C, DetachAccess *DDA, unsigned Vty, + DeleteValueTy DeleteValue, Instruction *TI, BasicBlock *BB) + : DetachAccess(C, Vty, DeleteValue, BB, 1), DAInst(TI) { + setDefiningAccess(DDA); + } + void setDefiningAccess(DetachAccess *DDA, bool Optimized = false) { + if (!Optimized) { + setOperand(0, DDA); + return; + } + setOptimized(DDA); + } + +private: + Instruction *DAInst; +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUseOrDef, DetachAccess) + +/// \brief Represents a detach use, i.e., a sync instruction. +class DetachUse final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachUse(LLVMContext &C, DetachAccess *DDA, Instruction *SI, BasicBlock *BB) + : DetachUseOrDef(C, DDA, DetachUseVal, deleteMe, SI, BB), + OptimizedID(0) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal; + } + + void print(raw_ostream &OS) const; + + void setOptimized(DetachAccess *DDA) { + OptimizedID = DDA->getID(); + setOperand(0, DDA); + } + + bool isOptimized() const { + return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID(); + } + + DetachAccess *getOptimized() const { + return getDefiningAccess(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + +protected: + friend class DetachSSA; + +private: + static void deleteMe(DerivedUser *Self); + + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUse, DetachAccess) + +/// \brief Represents a detach definition, i.e., a detach. +class DetachDef final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachDef(LLVMContext &C, DetachAccess *DDA, Instruction *DI, BasicBlock *BB, + unsigned Ver) + : DetachUseOrDef(C, DDA, DetachDefVal, deleteMe, DI, BB), + ID(Ver), Optimized(nullptr), OptimizedID(INVALID_DETACHACCESS_ID) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachDefVal; + } + + void setOptimized(DetachAccess *DA) { + Optimized = DA; + OptimizedID = getDefiningAccess()->getID(); + } + DetachAccess *getOptimized() const { return Optimized; } + bool isOptimized() const { + return getOptimized() && getDefiningAccess() && + OptimizedID == getDefiningAccess()->getID(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + + void print(raw_ostream &OS) const; + + friend class DetachSSA; + + unsigned getID() const { return ID; } + +private: + static void deleteMe(DerivedUser *Self); + + const unsigned ID; + DetachAccess *Optimized; + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachDef, DetachAccess) + +/// \brief Represents phi nodes for detach accesses. +/// +/// These have the same semantics as regular phi nodes, with the exception that +/// only one phi will ever exist in a given basic block. 
+/// Guaranteeing one phi per block means guaranteeing there is only ever one +/// valid reaching DetachDef/DetachPHI along each path to the phi node. +/// This is ensured by not allowing disambiguation of the RHS of a DetachDef or +/// a DetachPhi's operands. +class DetachPhi final : public DetachAccess { + // allocate space for exactly zero operands + void *operator new(size_t s) { return User::operator new(s); } + +public: + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0) + : DetachAccess(C, DetachPhiVal, deleteMe, BB, 0), ID(Ver), + ReservedSpace(NumPreds) { + allocHungoffUses(ReservedSpace); + } + + void *operator new(size_t, unsigned) = delete; + + // Block iterator interface. This provides access to the list of incoming + // basic blocks, which parallels the list of incoming values. + typedef BasicBlock **block_iterator; + typedef BasicBlock *const *const_block_iterator; + + block_iterator block_begin() { + auto *Ref = reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + const_block_iterator block_begin() const { + const auto *Ref = + reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + block_iterator block_end() { return block_begin() + getNumOperands(); } + + const_block_iterator block_end() const { + return block_begin() + getNumOperands(); + } + + iterator_range blocks() { + return make_range(block_begin(), block_end()); + } + + iterator_range blocks() const { + return make_range(block_begin(), block_end()); + } + + op_range incoming_values() { return operands(); } + + const_op_range incoming_values() const { return operands(); } + + /// \brief Return the number of incoming edges + unsigned getNumIncomingValues() const { return getNumOperands(); } + + /// \brief Return incoming value number x + DetachAccess *getIncomingValue(unsigned I) const { return getOperand(I); } + void setIncomingValue(unsigned I, DetachAccess *V) { + assert(V && "PHI node got a null value!"); + setOperand(I, V); + } + static unsigned getOperandNumForIncomingValue(unsigned I) { return I; } + static unsigned getIncomingValueNumForOperand(unsigned I) { return I; } + + /// \brief Return incoming basic block number @p i. + BasicBlock *getIncomingBlock(unsigned I) const { return block_begin()[I]; } + + /// \brief Return incoming basic block corresponding + /// to an operand of the PHI. + BasicBlock *getIncomingBlock(const Use &U) const { + assert(this == U.getUser() && "Iterator doesn't point to PHI's Uses?"); + return getIncomingBlock(unsigned(&U - op_begin())); + } + + /// \brief Return incoming basic block corresponding + /// to value use iterator. + BasicBlock *getIncomingBlock(DetachAccess::const_user_iterator I) const { + return getIncomingBlock(I.getUse()); + } + + void setIncomingBlock(unsigned I, BasicBlock *BB) { + assert(BB && "PHI node got a null basic block!"); + block_begin()[I] = BB; + } + + /// \brief Add an incoming value to the end of the PHI list + void addIncoming(DetachAccess *V, BasicBlock *BB) { + if (getNumOperands() == ReservedSpace) + growOperands(); // Get more space! + // Initialize some new operands. + setNumHungOffUseOperands(getNumOperands() + 1); + setIncomingValue(getNumOperands() - 1, V); + setIncomingBlock(getNumOperands() - 1, BB); + } + + /// \brief Return the first index of the specified basic + /// block in the value list for this PHI. Returns -1 if no instance. 
+ int getBasicBlockIndex(const BasicBlock *BB) const { + for (unsigned I = 0, E = getNumOperands(); I != E; ++I) + if (block_begin()[I] == BB) + return I; + return -1; + } + + Value *getIncomingValueForBlock(const BasicBlock *BB) const { + int Idx = getBasicBlockIndex(BB); + assert(Idx >= 0 && "Invalid basic block argument!"); + return getIncomingValue(Idx); + } + + static inline bool classof(const Value *V) { + return V->getValueID() == DetachPhiVal; + } + + void print(raw_ostream &OS) const; + + unsigned getID() const { return ID; } + +protected: + friend class DetachSSA; + + /// \brief this is more complicated than the generic + /// User::allocHungoffUses, because we have to allocate Uses for the incoming + /// values and pointers to the incoming blocks, all in one allocation. + void allocHungoffUses(unsigned N) { + User::allocHungoffUses(N, /* IsPhi */ true); + } + +private: + // For debugging only + const unsigned ID; + unsigned ReservedSpace; + + /// \brief This grows the operand list in response to a push_back style of + /// operation. This grows the number of ops by 1.5 times. + void growOperands() { + unsigned E = getNumOperands(); + // 2 op PHI nodes are VERY common, so reserve at least enough for that. + ReservedSpace = std::max(E + E / 2, 2u); + growHungoffUses(ReservedSpace, /* IsPhi */ true); + } + + static void deleteMe(DerivedUser *Self); +}; + +inline unsigned DetachAccess::getID() const { + assert((isa(this) || isa(this)) && + "only detach defs and phis have ids"); + if (const auto *DD = dyn_cast(this)) + return DD->getID(); + return cast(this)->getID(); +} + +inline bool DetachUseOrDef::isOptimized() const { + if (const auto *DD = dyn_cast(this)) + return DD->isOptimized(); + return cast(this)->isOptimized(); +} + +inline DetachAccess *DetachUseOrDef::getOptimized() const { + if (const auto *DD = dyn_cast(this)) + return DD->getOptimized(); + return cast(this)->getOptimized(); +} + +inline void DetachUseOrDef::setOptimized(DetachAccess *DA) { + if (auto *DD = dyn_cast(this)) + DD->setOptimized(DA); + else + cast(this)->setOptimized(DA); +} + +inline void DetachUseOrDef::resetOptimized() { + if (auto *DD = dyn_cast(this)) + DD->resetOptimized(); + else + cast(this)->resetOptimized(); +} + + +template <> struct OperandTraits : public HungoffOperandTraits<2> {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachPhi, DetachAccess) + + +/// \brief Encapsulates DetachSSA, including all data associated with detach +/// accesses. +class DetachSSA { +public: + DetachSSA(Function &, DominatorTree *); + ~DetachSSA(); + + /// \brief Given a detach Mod/Ref'ing instruction, get the DetachSSA + /// access associated with it. If passed a basic block gets the detach phi + /// node that exists for that block, if there is one. Otherwise, this will get + /// a DetachUseOrDef. + DetachUseOrDef *getDetachAccess(const Instruction *) const; + DetachPhi *getDetachAccess(const BasicBlock *BB) const; + + void dump() const; + void print(raw_ostream &) const; + + /// \brief Return true if \p MA represents the live on entry value + inline bool isLiveOnEntryDef(const DetachAccess *DA) const { + return DA == LiveOnEntryDef.get(); + } + + inline DetachAccess *getLiveOnEntryDef() const { + return LiveOnEntryDef.get(); + } + + // Sadly, iplists, by default, owns and deletes pointers added to the + // list. It's not currently possible to have two iplists for the same type, + // where one owns the pointers, and one does not. This is because the traits + // are per-type, not per-tag. 
If this ever changes, we should make the + // DefList an iplist. + using AccessList = iplist>; + using DefsList = + simple_ilist>; + + /// \brief Return the list of MemoryAccess's for a given basic block. + /// + /// This list is not modifiable by the user. + const AccessList *getBlockAccesses(const BasicBlock *BB) const { + return getWritableBlockAccesses(BB); + } + + /// \brief Return the list of MemoryDef's and MemoryPhi's for a given basic + /// block. + /// + /// This list is not modifiable by the user. + const DefsList *getBlockDefs(const BasicBlock *BB) const { + return getWritableBlockDefs(BB); + } + + /// \brief Given two detach accesses in the same basic block, determine + /// whether DetachAccess \p A dominates DetachAccess \p B. + bool locallyDominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given two detach accesses in potentially different blocks, + /// determine whether DetachAccess \p A dominates DetachAccess \p B. + bool dominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given a DetachAccess and a Use, determine whether DetachAccess \p A + /// dominates Use \p B. + bool dominates(const DetachAccess *A, const Use &B) const; + + /// \brief Verify that DetachSSA is self consistent (IE definitions dominate + /// all uses, uses appear in the right places). This is used by unit tests. + void verifyDetachSSA() const; + + /// Used in various insertion functions to specify whether we are talking + /// about the beginning or end of a block. + enum InsertionPlace { Beginning, End }; + +protected: + // Used by Detach SSA annotater, dumpers, and wrapper pass + friend class DetachSSAAnnotatedWriter; + friend class DetachSSAPrinterLegacyPass; + + void verifyDefUses(Function &F) const; + void verifyDomination(Function &F) const; + void verifyOrdering(Function &F) const; + + AccessList *getWritableBlockAccesses(const BasicBlock *BB) const { + auto It = PerBlockAccesses.find(BB); + return It == PerBlockAccesses.end() ? nullptr : It->second.get(); + } + + DefsList *getWritableBlockDefs(const BasicBlock *BB) const { + auto It = PerBlockDefs.find(BB); + return It == PerBlockDefs.end() ? nullptr : It->second.get(); + } + + void moveTo(DetachUseOrDef *What, BasicBlock *BB, AccessList::iterator Where); + void moveTo(DetachUseOrDef *What, BasicBlock *BB, InsertionPlace Point); + // Rename the dominator tree branch rooted at BB. 
+ void renamePass(BasicBlock *BB, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited) { + renamePass(DT->getNode(BB), IncomingVal, Visited, true, true); + } + void removeFromLookups(DetachAccess *); + void removeFromLists(DetachAccess *, bool ShouldDelete = true); + void insertIntoListsForBlock(DetachAccess *, const BasicBlock *, + InsertionPlace); + void insertIntoListsBefore(DetachAccess *, const BasicBlock *, + AccessList::iterator); + // DetachUseOrDef *createDefinedAccess(Instruction *, DetachAccess *); + +private: + // class CachingWalker; + + // CachingWalker *getWalkerImpl(); + void buildDetachSSA(); + + void verifyUseInDefs(DetachAccess *, DetachAccess *) const; + using AccessMap = DenseMap>; + using DefsMap = DenseMap>; + + void + determineInsertionPoint(const SmallPtrSetImpl &DefiningBlocks); + void markUnreachableAsLiveOnEntry(BasicBlock *BB); + bool dominatesUse(const DetachAccess *, const DetachAccess *) const; + DetachPhi *createDetachPhi(BasicBlock *BB); + // DetachUseOrDef *createNewAccess(Instruction *); + DetachAccess *findDominatingDef(BasicBlock *, enum InsertionPlace); + void placePHINodes(const SmallPtrSetImpl &, + const DenseMap &); + DetachAccess *renameBlock(BasicBlock *, DetachAccess *, bool); + void renameSuccessorPhis(BasicBlock *, DetachAccess *, bool); + void renamePass(DomTreeNode *, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited = false, bool RenameAllUses = false); + AccessList *getOrCreateAccessList(const BasicBlock *); + DefsList *getOrCreateDefsList(const BasicBlock *); + void renumberBlock(const BasicBlock *) const; + DominatorTree *DT; + Function &F; + + // Detach SSA mappings + DenseMap ValueToDetachAccess; + // These two mappings contain the main block to access/def mappings for + // DetachSSA. The list contained in PerBlockAccesses really owns all the + // DetachAccesses. + // Both maps maintain the invariant that if a block is found in them, the + // corresponding list is not empty, and if a block is not found in them, the + // corresponding list is empty. + AccessMap PerBlockAccesses; + DefsMap PerBlockDefs; + std::unique_ptr LiveOnEntryDef; + + // Domination mappings + // Note that the numbering is local to a block, even though the map is + // global. + mutable SmallPtrSet BlockNumberingValid; + mutable DenseMap BlockNumbering; + + // Memory SSA building info + // std::unique_ptr Walker; + unsigned NextID; +}; + +// This pass does eager building and then printing of DetachSSA. It is used by +// the tests to be able to build, dump, and verify Detach SSA. +class DetachSSAPrinterLegacyPass : public FunctionPass { +public: + DetachSSAPrinterLegacyPass(); + + bool runOnFunction(Function &) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + static char ID; +}; + +/// An analysis that produces \c DetachSSA for a function. +/// +class DetachSSAAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + + static AnalysisKey Key; + +public: + // Wrap DetachSSA result to ensure address stability of internal DetachSSA + // pointers after construction. Use a wrapper class instead of plain + // unique_ptr to avoid build breakage on MSVC. + struct Result { + Result(std::unique_ptr &&DSSA) : DSSA(std::move(DSSA)) {} + DetachSSA &getDSSA() { return *DSSA.get(); } + + std::unique_ptr DSSA; + }; + + Result run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Printer pass for \c DetachSSA. 
+class DetachSSAPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit DetachSSAPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Verifier pass for \c DetachSSA. +struct DetachSSAVerifierPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Legacy analysis pass which computes \c DetachSSA. +class DetachSSAWrapperPass : public FunctionPass { +public: + DetachSSAWrapperPass(); + + static char ID; + + bool runOnFunction(Function &) override; + void releaseMemory() override; + DetachSSA &getDSSA() { return *DSSA; } + const DetachSSA &getDSSA() const { return *DSSA; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + void verifyAnalysis() const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + +private: + std::unique_ptr DSSA; +}; + +/// \brief Iterator base class used to implement const and non-const iterators +/// over the defining accesses of a DetachAccess. +template +class detachaccess_def_iterator_base + : public iterator_facade_base, + std::forward_iterator_tag, T, ptrdiff_t, T *, + T *> { + using BaseT = typename detachaccess_def_iterator_base::iterator_facade_base; + +public: + detachaccess_def_iterator_base(T *Start) : Access(Start) {} + detachaccess_def_iterator_base() = default; + + bool operator==(const detachaccess_def_iterator_base &Other) const { + return Access == Other.Access && (!Access || ArgNo == Other.ArgNo); + } + + // This is a bit ugly, but for DetachPHI's, unlike PHINodes, you can't get the + // block from the operand in constant time (In a PHINode, the uselist has + // both, so it's just subtraction). We provide it as part of the + // iterator to avoid callers having to linear walk to get the block. + // If the operation becomes constant time on DetachPHI's, this bit of + // abstraction breaking should be removed. + BasicBlock *getPhiArgBlock() const { + DetachPhi *DP = dyn_cast(Access); + assert(DP && "Tried to get phi arg block when not iterating over a PHI"); + return DP->getIncomingBlock(ArgNo); + } + typename BaseT::iterator::pointer operator*() const { + assert(Access && "Tried to access past the end of our iterator"); + // Go to the first argument for phis, and the defining access for everything + // else. + if (DetachPhi *DP = dyn_cast(Access)) + return DP->getIncomingValue(ArgNo); + return cast(Access)->getDefiningAccess(); + } + using BaseT::operator++; + detachaccess_def_iterator &operator++() { + assert(Access && "Hit end of iterator"); + if (DetachPhi *DP = dyn_cast(Access)) { + if (++ArgNo >= DP->getNumIncomingValues()) { + ArgNo = 0; + Access = nullptr; + } + } else { + Access = nullptr; + } + return *this; + } + +private: + T *Access = nullptr; + unsigned ArgNo = 0; +}; + +inline detachaccess_def_iterator DetachAccess::defs_begin() { + return detachaccess_def_iterator(this); +} + +inline const_detachaccess_def_iterator DetachAccess::defs_begin() const { + return const_detachaccess_def_iterator(this); +} + +inline detachaccess_def_iterator DetachAccess::defs_end() { + return detachaccess_def_iterator(); +} + +inline const_detachaccess_def_iterator DetachAccess::defs_end() const { + return const_detachaccess_def_iterator(); +} + +/// \brief GraphTraits for a DetachAccess, which walks defs in the normal case, +/// and uses in the inverse case. 
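To make the intended use of this interface concrete, here is a minimal client sketch; it is illustrative only and not part of the patch. It assumes the usual LLVM headers and a function F under analysis, and the pass name InspectDetachSSA is invented for the example.

struct InspectDetachSSA : PassInfoMixin<InspectDetachSSA> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // Retrieve DetachSSA through the new-pass-manager analysis declared above.
    DetachSSA &DSSA = AM.getResult<DetachSSAAnalysis>(F).getDSSA();
    for (BasicBlock &BB : F) {
      // At most one DetachPhi exists per block; list its incoming blocks.
      if (DetachPhi *Phi = DSSA.getDetachAccess(&BB))
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
          errs() << "detach phi edge from "
                 << Phi->getIncomingBlock(I)->getName() << "\n";
      for (Instruction &I : BB)
        if (DetachUseOrDef *Acc = DSSA.getDetachAccess(&I))
          // A use or def whose defining access is live-on-entry is not
          // reached by any detach in this function.
          if (DSSA.isLiveOnEntryDef(Acc->getDefiningAccess()))
            errs() << I << " sees no prior detach\n";
    }
    return PreservedAnalyses::all();
  }
};

The GraphTraits specializations that follow let the standard graph utilities (for example depth_first) walk chains of defining accesses in the same way.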
+template <> struct GraphTraits { + using NodeRef = DetachAccess *; + using ChildIteratorType = detachaccess_def_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { return N->defs_begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->defs_end(); } +}; + +template <> struct GraphTraits> { + using NodeRef = DetachAccess *; + using ChildIteratorType = DetachAccess::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { return N->user_begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->user_end(); } +}; + +} // End namespace llvm + +#endif // LLVM_ANALYSIS_DETACHSSA_H diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index f0d11e9c16894e..c53ca11aaae3ea 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -534,6 +534,9 @@ enum FunctionCodes { // 54 is unused. FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...] FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] + FUNC_CODE_INST_DETACH = 57, // DETACH: [bb#, bb#] + FUNC_CODE_INST_REATTACH = 58, // REATTACH + FUNC_CODE_INST_SYNC = 59, // SYNC: [bb#] }; enum UseListCodes { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index d1770bf6e4cead..3c9c9d4f76be62 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -282,6 +282,12 @@ class IRTranslator : public MachineFunctionPass { bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateDetach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateReattach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateSync(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 99eac33f742ec2..bcab517887c790 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -382,6 +382,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// /// Also note that this doesn't preserve any passes. To split blocks while /// keeping loop information consistent, use the SplitBlock utility function. + BasicBlock *splitBasicBlockWithTerminator(const Twine &BBName = ""); BasicBlock *splitBasicBlock(iterator I, const Twine &BBName = ""); BasicBlock *splitBasicBlock(Instruction *I, const Twine &BBName = "") { return splitBasicBlock(I->getIterator(), BBName); diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 9526d6287d2f83..8c56973b517d6c 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -245,6 +245,10 @@ class StructType : public CompositeType { /// Create an empty structure type. static StructType *get(LLVMContext &Context, bool isPacked = false); + /// Try to lookup a structure type by name, and create one if one does not + /// exist. + static StructType *getOrCreate(LLVMContext &Context, StringRef Name); + /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. 
Note that this method always returns /// a non-packed struct, and requires at least one element type. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index fac2ff46c4531a..d5746c86329fd4 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -979,6 +979,26 @@ class IRBuilder : public IRBuilderBase, public Inserter { return Insert(new UnreachableInst(Context)); } + /// \brief Create a detach instruction, 'detach within SyncRegion, Detached, + // Continue'. + DetachInst *CreateDetach(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, MDNode *BranchWeights = nullptr) { + return Insert(addBranchMetadata(DetachInst::Create(Detached, Continue, + SyncRegion), + BranchWeights, nullptr)); + } + + /// \brief Create a reattach instruction, 'reattach within SyncRegion, + /// DetachContinue'. + ReattachInst *CreateReattach(BasicBlock *DetachContinue, Value *SyncRegion) { + return Insert(ReattachInst::Create(DetachContinue, SyncRegion)); + } + + /// \brief Create a sync instruction, 'sync within SyncRegion, Continue'. + SyncInst *CreateSync(BasicBlock *Continue, Value *SyncRegion) { + return Insert(SyncInst::Create(Continue, SyncRegion)); + } + //===--------------------------------------------------------------------===// // Instruction creation methods: Binary Operators //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index c5b4c6f71d7d8e..f068b39f959215 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -166,6 +166,18 @@ class InstVisitor { // Specific Instruction type classes... note that all of the casts are // necessary because we use the instruction classes as opaque types... // + RetTy visitReturnInst(ReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitBranchInst(BranchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSwitchInst(SwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitIndirectBrInst(IndirectBrInst &I) { DELEGATE(TerminatorInst);} + RetTy visitResumeInst(ResumeInst &I) { DELEGATE(TerminatorInst);} + RetTy visitUnreachableInst(UnreachableInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCatchReturnInst(CatchReturnInst &I) { DELEGATE(TerminatorInst); } + RetTy visitCatchSwitchInst(CatchSwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitDetachInst(DetachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitReattachInst(ReattachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSyncInst(SyncInst &I) { DELEGATE(TerminatorInst);} RetTy visitICmpInst(ICmpInst &I) { DELEGATE(CmpInst);} RetTy visitFCmpInst(FCmpInst &I) { DELEGATE(CmpInst);} RetTy visitAllocaInst(AllocaInst &I) { DELEGATE(UnaryInstruction);} diff --git a/llvm/include/llvm/IR/Instruction.def b/llvm/include/llvm/IR/Instruction.def index 58e4e2e1d6cc50..80297e4c7ab603 100644 --- a/llvm/include/llvm/IR/Instruction.def +++ b/llvm/include/llvm/IR/Instruction.def @@ -135,89 +135,92 @@ HANDLE_TERM_INST ( 7, Unreachable , UnreachableInst) HANDLE_TERM_INST ( 8, CleanupRet , CleanupReturnInst) HANDLE_TERM_INST ( 9, CatchRet , CatchReturnInst) HANDLE_TERM_INST (10, CatchSwitch , CatchSwitchInst) - LAST_TERM_INST (10) +HANDLE_TERM_INST (11, Detach , DetachInst) +HANDLE_TERM_INST (12, Reattach , ReattachInst) +HANDLE_TERM_INST (13, Sync , SyncInst) + LAST_TERM_INST (13) // Standard unary operators... 
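As a quick illustration of the InstVisitor hooks added above, a visitor can now dispatch on the Tapir terminators directly. This is an illustrative sketch, not part of the patch; the visitor name and counters are invented for the example.

// Count the Tapir terminators in a function.
struct TapirTermCounter : public InstVisitor<TapirTermCounter> {
  unsigned Detaches = 0, Reattaches = 0, Syncs = 0;
  void visitDetachInst(DetachInst &DI) { ++Detaches; }
  void visitReattachInst(ReattachInst &RI) { ++Reattaches; }
  void visitSyncInst(SyncInst &SI) { ++Syncs; }
};

// Usage:
//   TapirTermCounter Counter;
//   Counter.visit(F);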
- FIRST_UNARY_INST(11)
-HANDLE_UNARY_INST(11, FNeg , UnaryOperator)
- LAST_UNARY_INST(11)
+ FIRST_UNARY_INST(14)
+HANDLE_UNARY_INST(14, FNeg , UnaryOperator)
+ LAST_UNARY_INST(14)

// Standard binary operators...
- FIRST_BINARY_INST(12)
-HANDLE_BINARY_INST(12, Add , BinaryOperator)
-HANDLE_BINARY_INST(13, FAdd , BinaryOperator)
-HANDLE_BINARY_INST(14, Sub , BinaryOperator)
-HANDLE_BINARY_INST(15, FSub , BinaryOperator)
-HANDLE_BINARY_INST(16, Mul , BinaryOperator)
-HANDLE_BINARY_INST(17, FMul , BinaryOperator)
-HANDLE_BINARY_INST(18, UDiv , BinaryOperator)
-HANDLE_BINARY_INST(19, SDiv , BinaryOperator)
-HANDLE_BINARY_INST(20, FDiv , BinaryOperator)
-HANDLE_BINARY_INST(21, URem , BinaryOperator)
-HANDLE_BINARY_INST(22, SRem , BinaryOperator)
-HANDLE_BINARY_INST(23, FRem , BinaryOperator)
+ FIRST_BINARY_INST(15)
+HANDLE_BINARY_INST(15, Add , BinaryOperator)
+HANDLE_BINARY_INST(16, FAdd , BinaryOperator)
+HANDLE_BINARY_INST(17, Sub , BinaryOperator)
+HANDLE_BINARY_INST(18, FSub , BinaryOperator)
+HANDLE_BINARY_INST(19, Mul , BinaryOperator)
+HANDLE_BINARY_INST(20, FMul , BinaryOperator)
+HANDLE_BINARY_INST(21, UDiv , BinaryOperator)
+HANDLE_BINARY_INST(22, SDiv , BinaryOperator)
+HANDLE_BINARY_INST(23, FDiv , BinaryOperator)
+HANDLE_BINARY_INST(24, URem , BinaryOperator)
+HANDLE_BINARY_INST(25, SRem , BinaryOperator)
+HANDLE_BINARY_INST(26, FRem , BinaryOperator)

// Logical operators (integer operands)
-HANDLE_BINARY_INST(24, Shl , BinaryOperator) // Shift left (logical)
-HANDLE_BINARY_INST(25, LShr , BinaryOperator) // Shift right (logical)
-HANDLE_BINARY_INST(26, AShr , BinaryOperator) // Shift right (arithmetic)
-HANDLE_BINARY_INST(27, And , BinaryOperator)
-HANDLE_BINARY_INST(28, Or , BinaryOperator)
-HANDLE_BINARY_INST(29, Xor , BinaryOperator)
- LAST_BINARY_INST(29)
+HANDLE_BINARY_INST(27, Shl , BinaryOperator) // Shift left (logical)
+HANDLE_BINARY_INST(28, LShr , BinaryOperator) // Shift right (logical)
+HANDLE_BINARY_INST(29, AShr , BinaryOperator) // Shift right (arithmetic)
+HANDLE_BINARY_INST(30, And , BinaryOperator)
+HANDLE_BINARY_INST(31, Or , BinaryOperator)
+HANDLE_BINARY_INST(32, Xor , BinaryOperator)
+ LAST_BINARY_INST(32)

// Memory operators...
- FIRST_MEMORY_INST(30)
-HANDLE_MEMORY_INST(30, Alloca, AllocaInst) // Stack management
-HANDLE_MEMORY_INST(31, Load , LoadInst ) // Memory manipulation instrs
-HANDLE_MEMORY_INST(32, Store , StoreInst )
-HANDLE_MEMORY_INST(33, GetElementPtr, GetElementPtrInst)
-HANDLE_MEMORY_INST(34, Fence , FenceInst )
-HANDLE_MEMORY_INST(35, AtomicCmpXchg , AtomicCmpXchgInst )
-HANDLE_MEMORY_INST(36, AtomicRMW , AtomicRMWInst )
- LAST_MEMORY_INST(36)
+ FIRST_MEMORY_INST(33)
+HANDLE_MEMORY_INST(33, Alloca, AllocaInst) // Stack management
+HANDLE_MEMORY_INST(34, Load , LoadInst ) // Memory manipulation instrs
+HANDLE_MEMORY_INST(35, Store , StoreInst )
+HANDLE_MEMORY_INST(36, GetElementPtr, GetElementPtrInst)
+HANDLE_MEMORY_INST(37, Fence , FenceInst )
+HANDLE_MEMORY_INST(38, AtomicCmpXchg , AtomicCmpXchgInst )
+HANDLE_MEMORY_INST(39, AtomicRMW , AtomicRMWInst )
+ LAST_MEMORY_INST(39)

// Cast operators ...
// NOTE: The order matters here because CastInst::isEliminableCastPair
// NOTE: (see Instructions.cpp) encodes a table based on this ordering.
- FIRST_CAST_INST(37)
-HANDLE_CAST_INST(37, Trunc , TruncInst ) // Truncate integers
-HANDLE_CAST_INST(38, ZExt , ZExtInst ) // Zero extend integers
-HANDLE_CAST_INST(39, SExt , SExtInst ) // Sign extend integers
-HANDLE_CAST_INST(40, FPToUI , FPToUIInst ) // floating point -> UInt
-HANDLE_CAST_INST(41, FPToSI , FPToSIInst ) // floating point -> SInt
-HANDLE_CAST_INST(42, UIToFP , UIToFPInst ) // UInt -> floating point
-HANDLE_CAST_INST(43, SIToFP , SIToFPInst ) // SInt -> floating point
-HANDLE_CAST_INST(44, FPTrunc , FPTruncInst ) // Truncate floating point
-HANDLE_CAST_INST(45, FPExt , FPExtInst ) // Extend floating point
-HANDLE_CAST_INST(46, PtrToInt, PtrToIntInst) // Pointer -> Integer
-HANDLE_CAST_INST(47, IntToPtr, IntToPtrInst) // Integer -> Pointer
-HANDLE_CAST_INST(48, BitCast , BitCastInst ) // Type cast
-HANDLE_CAST_INST(49, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast
- LAST_CAST_INST(49)
-
- FIRST_FUNCLETPAD_INST(50)
-HANDLE_FUNCLETPAD_INST(50, CleanupPad, CleanupPadInst)
-HANDLE_FUNCLETPAD_INST(51, CatchPad , CatchPadInst)
- LAST_FUNCLETPAD_INST(51)
+ FIRST_CAST_INST(40)
+HANDLE_CAST_INST(40, Trunc , TruncInst ) // Truncate integers
+HANDLE_CAST_INST(41, ZExt , ZExtInst ) // Zero extend integers
+HANDLE_CAST_INST(42, SExt , SExtInst ) // Sign extend integers
+HANDLE_CAST_INST(43, FPToUI , FPToUIInst ) // floating point -> UInt
+HANDLE_CAST_INST(44, FPToSI , FPToSIInst ) // floating point -> SInt
+HANDLE_CAST_INST(45, UIToFP , UIToFPInst ) // UInt -> floating point
+HANDLE_CAST_INST(46, SIToFP , SIToFPInst ) // SInt -> floating point
+HANDLE_CAST_INST(47, FPTrunc , FPTruncInst ) // Truncate floating point
+HANDLE_CAST_INST(48, FPExt , FPExtInst ) // Extend floating point
+HANDLE_CAST_INST(49, PtrToInt, PtrToIntInst) // Pointer -> Integer
+HANDLE_CAST_INST(50, IntToPtr, IntToPtrInst) // Integer -> Pointer
+HANDLE_CAST_INST(51, BitCast , BitCastInst ) // Type cast
+HANDLE_CAST_INST(52, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast
+ LAST_CAST_INST(52)
+
+ FIRST_FUNCLETPAD_INST(53)
+HANDLE_FUNCLETPAD_INST(53, CleanupPad, CleanupPadInst)
+HANDLE_FUNCLETPAD_INST(54, CatchPad , CatchPadInst)
+ LAST_FUNCLETPAD_INST(54)

// Other operators...
- FIRST_OTHER_INST(52)
-HANDLE_OTHER_INST(52, ICmp , ICmpInst ) // Integer comparison instruction
-HANDLE_OTHER_INST(53, FCmp , FCmpInst ) // Floating point comparison instr.
-HANDLE_OTHER_INST(54, PHI , PHINode ) // PHI node instruction
-HANDLE_OTHER_INST(55, Call , CallInst ) // Call a function
-HANDLE_OTHER_INST(56, Select , SelectInst ) // select instruction
-HANDLE_USER_INST (57, UserOp1, Instruction) // May be used internally in a pass
-HANDLE_USER_INST (58, UserOp2, Instruction) // Internal to passes only
-HANDLE_OTHER_INST(59, VAArg , VAArgInst ) // vaarg instruction
-HANDLE_OTHER_INST(60, ExtractElement, ExtractElementInst)// extract from vector
-HANDLE_OTHER_INST(61, InsertElement, InsertElementInst) // insert into vector
-HANDLE_OTHER_INST(62, ShuffleVector, ShuffleVectorInst) // shuffle two vectors.
-HANDLE_OTHER_INST(63, ExtractValue, ExtractValueInst)// extract from aggregate
-HANDLE_OTHER_INST(64, InsertValue, InsertValueInst) // insert into aggregate
-HANDLE_OTHER_INST(65, LandingPad, LandingPadInst) // Landing pad instruction.
- LAST_OTHER_INST(65)
+ FIRST_OTHER_INST(55)
+HANDLE_OTHER_INST(55, ICmp , ICmpInst ) // Integer comparison instruction
+HANDLE_OTHER_INST(56, FCmp , FCmpInst ) // Floating point comparison instr.
+HANDLE_OTHER_INST(57, PHI , PHINode ) // PHI node instruction
+HANDLE_OTHER_INST(58, Call , CallInst ) // Call a function
+HANDLE_OTHER_INST(59, Select , SelectInst ) // select instruction
+HANDLE_USER_INST (60, UserOp1, Instruction) // May be used internally in a pass
+HANDLE_USER_INST (61, UserOp2, Instruction) // Internal to passes only
+HANDLE_OTHER_INST(62, VAArg , VAArgInst ) // vaarg instruction
+HANDLE_OTHER_INST(63, ExtractElement, ExtractElementInst)// extract from vector
+HANDLE_OTHER_INST(64, InsertElement, InsertElementInst) // insert into vector
+HANDLE_OTHER_INST(65, ShuffleVector, ShuffleVectorInst) // shuffle two vectors.
+HANDLE_OTHER_INST(66, ExtractValue, ExtractValueInst)// extract from aggregate
+HANDLE_OTHER_INST(67, InsertValue, InsertValueInst) // insert into aggregate
+HANDLE_OTHER_INST(68, LandingPad, LandingPadInst) // Landing pad instruction.
+ LAST_OTHER_INST(68)

#undef FIRST_TERM_INST
#undef HANDLE_TERM_INST
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 5e78cb1edf02b1..f2f161730c7084 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -545,6 +545,7 @@ class Instruction : public User,
    // This list should be kept in sync with the list in mayWriteToMemory for
    // all opcodes which don't have a memory location.
    case Instruction::Fence:
+    case Instruction::Sync: // Like Instruction::Fence
    case Instruction::CatchPad:
    case Instruction::CatchRet:
    case Instruction::Call:
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 0ff8f56f213ad0..5557cb4fdae59c 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -4403,6 +4403,255 @@ class UnreachableInst : public Instruction {
  }
};

+//===----------------------------------------------------------------------===//
+// DetachInst Class
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------------
+/// DetachInst - Detach instruction
+///
+class DetachInst : public TerminatorInst {
+  /// Ops list - The operands are ordered: Detached, Continue.
+  DetachInst(const DetachInst &DI);
+  void AssertOK();
+  // DetachInst constructors (where {D, C} are blocks and SR is a token):
+  //  DetachInst(BB *D, BB *C, Value *SR)          - 'detach SR, D, C'
+  //  DetachInst(BB *D, BB *C, Value *SR, Inst *I)
+  //    - 'detach SR, D, C', insert before I
+  //  DetachInst(BB *D, BB *C, Value *SR, BB *I)
+  //    - 'detach SR, D, C', insert at end
+  DetachInst(BasicBlock *Detached, BasicBlock *Continue,
+             Value *SyncRegion,
+             Instruction *InsertBefore = nullptr);
+  DetachInst(BasicBlock *Detached, BasicBlock *Continue,
+             Value *SyncRegion,
+             BasicBlock *InsertAtEnd);
+protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+ friend class Instruction; + DetachInst *cloneImpl() const; + +public: + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertBefore); + } + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertAtEnd); + } + + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-3>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-3>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 2; } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for detach!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for detach!"); + *(&Op<-1>() - idx) = (Value*)NewSucc; + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return (I->getOpcode() == Instruction::Detach); + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + inline BasicBlock* getDetached() const { return getSuccessor(0); } + inline BasicBlock* getContinue() const { return getSuccessor(1); } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachInst, Value) + +//===----------------------------------------------------------------------===// +// ReattachInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// ReattachInst - Reattach instruction. This instruction terminates +/// a subCFG and has no successors. The DetachContinue field +/// maintains the continue block after the detach instruction +/// corresponding to this reattach. +/// +class ReattachInst : public TerminatorInst { + ReattachInst(const ReattachInst &RI); + void AssertOK(); + // ReattachInst constructors (where C is a block and SR is a token): + // ReattachInst(BB *C, Value *SR) - 'reattach SR, C' + // ReattachInst(BB *C, Value *SR, Inst *I) - 'reattach SR, C', insert before I + // ReattachInst(BB *C, Value *SR, BB *I) - 'reattach SR, C', insert at end + explicit ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + ReattachInst *cloneImpl() const; + +public: + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertBefore); + } + + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. 
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-2>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-2>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + BasicBlock *getDetachContinue() const { + return cast_or_null((&Op<-1>())->get()); + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Reattach; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for reattach!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && + "Successor # out of range for reattach!"); + *(&Op<-1>() - idx) = NewSucc; + } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReattachInst, Value) + +//===----------------------------------------------------------------------===// +// SyncInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// SyncInst - Sync instruction. +/// +class SyncInst : public TerminatorInst { + /// Ops list - A sync looks like an unconditional branch to its continuation. + SyncInst(const SyncInst &SI); + void AssertOK(); + // SyncInst constructor (where C is a block and SR is a token): + // SyncInst(BB *C, Value *SR) - 'sync SR, C' + // SyncInst(BB *C, Value *SR, Inst *I) - 'sync SR, C' insert before I + // SyncInst(BB *C, Value *SR, BB *I) - 'sync SR, C' insert at end + explicit SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + SyncInst *cloneImpl() const; + +public: + static SyncInst *Create(BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) SyncInst(Continue, SyncRegion, InsertBefore); + } + static SyncInst *Create(BasicBlock *Continue, + Value *SyncRegion, BasicBlock *InsertAtEnd) { + return new(2) SyncInst(Continue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. 
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-2>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-2>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Sync; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for sync!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for sync!"); + *(&Op<-1>() - idx) = NewSucc; + } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SyncInst, Value) + //===----------------------------------------------------------------------===// // TruncInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 64603d8ea03091..0eedd5e98f83e9 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -952,6 +952,13 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly, ReadOnly<0>, NoCapture<0>]>; +///===-------------------------- Tapir Intrinsics --------------------------===// +// +def int_syncregion_start : Intrinsic<[llvm_token_ty], [], + [IntrArgMemOnly]>; + +def int_detached_rethrow : Intrinsic<[], [], [Throws]>; + ///===-------------------------- Other Intrinsics --------------------------===// // def int_flt_rounds : Intrinsic<[llvm_i32_ty]>, diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index e2ddba0aa1596e..22ca38793f5278 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -103,6 +103,10 @@ HANDLE_MEMORY_VALUE(MemoryUse) HANDLE_MEMORY_VALUE(MemoryDef) HANDLE_MEMORY_VALUE(MemoryPhi) +HANDLE_MEMORY_VALUE(DetachUse) +HANDLE_MEMORY_VALUE(DetachDef) +HANDLE_MEMORY_VALUE(DetachPhi) + HANDLE_INSTRUCTION(Instruction) // Enum values starting at InstructionVal are used for Instructions; // don't add new values here! diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 037c0dbb56ecec..3843050f205dfc 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -61,6 +61,9 @@ void initializeGlobalISel(PassRegistry&); /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the TapirOpts library. 
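Putting the new terminators, the IRBuilder helpers, and the llvm.syncregion.start intrinsic together, a lowering pass might emit a spawn skeleton roughly as follows. This is an illustrative sketch under assumptions (the block names, the enclosing function F, and the entry block are invented), not part of the patch.

// Emit: detach into %det, reattach to %cont, then sync before %sync.cont.
void emitSpawnSkeleton(Function &F, BasicBlock *Entry) {
  LLVMContext &Ctx = F.getContext();
  BasicBlock *Detached = BasicBlock::Create(Ctx, "det", &F);
  BasicBlock *Continue = BasicBlock::Create(Ctx, "cont", &F);
  BasicBlock *SyncCont = BasicBlock::Create(Ctx, "sync.cont", &F);

  IRBuilder<> B(Entry);
  // All detaches, reattaches, and syncs of one logical region share the
  // token produced by llvm.syncregion.start.
  Value *SR = B.CreateCall(
      Intrinsic::getDeclaration(F.getParent(), Intrinsic::syncregion_start));
  B.CreateDetach(Detached, Continue, SR);

  B.SetInsertPoint(Detached);
  // ... spawned work would be emitted here ...
  B.CreateReattach(Continue, SR);

  B.SetInsertPoint(Continue);
  B.CreateSync(SyncCont, SR);
  // ... code after the sync continues in SyncCont ...
}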
+void initializeTapirOpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); @@ -100,7 +103,9 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); +void initializeCilkSanitizerPass(PassRegistry&); void initializeCodeGenPreparePass(PassRegistry&); +void initializeComprehensiveStaticInstrumentationPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstantPropagationPass(PassRegistry&); @@ -119,6 +124,8 @@ void initializeDelinearizationPass(PassRegistry&); void initializeDemandedBitsWrapperPassPass(PassRegistry&); void initializeDependenceAnalysisPass(PassRegistry&); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); +void initializeDetachSSAPrinterLegacyPassPass(PassRegistry&); +void initializeDetachSSAWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); @@ -219,6 +226,7 @@ void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); +void initializeLoopFusePass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); @@ -230,6 +238,7 @@ void initializeLoopRerollPass(PassRegistry&); void initializeLoopRotateLegacyPassPass(PassRegistry&); void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); +void initializeLoopSpawningPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); void initializeLoopUnrollAndJamPass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); @@ -244,6 +253,7 @@ void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); void initializeLowerIntrinsicsPass(PassRegistry&); void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchPass(PassRegistry&); +void initializeLowerTapirToCilkPass(PassRegistry&); void initializeLowerTypeTestsPass(PassRegistry&); void initializeMIRCanonicalizerPass(PassRegistry &); void initializeMIRPrintingPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 0851c2f8d265bc..8564d42e5609d3 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -52,6 +52,7 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" @@ -131,6 +132,7 @@ namespace { (void) llvm::createLoopPredicationPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopSimplifyCFGPass(); + (void) llvm::createLoopSpawningPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); @@ -142,6 
+144,7 @@ namespace { (void) llvm::createLowerExpectIntrinsicPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); + (void) llvm::createLowerTapirToCilkPass(false,false); (void) llvm::createNaryReassociatePass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); @@ -221,6 +224,11 @@ namespace { (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); (void) llvm::createWarnMissedTransformationsPass(); + (void) llvm::createSmallBlockPass(); + (void) llvm::createRedundantSpawnPass(); + (void) llvm::createSpawnRestructurePass(); + (void) llvm::createSyncEliminationPass(); + (void) llvm::createSpawnUnswitchPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/llvm/include/llvm/Transforms/CSI.h b/llvm/include/llvm/Transforms/CSI.h new file mode 100644 index 00000000000000..a357324d013b3e --- /dev/null +++ b/llvm/include/llvm/Transforms/CSI.h @@ -0,0 +1,610 @@ +//===-- CSI.h ------------------------instrumentation hooks --*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: License +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_CSI_H +#define LLVM_TRANSFORMS_CSI_H + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +static const char *const CsiRtUnitInitName = "__csirt_unit_init"; +static const char *const CsiRtUnitCtorName = "csirt.unit_ctor"; +static const char *const CsiFunctionBaseIdName = "__csi_unit_func_base_id"; +static const char *const CsiFunctionExitBaseIdName = "__csi_unit_func_exit_base_id"; +static const char *const CsiBasicBlockBaseIdName = "__csi_unit_bb_base_id"; +static const char *const CsiCallsiteBaseIdName = "__csi_unit_callsite_base_id"; +static const char *const CsiLoadBaseIdName = "__csi_unit_load_base_id"; +static const char *const CsiStoreBaseIdName = "__csi_unit_store_base_id"; +static const char *const CsiUnitFedTableName = "__csi_unit_fed_table"; +static const char *const CsiFuncIdVariablePrefix = "__csi_func_id_"; +static const char *const CsiUnitFedTableArrayName = "__csi_unit_fed_tables"; +static const char *const CsiInitCallsiteToFunctionName = + "__csi_init_callsite_to_function"; +static const char *const CsiDisableInstrumentationName = + "__csi_disable_instrumentation"; + +static const int64_t CsiCallsiteUnknownTargetId = -1; +// See llvm/tools/clang/lib/CodeGen/CodeGenModule.h: +static const int CsiUnitCtorPriority = 65535; + +/// Maintains a mapping from CSI ID to static data for that ID. +class ForensicTable { +public: + ForensicTable() : BaseId(nullptr), IdCounter(0) {} + ForensicTable(Module &M, StringRef BaseIdName); + + /// The number of entries in this forensic table + uint64_t size() const { return IdCounter; } + + /// Get the local ID of the given Value. + uint64_t getId(const Value *V); + + /// The GlobalVariable holding the base ID for this forensic table. + GlobalVariable *baseId() const { return BaseId; } + + /// Converts a local to global ID conversion. 
+ /// + /// This is done by using the given IRBuilder to insert a load to the base ID + /// global variable followed by an add of the base value and the local ID. + /// + /// \returns A Value holding the global ID corresponding to the + /// given local ID. + Value *localToGlobalId(uint64_t LocalId, IRBuilder<> &IRB) const; + +protected: + /// The GlobalVariable holding the base ID for this FED table. + GlobalVariable *BaseId; + /// Counter of local IDs used so far. + uint64_t IdCounter; + /// Map of Value to Local ID. + DenseMap ValueToLocalIdMap; +}; + +/// Maintains a mapping from CSI ID to front-end data for that ID. +/// +/// The front-end data currently is the source location that a given +/// CSI ID corresponds to. +class FrontEndDataTable : public ForensicTable { +public: + FrontEndDataTable() : ForensicTable() {} + FrontEndDataTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this FED table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given Function to this FED table. + /// \returns The local ID of the Function. + uint64_t add(const Function &F); + + /// Add the given BasicBlock to this FED table. + /// \returns The local ID of the BasicBlock. + uint64_t add(const BasicBlock &BB); + + /// Add the given Instruction to this FED table. + /// \returns The local ID of the Instruction. + uint64_t add(const Instruction &I); + + /// Get the Type for a pointer to a FED table entry. + /// + /// A FED table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this FED table into the given Module. + /// + /// The FED table is constructed as a ConstantArray indexed by local + /// IDs. The runtime is responsible for performing the mapping that + /// allows the table to be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + int32_t Column; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the debug information to the table, assigning it the next + /// available ID. + /// + /// \returns The local ID of the appended information. + /// @{ + void add(uint64_t ID, const DILocation *Loc); + void add(uint64_t ID, const DISubprogram *Subprog); + /// @} + + /// Append the line and file information to the table, assigning it + /// the next available ID. + /// + /// \returns The new local ID of the DILocation. + void add(uint64_t ID, int32_t Line = -1, int32_t Column = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +/// Represents a property value passed to hooks. +class CsiProperty { +public: + CsiProperty() {} + + /// Return the coerced type of a property. + /// + /// TODO: Right now, this function simply returns a 64-bit integer. Although + /// this solution works for x86_64, it should be generalized to handle other + /// architectures in the future. + static Type *getCoercedType(LLVMContext &C, StructType *Ty) { + // Must match the definition of property type in csi.h + // return StructType::get(IntegerType::get(C, 64), + // nullptr); + // We return an integer type, rather than a struct type, to deal with x86_64 + // type coercion on struct bit fields. 
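+    // For illustration only: with the property layouts defined below, a
+    // function property with MaySpawn = 1 is passed to its hook as the i64
+    // constant 1, and a load/store property with Alignment = 8 and IsOnStack
+    // = 1 packs to 8 | (1 << 10) = 1032, assuming the low-to-high bit-field
+    // layout these unions already rely on.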
+ return IntegerType::get(C, 64); + } + + /// Return a constant value holding this property. + virtual Constant *getValueImpl(LLVMContext &C) const = 0; + + Constant *getValue(IRBuilder<> &IRB) const { + return getValueImpl(IRB.getContext()); + } +}; + +class CsiFuncProperty : public CsiProperty { +public: + CsiFuncProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiFuncExitProperty : public CsiProperty { +public: + CsiFuncExitProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiBBProperty : public CsiProperty { +public: + CsiBBProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. 
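+  /// For example, the property parameter of the basic-block hooks can be
+  /// declared as
+  ///
+  ///   Type *PropertyTy = CsiBBProperty::getType(C); // currently an i64
+  ///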
+ static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsLandingPad), + IntegerType::get(C, PropBits.IsEHPad), + IntegerType::get(C, PropBits.Padding))); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsLandingPad property. + void setIsLandingPad(bool v) { + PropValue.Fields.IsLandingPad = v; + } + + /// Set the value of the IsEHPad property. + void setIsEHPad(bool v) { + PropValue.Fields.IsEHPad = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsLandingPad : 1; + unsigned IsEHPad : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsLandingPad; + int IsEHPad; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, 1, (64-1-1) }; +}; + +class CsiCallProperty : public CsiProperty { +public: + CsiCallProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsIndirect), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.IsIndirect), + // PropValue.IsIndirect), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsIndirect property. + void setIsIndirect(bool v) { + PropValue.Fields.IsIndirect = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsIndirect : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsIndirect; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiLoadStoreProperty : public CsiProperty { +public: + CsiLoadStoreProperty() { + PropValue.Bits = 0; + } + /// Return the Type of a property. 
+ static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.Alignment), + IntegerType::get(C, PropBits.IsVtableAccess), + IntegerType::get(C, PropBits.IsConstant), + IntegerType::get(C, PropBits.IsOnStack), + IntegerType::get(C, PropBits.MayBeCaptured), + IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.Alignment), + // PropValue.Alignment), + // ConstantInt::get(IntegerType::get(C, PropBits.IsVtableAccess), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsConstant), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsOnStack), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.MayBeCaptured), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + // PropValue.LoadReadBeforeWriteInBB), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the Alignment property. + void setAlignment(char v) { + PropValue.Fields.Alignment = v; + } + /// Set the value of the IsVtableAccess property. + void setIsVtableAccess(bool v) { + PropValue.Fields.IsVtableAccess = v; + } + /// Set the value of the IsConstant property. + void setIsConstant(bool v) { + PropValue.Fields.IsConstant = v; + } + /// Set the value of the IsOnStack property. + void setIsOnStack(bool v) { + PropValue.Fields.IsOnStack = v; + } + /// Set the value of the MayBeCaptured property. + void setMayBeCaptured(bool v) { + PropValue.Fields.MayBeCaptured = v; + } + /// Set the value of the LoadReadBeforeWriteInBB property. + void setLoadReadBeforeWriteInBB(bool v) { + PropValue.Fields.LoadReadBeforeWriteInBB = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned Alignment : 8; + unsigned IsVtableAccess : 1; + unsigned IsConstant : 1; + unsigned IsOnStack : 1; + unsigned MayBeCaptured : 1; + unsigned LoadReadBeforeWriteInBB : 1; + uint64_t Padding : 53; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int Alignment; + int IsVtableAccess; + int IsConstant; + int IsOnStack; + int MayBeCaptured; + int LoadReadBeforeWriteInBB; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 8, 1, 1, 1, 1, 1, (64-8-1-1-1-1-1) }; +}; + +struct CSIImpl { +public: + CSIImpl(Module &M, CallGraph *CG, + const CSIOptions &Options = CSIOptions()) + : M(M), DL(M.getDataLayout()), CG(CG), Options(Options), + CsiFuncEntry(nullptr), CsiFuncExit(nullptr), CsiBBEntry(nullptr), + CsiBBExit(nullptr), CsiBeforeCallsite(nullptr), + CsiAfterCallsite(nullptr), CsiBeforeRead(nullptr), + CsiAfterRead(nullptr), CsiBeforeWrite(nullptr), CsiAfterWrite(nullptr), + MemmoveFn(nullptr), MemcpyFn(nullptr), MemsetFn(nullptr), + InitCallsiteToFunction(nullptr), RTUnitInit(nullptr) + {} + + bool run(); + + /// Get the number of bytes accessed via the given address. 
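+  /// A hypothetical use when instrumenting a store (SI and DL are
+  /// illustrative names for the store instruction and the module's
+  /// DataLayout):
+  ///
+  ///   int NumBytes = getNumBytesAccessed(SI->getPointerOperand(), DL);
+  ///   // e.g. 4 for a store of an i32 value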
+ static int getNumBytesAccessed(Value *Addr, const DataLayout &DL); + + /// Members to extract properties of loads/stores. + static bool isVtableAccess(Instruction *I); + static bool addrPointsToConstantData(Value *Addr); + static bool isAtomic(Instruction *I); + +protected: + /// Initialize the CSI pass. + void initializeCsi(); + /// Finalize the CSI pass. + void finalizeCsi(); + + /// Initialize llvm::Functions for the CSI hooks. + /// @{ + void initializeLoadStoreHooks(); + void initializeFuncHooks(); + void initializeBasicBlockHooks(); + void initializeCallsiteHooks(); + void initializeMemIntrinsicsHooks(); + /// @} + + static StructType *getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable); + /// Initialize the front-end data table structures. + void initializeFEDTables(); + /// Collect unit front-end data table structures for finalization. + void collectUnitFEDTables(); + + virtual CallInst *createRTUnitInitCall(IRBuilder<> &IRB); + + // Get the local ID of the given function. + uint64_t getLocalFunctionID(Function &F); + /// Generate a function that stores global function IDs into a set + /// of externally-visible global variables. + void generateInitCallsiteToFunction(); + + /// Compute CSI properties on the given ordered list of loads and stores. + void computeLoadAndStoreProperties( + SmallVectorImpl> + &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores, + const DataLayout &DL); + + /// Insert calls to the instrumentation hooks. + /// @{ + void addLoadStoreInstrumentation(Instruction *I, Function *BeforeFn, + Function *AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, + CsiLoadStoreProperty &Prop); + void instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop, + const DataLayout &DL); + void instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I); + void instrumentCallsite(Instruction *I); + void instrumentBasicBlock(BasicBlock &BB); + void instrumentFunction(Function &F); + /// @} + + /// Insert a conditional call to the given hook function before the + /// given instruction. The condition is based on the value of + /// __csi_disable_instrumentation. + void insertConditionalHookCall(Instruction *I, Function *HookFunction, + ArrayRef HookArgs); + + /// Return true if the given function should not be instrumented. 
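+  /// (This is assumed to exclude at least the CSI hooks and the generated
+  /// unit constructor, so the instrumentation never instruments itself.)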
+ bool shouldNotInstrumentFunction(Function &F); + + Module &M; + const DataLayout &DL; + CallGraph *CG; + CSIOptions Options; + + FrontEndDataTable FunctionFED, FunctionExitFED, BasicBlockFED, CallsiteFED, + LoadFED, StoreFED; + + SmallVector UnitFedTables; + + // Instrumentation hooks + Function *CsiFuncEntry, *CsiFuncExit; + Function *CsiBBEntry, *CsiBBExit; + Function *CsiBeforeCallsite, *CsiAfterCallsite; + Function *CsiBeforeRead, *CsiAfterRead; + Function *CsiBeforeWrite, *CsiAfterWrite; + + Function *MemmoveFn, *MemcpyFn, *MemsetFn; + Function *InitCallsiteToFunction; + // GlobalVariable *DisableInstrGV; + + // Runtime unit initialization + Function *RTUnitInit; + + Type *IntptrTy; + DenseMap FuncOffsetMap; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_CSI_H diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h index 276306f686ffac..34170aff4f44ff 100644 --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -113,8 +113,15 @@ class PassManagerBuilder { /// passes at the end of the main CallGraphSCC passes and before any /// function simplification passes run by CGPassManager. EP_CGSCCOptimizerLate, + + /// EP_TapirLate - This extension point allows adding passes just before + /// Tapir instructions are lowered to calls into a parallel runtime system. + EP_TapirLate, }; + /// Whether the Cilk Calls should be instrumented + bool InstrumentCilk; + /// The Optimization Level - Specify the basic optimization level. /// 0 = -O0, 1 = -O1, 2 = -O2, 3 = -O3 unsigned OptLevel; @@ -123,6 +130,12 @@ class PassManagerBuilder { /// 0 = none, 1 = -Os, 2 = -Oz unsigned SizeLevel; + /// The Pre-lowering to parallel runtime calls optimization level + /// 0 = -P0 = leave with detach instructions, 1 = no optimizations before conversion, 2 = optimize before conversion + unsigned ParallelLevel; + + bool Rhino; + /// LibraryInfo - Specifies information about the runtime library for the /// optimizer. If this is non-null, it is added to both the function and /// per-module pass pipeline. @@ -189,6 +202,7 @@ class PassManagerBuilder { void addPGOInstrPasses(legacy::PassManagerBase &MPM); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; + void prepopulateModulePassManager(legacy::PassManagerBase &MPM); public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 017cab0a7750df..78dca4e1ef0ffd 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -203,7 +203,26 @@ struct SanitizerCoverageOptions { ModulePass *createSanitizerCoverageModulePass( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()); -/// Calculate what to divide by to scale counts. 
+// Insert CilkSanitizer (Cilk determinacy race detection) instrumentation +ModulePass *createCilkSanitizerPass(); + +// Options for comprehensive static instrumentation +struct CSIOptions { + bool InstrumentFuncEntryExit = true; + bool InstrumentBasicBlocks = true; + bool InstrumentMemoryAccesses = true; + bool InstrumentCalls = true; + bool InstrumentAtomics = true; + bool InstrumentMemIntrinsics = true; + + CSIOptions() = default; +}; + +// Insert ComprehensiveStaticInstrumentation instrumentation +ModulePass *createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options = CSIOptions()); + +/// \brief Calculate what to divide by to scale counts. /// /// Given the maximum count, calculate a divisor that will scale all the /// weights to strictly less than std::numeric_limits::max(). diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 8fcf9296ba47c6..1808ba38ae7e8a 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -451,6 +451,12 @@ FunctionPass *createNaryReassociatePass(); // FunctionPass *createLoopDistributePass(); +//===----------------------------------------------------------------------===// +// +// LoopFuse - Fuse loops. +// +FunctionPass *createLoopFusePass(); + //===----------------------------------------------------------------------===// // // LoopLoadElimination - Perform loop-aware load elimination. diff --git a/llvm/include/llvm/Transforms/Scalar/LoopFuse.h b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h new file mode 100644 index 00000000000000..5b7011e3b432a5 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h @@ -0,0 +1,130 @@ +//===------------- LoopFuse.h - Loop Fusion Utility -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include + +namespace llvm { +/// \brief The pass class. +class LoopFuse : public FunctionPass { + +public: + // Kind of fusion made. + enum Kind { + NO_FUSION = 0, // Fusion was not made even to check dependence legality. + // This is when loops had failed basic structure checks. + REVERTED_FUSION, // Fusion was reverted due to failed dependence legality. + PURE_FUSION, // Fusion succeeded with removal of original loops. + VERSIONED_FUSION // Fusion succeeded with versioning due to runtime checks. + }; + +private: + // Analyses used. + LoopInfo *LI; + LoopAccessLegacyAnalysis *LAA; + DominatorTree *DT; + ScalarEvolution *SE; + + // FusionSwitcher - Branch instruction that controls switching between + // original and fused versions. 
This gets initialized to true when loops are + // multiversioned to check fusion legality. By default, it points to original + // version. + BranchInst *FusionSwitcher; + + Loop *FusedLoop; + + // LAI for FusedLoop. + const LoopAccessInfo *LAI; + + // Kind of fusion that happened. + Kind FusionKind = NO_FUSION; + + // CustomVMap: VMap of BBs for fused loop. The problem about having + // ValueToValueMapTy passed from a client is that it gets updated when the + // loops are removed based on fusion success and this is undesirable. Also + // a ValueToValueMapTy is used when both Values are present. So, only a + // normal llvm::Value* is maintained as map's value in contrast with + // ValueToValueMapTy's WeakVH. Clients can use this mapping as a VMap. + typedef std::map CustomVMap; + CustomVMap VMap; + + // Rewrite IncomingBlocks in PHIs of @Br's successor blocks from Br's parent + // to @To. + void RewritePHI(BranchInst *Br, BasicBlock *To); + + // Fuse loops - @L1 and @L2 and return the fused loop. + Loop *FuseLoops(Loop &L1, Loop &L2); + + // Legality and profitability checks. + bool DependenceLegal(Loop &L1, Loop &L2); + bool DefsUsedAcrossLoops(Loop &L1, Loop &L2); + bool IsLegalAndProfitable(Loop &L1, Loop &L2); + + // Removal routines based on fusion success. + void RemoveLoopCompletelyWithPreheader(Loop &L); + void RemoveFusionSwitcher(Loop &L); + + // Outside use updates. + void UpdateUsesOutsideLoop(Loop &L); + void AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock); + +public: + LoopFuse() : FunctionPass(ID) { + initializeLoopFusePass(*PassRegistry::getPassRegistry()); + } + + // Initialization interface when this pass is used as a utility. + LoopFuse(LoopInfo *_LI, LoopAccessLegacyAnalysis *_LAA, DominatorTree *_DT, + ScalarEvolution *_SE) + : FunctionPass(ID), LI(_LI), LAA(_LAA), DT(_DT), SE(_SE) {} + + Loop *getFusedLoop() { return FusedLoop; } + + const CustomVMap &getVMap() { return VMap; } + + unsigned getFusionKind() { return FusionKind; } + + // Interface; when this pass is used as a utility. + bool run(Loop &L1, Loop &L2); + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + + static char ID; +}; +} // anonymous namespace diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h index b36c6f492be12a..fcd43fad841f27 100644 --- a/llvm/include/llvm/Transforms/Scalar/SROA.h +++ b/llvm/include/llvm/Transforms/Scalar/SROA.h @@ -64,6 +64,7 @@ class SROALegacyPass; /// this form. By doing so, it will enable promotion of vector aggregates to /// SSA vector values. class SROA : public PassInfoMixin { + bool FunctionContainsDetach = false; LLVMContext *C = nullptr; DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; diff --git a/llvm/include/llvm/Transforms/Tapir.h b/llvm/include/llvm/Transforms/Tapir.h new file mode 100644 index 00000000000000..96626c283bf40c --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir.h @@ -0,0 +1,68 @@ +//===-- Tapir.h - Tapir Transformations -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the Tapir transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_H +#define LLVM_TRANSFORMS_TAPIR_H + +namespace llvm { +class Pass; +class ModulePass; +class FunctionPass; + +//===----------------------------------------------------------------------===// +// +// LoopSpawning - Create a loop spawning pass. +// +Pass *createLoopSpawningPass(); + +//===----------------------------------------------------------------------===// +// +// SmallBlock - Do SmallBlock Pass +// +FunctionPass *createSmallBlockPass(); + +//===----------------------------------------------------------------------===// +// +// SyncElimination - TODO +// +FunctionPass *createSyncEliminationPass(); + +//===----------------------------------------------------------------------===// +// +// RedundantSpawn - Do RedundantSpawn Pass +// +FunctionPass *createRedundantSpawnPass(); + +//===----------------------------------------------------------------------===// +// +// SpawnRestructure - Do SpawnRestructure Pass +// +FunctionPass *createSpawnRestructurePass(); + +//===----------------------------------------------------------------------===// +// +// SpawnUnswitch - Do SpawnUnswitch Pass +// +FunctionPass *createSpawnUnswitchPass(); + +//===----------------------------------------------------------------------===// +// +// LowerTapirToCilk - Lower Tapir instructions to calls into the Cilk runtime. +// +ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts = false, + bool Instrument = false); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/CilkABI.h b/llvm/include/llvm/Transforms/Tapir/CilkABI.h new file mode 100644 index 00000000000000..6c6bd7f4b21f51 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/CilkABI.h @@ -0,0 +1,368 @@ +//===- CilkABI.h - Interface to the Intel Cilk Plus runtime ----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface used to lower Tapir instructions (detach, +// reattach, and sync) into calls to the Intel Cilk Plus runtime (cilkrts).
+// +//===----------------------------------------------------------------------===// +#ifndef CILK_ABI_H_ +#define CILK_ABI_H_ + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include + +extern llvm::cl::opt fastCilk; + +namespace { + +typedef void *__CILK_JUMP_BUFFER[5]; + +struct __cilkrts_pedigree {}; +struct __cilkrts_stack_frame {}; +struct __cilkrts_worker {}; +struct global_state_t {}; + +enum { + __CILKRTS_ABI_VERSION = 1 +}; + +enum { + CILK_FRAME_STOLEN = 0x01, + CILK_FRAME_UNSYNCHED = 0x02, + CILK_FRAME_DETACHED = 0x04, + CILK_FRAME_EXCEPTION_PROBED = 0x08, + CILK_FRAME_EXCEPTING = 0x10, + CILK_FRAME_LAST = 0x80, + CILK_FRAME_EXITING = 0x0100, + CILK_FRAME_SUSPENDED = 0x8000, + CILK_FRAME_UNWINDING = 0x10000 +}; + +#define CILK_FRAME_VERSION (__CILKRTS_ABI_VERSION << 24) +#define CILK_FRAME_VERSION_MASK 0xFF000000 +#define CILK_FRAME_FLAGS_MASK 0x00FFFFFF +#define CILK_FRAME_VERSION_VALUE(_flags) (((_flags) & CILK_FRAME_VERSION_MASK) >> 24) +#define CILK_FRAME_MBZ (~ (CILK_FRAME_STOLEN | \ + CILK_FRAME_UNSYNCHED | \ + CILK_FRAME_DETACHED | \ + CILK_FRAME_EXCEPTION_PROBED | \ + CILK_FRAME_EXCEPTING | \ + CILK_FRAME_LAST | \ + CILK_FRAME_EXITING | \ + CILK_FRAME_SUSPENDED | \ + CILK_FRAME_UNWINDING | \ + CILK_FRAME_VERSION_MASK)) + + +typedef uint32_t cilk32_t; +typedef uint64_t cilk64_t; +typedef void (*__cilk_abi_f32_t)(void *data, cilk32_t low, cilk32_t high); +typedef void (*__cilk_abi_f64_t)(void *data, cilk64_t low, cilk64_t high); + +typedef void (__cilkrts_init)(); + +typedef void (__cilkrts_enter_frame_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_enter_frame_fast_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_leave_frame)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_rethrow)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_sync)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_detach)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_pop_frame)(__cilkrts_stack_frame *sf); +typedef int (__cilkrts_get_nworkers)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker_fast)(); +typedef __cilkrts_worker *(__cilkrts_bind_thread_1)(); + +typedef void (cilk_func)(__cilkrts_stack_frame *); + +typedef void (cilk_enter_begin)(uint32_t, __cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_helper_begin)(__cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_end)(__cilkrts_stack_frame *, void *); +typedef void (cilk_detach_begin)(__cilkrts_stack_frame *); +typedef void (cilk_detach_end)(); +typedef void (cilk_spawn_prepare)(__cilkrts_stack_frame *); +typedef void (cilk_spawn_or_continue)(int); +typedef void (cilk_sync_begin)(__cilkrts_stack_frame *); +typedef void (cilk_sync_end)(__cilkrts_stack_frame *); +typedef void 
(cilk_leave_begin)(__cilkrts_stack_frame *); +typedef void (cilk_leave_end)(); +typedef void (__cilkrts_cilk_for_32)(__cilk_abi_f32_t body, void *data, + cilk32_t count, int grain); +typedef void (__cilkrts_cilk_for_64)(__cilk_abi_f64_t body, void *data, + cilk64_t count, int grain); + +#define CILKRTS_FUNC(name, CGF) Get__cilkrts_##name(CGF) + +#define DEFAULT_GET_CILKRTS_FUNC(name) \ + static llvm::Function *Get__cilkrts_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "__cilkrts_"#name, \ + llvm::TypeBuilder<__cilkrts_##name, false>::get(M.getContext()) \ + )); \ + } + +//DEFAULT_GET_CILKRTS_FUNC(get_nworkers) +#pragma GCC diagnostic ignored "-Wunused-function" +static llvm::Function *Get__cilkrts_get_nworkers(llvm::Module& M) { + llvm::LLVMContext &C = M.getContext(); + llvm::AttributeList AL; + AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex, + llvm::Attribute::ReadNone); + // AL = AL.addAttribute(C, llvm::AttributeSet::FunctionIndex, + // llvm::Attribute::InaccessibleMemOnly); + AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex, + llvm::Attribute::NoUnwind); + llvm::Function *F = llvm::cast( + M.getOrInsertFunction( + "__cilkrts_get_nworkers", + llvm::TypeBuilder<__cilkrts_get_nworkers, false>::get(C), + AL)); + return F; +} + +// TODO: set up these CILKRTS and CILK_CSI functions in a cleaner +// way so we don't need these pragmas. +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(init) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(sync) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(rethrow) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(leave_frame) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(get_tls_worker) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(get_tls_worker_fast) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(bind_thread_1) + +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(cilk_for_32) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(cilk_for_64) + +#define CILK_CSI_FUNC(name, CGF) Get_cilk_##name(CGF) + +#define GET_CILK_CSI_FUNC(name) \ + static llvm::Function *Get_cilk_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "cilk_"#name, \ + llvm::TypeBuilder::get(M.getContext()) \ + )); \ + } + +#define GET_CILK_CSI_FUNC2(name) \ + static llvm::Function *Get_cilk_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "cilk_"#name, \ + llvm::TypeBuilder::get(M.getContext()) \ + )); \ + } + +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_helper_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_end) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(detach_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(detach_end) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC2(spawn_prepare) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC2(spawn_or_continue) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(sync_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(sync_end) +#pragma GCC diagnostic ignored "-Wunused-function" 
+GET_CILK_CSI_FUNC(leave_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(leave_end) + + typedef std::map TypeBuilderCache; + +} // namespace + +namespace llvm { + +/// Specializations of llvm::TypeBuilder for: +/// __cilkrts_pedigree, +/// __cilkrts_worker, +/// __cilkrts_stack_frame +template +class TypeBuilder<__cilkrts_pedigree, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + StructType *ExistingTy = StructType::getOrCreate(C, "struct.__cilkrts_pedigree"); + cache[&C] = ExistingTy; + StructType *NewTy = StructType::create(C); + NewTy->setBody( + TypeBuilder::get(C), // rank + TypeBuilder<__cilkrts_pedigree*, X>::get(C) // next + ); + if (ExistingTy->isOpaque()) + ExistingTy->setBody(NewTy->elements()); + else + assert(ExistingTy->isLayoutIdentical(NewTy) && + "Conflicting definition of tye struct.__cilkrts_pedigree"); + return ExistingTy; + } + enum { + rank, + next + }; +}; + +template +class TypeBuilder<__cilkrts_worker, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + // Try looking up this type by name. + StructType *Ty = StructType::getOrCreate(C, "struct.__cilkrts_worker"); + assert(Ty->isOpaque() && + "Conflicting definition of type struct.__cilkrts_worker."); + cache[&C] = Ty; + Ty->setBody( + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // tail + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // head + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // exc + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // protected_tail + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // ltq_limit + TypeBuilder::get(C), // self + TypeBuilder::get(C), // g + TypeBuilder::get(C), // l + TypeBuilder::get(C), // reducer_map + TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // current_stack_frame + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // saved_protected_tail + TypeBuilder::get(C), // sysdep + TypeBuilder<__cilkrts_pedigree, X>::get(C) // pedigree + ); + return Ty; + } + enum { + tail, + head, + exc, + protected_tail, + ltq_limit, + self, + g, + l, + reducer_map, + current_stack_frame, + saved_protected_tail, + sysdep, + pedigree + }; +}; + +template +class TypeBuilder<__cilkrts_stack_frame, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + StructType *Ty = StructType::create(C, "struct.__cilkrts_stack_frame"); + cache[&C] = Ty; + Ty->setBody( + TypeBuilder::get(C), // flags + TypeBuilder::get(C), // size + TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // call_parent + TypeBuilder<__cilkrts_worker*, X>::get(C), // worker + TypeBuilder::get(C), // except_data + TypeBuilder<__CILK_JUMP_BUFFER, X>::get(C), // ctx + TypeBuilder::get(C), // mxcsr + TypeBuilder::get(C), // fpcsr + TypeBuilder::get(C), // reserved + TypeBuilder<__cilkrts_pedigree, X>::get(C) // parent_pedigree + ); + return Ty; + } + enum { + flags, + size, + call_parent, + worker, + except_data, + ctx, + mxcsr, + fpcsr, + reserved, + parent_pedigree + }; +}; + +} // namespace llvm + + +//////////////////////////////////////////////////////////////////////////////// + +namespace llvm { +namespace cilk { + +Value *GetOrCreateWorker8(Function &F); +void createSync(SyncInst &inst, 
ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument = false); + +bool verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + bool error = true); + +bool populateDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + SmallPtrSetImpl &functionPieces, + SmallVectorImpl &reattachB, + SmallPtrSetImpl &ExitBlocks, + bool replace, bool error = true); + +Function *extractDetachBodyToFunction(DetachInst &Detach, + DominatorTree &DT, AssumptionCache &AC, + CallInst **call = nullptr); + +Function *createDetach(DetachInst &Detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC, + bool instrument = false); + +} // end of cilk namespace +} // end of llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h new file mode 100644 index 00000000000000..df6718c99418c1 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -0,0 +1,37 @@ +//===---- LoopSpawning.h ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass modifies Tapir loops to spawn their iterations efficiently. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H +#define LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// The LoopSpawning Pass. +struct LoopSpawningPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} + +#endif // LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H diff --git a/llvm/include/llvm/Transforms/Tapir/Outline.h b/llvm/include/llvm/Transforms/Tapir/Outline.h new file mode 100644 index 00000000000000..a11ef83007556d --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/Outline.h @@ -0,0 +1,88 @@ +//===- llvm/Transforms/Tapir/Outline.h - Outlining for Tapir -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines helper functions for outlining portions of code containing +// Tapir instructions. 
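+//
+// For example (a sketch; the names here are illustrative), a caller that
+// wants to know which values a detached task captures can do:
+//
+//   ValueSet Inputs, Outputs;
+//   findInputsOutputs(TaskBlocks, Inputs, Outputs, &ExitBlocks);
+//
+// Inputs then holds the values that an outlined helper must take as
+// arguments.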
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_OUTLINE_H +#define LLVM_TRANSFORMS_TAPIR_OUTLINE_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +namespace llvm { + +typedef SetVector ValueSet; + +/// Find the inputs and outputs for a function outlined from the gives set of +/// basic blocks. +void findInputsOutputs(const SmallPtrSetImpl &Blocks, + ValueSet &Inputs, + ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks = + nullptr); + +/// Clone Blocks into NewFunc, transforming the old arguments into references to +/// VMap values. +/// +/// TODO: Fix the std::vector part of the type of this function. +void CloneIntoFunction(Function *NewFunc, const Function *OldFunc, + std::vector Blocks, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks = nullptr, + DISubprogram *SP = nullptr, + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// TODO: Fix the std::vector part of the type of this function. +Function *CreateHelper(const ValueSet &Inputs, + const ValueSet &Outputs, + std::vector Blocks, + BasicBlock *Header, + const BasicBlock *OldEntry, + const BasicBlock *OldExit, + ValueToValueMapTy &VMap, + Module *DestM, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks = nullptr, + const Instruction *InputSyncRegion = nullptr, + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. 
+void AddAlignmentAssumptions(const Function *Caller, + const ValueSet &Inputs, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, + DominatorTree *DT); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index 5b16a2c0d0b1a3..4bc6bdc3378a27 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -97,6 +97,7 @@ struct CriticalEdgeSplittingOptions { bool MergeIdenticalEdges = false; bool DontDeleteUselessPHIs = false; bool PreserveLCSSA = false; + bool SplitDetachContinue = false; CriticalEdgeSplittingOptions(DominatorTree *DT = nullptr, LoopInfo *LI = nullptr, @@ -117,6 +118,11 @@ struct CriticalEdgeSplittingOptions { PreserveLCSSA = true; return *this; } + + CriticalEdgeSplittingOptions &setSplitDetachContinue() { + SplitDetachContinue = true; + return *this; + } }; /// If this edge is a critical edge, insert a new node to split the critical diff --git a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h index fee492be2a9023..5e33ba151fc592 100644 --- a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h +++ b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h @@ -40,6 +40,13 @@ void appendToGlobalCtors(Module &M, Function *F, int Priority, void appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data = nullptr); +// Validate the result of Module::getOrInsertFunction called for an +// interface function of ComprehensiveStaticInstrumentation. If the +// instrumented module defines a function with the same name, their +// prototypes must match, otherwise getOrInsertFunction returns a +// bitcast. +Function *checkCsiInterfaceFunction(Constant *FuncOrBitcast); + // Validate the result of Module::getOrInsertFunction called for an interface // function of given sanitizer. If the instrumented module defines a function // with the same name, their prototypes must match, otherwise diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h index 5ddfbe2bf05881..5342bd1c418123 100644 --- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -30,6 +30,7 @@ class AssumptionCache; /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. bool isAllocaPromotable(const AllocaInst *AI); +bool isAllocaParallelPromotable(const AllocaInst *AI, DominatorTree &DT); /// Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index d02607acbbb579..355422e0e4b46f 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -54,6 +54,9 @@ class SSAUpdater { /// the vector. SmallVectorImpl *InsertedPHIs; + /// This keeps track of which values are defined in detached blocks. + void *VID = nullptr; + public: /// If InsertedPHIs is specified, it will be filled /// in with all PHI Nodes created by rewriting. @@ -106,6 +109,8 @@ class SSAUpdater { /// merge the appropriate values, and this value isn't live out of the block. Value *GetValueInMiddleOfBlock(BasicBlock *BB); + bool GetValueIsDetachedInBlock(BasicBlock *BB); + /// Rewrite a use of the symbolic value. 
/// /// This handles PHI nodes, which use their value in the corresponding diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index cab0f3e7157578..2b2d7a168ae729 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -66,6 +66,9 @@ class SSAUpdaterImpl { // Marker for existing PHIs that match. PhiT *PHITag = nullptr; + // Flag to indicate that the AvailableVal would be used after a Reattach. + bool DetachedUse = false; + BBInfo(BlkT *ThisBB, ValT V) : BB(ThisBB), AvailableVal(V), DefBB(V ? this : nullptr) {} }; @@ -76,6 +79,10 @@ class SSAUpdaterImpl { SmallVectorImpl *InsertedPHIs; + using ValIsDetachedTy = DenseMap; + + ValIsDetachedTy *ValIsDetached; + using BlockListTy = SmallVectorImpl; using BBMapTy = DenseMap; @@ -84,8 +91,9 @@ class SSAUpdaterImpl { public: explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A, - SmallVectorImpl *Ins) : - Updater(U), AvailableVals(A), InsertedPHIs(Ins) {} + SmallVectorImpl *Ins, + ValIsDetachedTy *D = nullptr) : + Updater(U), AvailableVals(A), InsertedPHIs(Ins), ValIsDetached(D) {} /// GetValue - Check to see if AvailableVals has an entry for the specified /// BB and if so, return it. If not, construct SSA form by first @@ -350,6 +358,10 @@ class SSAUpdaterImpl { (*AvailableVals)[Info->BB] = PHI; } + // Set of blocks with detached values that would be used except + // for Reattach. + SmallVector DetachedValBlocks; + // Now go back through the worklist in reverse order to fill in the // arguments for any new PHIs added in the forward traversal. for (typename BlockListTy::reverse_iterator I = BlockList->rbegin(), @@ -368,14 +380,34 @@ class SSAUpdaterImpl { if (!PHI) continue; + // TODO: Change this so we do not assume that a block has at + // most one Detach and Reattach predecessor. + BBInfo *DetachPredInfo = nullptr; + BBInfo *ReattachPredInfo = nullptr; // Iterate through the block's predecessors. for (unsigned p = 0; p != Info->NumPreds; ++p) { BBInfo *PredInfo = Info->Preds[p]; BlkT *Pred = PredInfo->BB; + if (Traits::BlockReattaches(Pred, Updater)) { + ReattachPredInfo = PredInfo; + continue; + } // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; Traits::AddPHIOperand(PHI, PredInfo->AvailableVal, Pred); + if (Traits::BlockDetaches(Pred, Updater)) + DetachPredInfo = PredInfo; + } + if (ReattachPredInfo) { + assert(DetachPredInfo && + "Reattach predecessor found with no corresponding Detach predecessor."); + // Available value from predecessor through a reattach is the + // same as that for the corresponding detach. + Traits::AddPHIOperand(PHI, DetachPredInfo->AvailableVal, + ReattachPredInfo->BB); + if (DetachPredInfo->AvailableVal != ReattachPredInfo->AvailableVal) + DetachedValBlocks.push_back(Info); } LLVM_DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n"); @@ -383,6 +415,9 @@ class SSAUpdaterImpl { // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(PHI); } + + // Mark any definitions that are detached from their use. 
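+    // (A value that reaches its continuation only along a reattach edge was
+    // produced inside the spawned sub-CFG; recording that fact is what backs
+    // later queries through the ValIsDetached map.)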
+ MarkDetachedDefs(&DetachedValBlocks); } /// FindExistingPHI - Look through the PHI nodes in a block to see if any of @@ -416,7 +451,21 @@ for (typename Traits::PHI_iterator I = Traits::PHI_begin(PHI), E = Traits::PHI_end(PHI); I != E; ++I) { ValT IncomingVal = I.getIncomingValue(); - BBInfo *PredInfo = BBMap[I.getIncomingBlock()]; + BlkT *BB = I.getIncomingBlock(); + + // Replace a reattach predecessor with the corresponding + // detach predecessor. + // + // TODO: Remove the implicit assumption here that each basic + // block has at most one reattach predecessor. + if (Traits::BlockReattaches(BB, Updater)) + for (typename Traits::PHI_iterator PI = Traits::PHI_begin(PHI), + PE = Traits::PHI_end(PHI); PI != PE; ++PI) + if (Traits::BlockDetaches(PI.getIncomingBlock(), Updater)) { + BB = PI.getIncomingBlock(); + break; + } + BBInfo *PredInfo = BBMap[BB]; // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; @@ -459,6 +508,30 @@ BBMap[BB]->AvailableVal = PHIVal; } } + + /// MarkDetachedDefs - Mark all definitions that reach the basic + /// blocks in WorkList as having detached uses. + void MarkDetachedDefs(SmallVector *WorkList) { + BBInfo *Info; + while (!WorkList->empty()) { + Info = WorkList->pop_back_val(); + Info->DetachedUse = true; + + ValT AvailableVal = Info->AvailableVal; + if (!AvailableVal) + continue; + + if (ValIsDetached) + (*ValIsDetached)[Info->BB] = true; + + if (Traits::ValueIsPHI(AvailableVal, Updater) || + Info->DefBB != Info) + for (unsigned p = 0; p != Info->NumPreds; ++p) + if (!Info->Preds[p]->DetachedUse) + WorkList->push_back(Info->Preds[p]); + } + } + }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/TapirUtils.h b/llvm/include/llvm/Transforms/Utils/TapirUtils.h new file mode 100644 index 00000000000000..f8e4e98850c237 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/TapirUtils.h @@ -0,0 +1,53 @@ +//===-- TapirUtils.h - Utility methods for Tapir ---------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines utility methods for handling code containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H +#define LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +class BasicBlock; +class DetachInst; +class DominatorTree; +class TerminatorInst; + +/// Move static allocas in a block into the specified entry block. Leave +/// lifetime markers behind for those static allocas. Returns true if the +/// cloned block still contains dynamic allocas, which cannot be moved. +bool MoveStaticAllocasInBlock( + BasicBlock *Entry, BasicBlock *Block, + SmallVectorImpl &ExitPoints); + +/// Serialize the sub-CFG detached by the specified detach +/// instruction. Removes the detach instruction and returns a pointer +/// to the branch instruction that replaces it. +BranchInst* SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT = nullptr); + +/// Get the entry basic block to the detached context that contains +/// the specified block.
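+/// (The detached context of a block is the sub-CFG spawned by a detach and
+/// closed off by its reattaches.)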
+const BasicBlock *GetDetachedCtx(const BasicBlock *BB); +BasicBlock *GetDetachedCtx(BasicBlock *BB); + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum); + +} // End llvm namespace + +#endif diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 3446aef399381f..9d1efdfeddc68e 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -24,6 +24,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" @@ -139,6 +141,42 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, const CallBase *Call2) { } else if (I->isFenceLike()) { // If this is a fence, just return ModRef. return ModRefInfo::ModRef; + } else if (auto D = dyn_cast(I)) { + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &DI : *BB) { + // Fail fast if we encounter an invalid CFG. + assert(!(D == &DI) && + "Detached CFG reaches its own Detach instruction."); + + // Ignore sync instructions in this analysis + if (isa(DI) || isa(DI)) + continue; + + if (isa(DI) || isa(DI) || + isa(DI) || isa(DI) || + DI.isFenceLike() || ImmutableCallSite(&DI)) + Result = ModRefInfo(Result | getModRefInfo(&DI, Call)); + if (&DI == Call.getInstruction()) + return ModRefInfo::NoModRef; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getContinue()) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + return Result; } else { // Otherwise, check if the call modifies or references the // location this memory access defines. The best we can say @@ -540,7 +578,90 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, return ModRefInfo::ModRef; } -/// Return information about whether a particular call site modifies +ModRefInfo AAResults::getModRefInfo(const DetachInst *D, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getSuccessor(0)); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Ignore sync instructions in this analysis + if (isa(I)) + continue; + + // Fail fast if we encounter an invalid CFG. + assert(!(D == &*I) && + "Invalid CFG found: Detached CFG reaches its own Detach instruction."); + + if (!Loc.Ptr) + Result = ModRefInfo(Result | getModRefInfo(&*I)); + else + Result = ModRefInfo(Result | getModRefInfo(&*I, Loc)); + + // Early-exit the moment we reach the top of the lattice. 
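+      // (ModRef is the top of the mod/ref lattice: once the detached region
+      // is known to both read and write the location, visiting more of it
+      // cannot change the answer.)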
+ if (Result == MRI_ModRef) + return Result; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getSuccessor(1)) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + + return Result; +} + +ModRefInfo AAResults::getModRefInfo(const SyncInst *S, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(S->getParent()); + while(!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + const TerminatorInst *T = BB->getTerminator(); + if (isa(T)) { + Result = ModRefInfo(Result | getModRefInfo(T, Loc)); + + // Early-exit the moment we reach the top of the lattice. + if (Result == MRI_ModRef) + return Result; + } + + // Add predecessors + for (const_pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + const BasicBlock *Pred = *PI; + const TerminatorInst *PT = Pred->getTerminator(); + // Ignore reattached predecessors and predecessors that end in + // syncs, because this sync does not wait on those predecessors. + if (isa(PT) || isa(PT)) + continue; + // If this block is detached, ignore the predecessor that + // detaches it. + if (const DetachInst *Det = dyn_cast(PT)) + if (Det->getDetached() == BB) + continue; + + WorkList.push_back(Pred); + } + } + + return Result; +} + +/// \brief Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up /// instruction-ordering queries inside the BasicBlock containing \p I. diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index bb8742123a0f08..be402b1990f75b 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -85,6 +85,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeLCSSAVerificationPassPass(Registry); initializeMemorySSAWrapperPassPass(Registry); initializeMemorySSAPrinterLegacyPassPass(Registry); + initializeDetachSSAWrapperPassPass(Registry); + initializeDetachSSAPrinterLegacyPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index c57d8ef69d69b7..1742260bb24e52 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis Delinearization.cpp DemandedBits.cpp DependenceAnalysis.cpp + DetachSSA.cpp DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp diff --git a/llvm/lib/Analysis/DetachSSA.cpp b/llvm/lib/Analysis/DetachSSA.cpp new file mode 100644 index 00000000000000..545280e5c3e930 --- /dev/null +++ b/llvm/lib/Analysis/DetachSSA.cpp @@ -0,0 +1,1082 @@ +//===-- DetachSSA.cpp - Detach SSA Builder---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------===// +// +// This file implements the DetachSSA class. 
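For reference, a hedged sketch of how a client might call the two overloads defined above; both wrapper functions are illustrative, only the getModRefInfo calls come from the patch.

#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

// May the work spawned by this detach read or write Loc?
ModRefInfo detachedBodyModRef(AAResults &AA, const DetachInst *DI,
                              const MemoryLocation &Loc) {
  // Walks the blocks between the detach and its continuation, as above.
  return AA.getModRefInfo(DI, Loc);
}

// May the tasks this sync waits on read or write Loc?
ModRefInfo syncedWorkModRef(AAResults &AA, const SyncInst *SI,
                            const MemoryLocation &Loc) {
  // Walks backwards from the sync over reachable detaches, as above.
  return AA.getModRefInfo(SI, Loc);
}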
+// +//===----------------------------------------------------------------===// +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" + +#define DEBUG_TYPE "detachssa" +using namespace llvm; +INITIALIZE_PASS_BEGIN(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) + +INITIALIZE_PASS_BEGIN(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) +INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +INITIALIZE_PASS_END(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) + +static cl::opt + VerifyDetachSSA("verify-detachssa", cl::init(false), cl::Hidden, + cl::desc("Verify DetachSSA in legacy printer pass.")); + +namespace llvm { +/// \brief An assembly annotator class to print Detach SSA information in +/// comments. +class DetachSSAAnnotatedWriter : public AssemblyAnnotationWriter { + friend class DetachSSA; + const DetachSSA *DSSA; + +public: + DetachSSAAnnotatedWriter(const DetachSSA *D) : DSSA(D) {} + + virtual void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(BB)) + OS << "; " << *DA << "\n"; + } + + virtual void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(I)) + OS << "; " << *DA << "\n"; + } +}; + +struct RenamePassData { + DomTreeNode *DTN; + DomTreeNode::const_iterator ChildIt; + DetachAccess *IncomingVal; + + RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It, + DetachAccess *M) + : DTN(D), ChildIt(It), IncomingVal(M) {} + void swap(RenamePassData &RHS) { + std::swap(DTN, RHS.DTN); + std::swap(ChildIt, RHS.ChildIt); + std::swap(IncomingVal, RHS.IncomingVal); + } +}; +} // anonymous namespace + +namespace llvm { + +void DetachSSA::renameSuccessorPhis(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + // Pass through values to our successors + for (const BasicBlock *S : successors(BB)) { + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast(&Accesses->front()); + if (RenameAllUses) { + int PhiIndex = Phi->getBasicBlockIndex(BB); + assert(PhiIndex != -1 && "Incomplete phi during partial rename"); + Phi->setIncomingValue(PhiIndex, IncomingVal); + } else + Phi->addIncoming(IncomingVal, BB); + } +} + +/// \brief Rename a single basic block into DetachSSA form. +/// Uses the standard SSA renaming algorithm. +/// \returns The new incoming value. +DetachAccess *DetachSSA::renameBlock(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + auto It = PerBlockAccesses.find(BB); + // Skip most processing if the list is empty. 
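A small, hedged example of building DetachSSA by hand (outside the pass manager) using the constructor, print, and verify entry points defined in this file; the helper name and header choices are illustrative.

#include "llvm/Analysis/DetachSSA.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpDetachSSA(Function &F) {
  DominatorTree DT(F);
  DetachSSA DSSA(F, &DT); // runs buildDetachSSA()
  DSSA.print(errs());     // annotated IR, via DetachSSAAnnotatedWriter
#ifndef NDEBUG
  DSSA.verifyDetachSSA();
#endif
}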
+ if (It != PerBlockAccesses.end()) { + AccessList *Accesses = It->second.get(); + for (DetachAccess &L : *Accesses) { + if (DetachUseOrDef *DUD = dyn_cast(&L)) { + if (DUD->getDefiningAccess() == nullptr || RenameAllUses) + DUD->setDefiningAccess(IncomingVal); + if (isa(&L)) + IncomingVal = &L; + } else { + IncomingVal = &L; + } + } + } + return IncomingVal; +} + +/// \brief This is the standard SSA renaming algorithm. +/// +/// We walk the dominator tree in preorder, renaming accesses, and then filling +/// in phi nodes in our successors. +void DetachSSA::renamePass(DomTreeNode *Root, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited, bool RenameAllUses) { + SmallVector WorkStack; + // Skip everything if we already renamed this block and we are skipping. + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. + bool AlreadyVisited = !Visited.insert(Root->getBlock()).second; + if (SkipVisited && AlreadyVisited) + return; + + IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses); + renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses); + WorkStack.push_back({Root, Root->begin(), IncomingVal}); + + while (!WorkStack.empty()) { + DomTreeNode *Node = WorkStack.back().DTN; + DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt; + IncomingVal = WorkStack.back().IncomingVal; + + if (ChildIt == Node->end()) { + WorkStack.pop_back(); + } else { + DomTreeNode *Child = *ChildIt; + ++WorkStack.back().ChildIt; + BasicBlock *BB = Child->getBlock(); + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. + AlreadyVisited = !Visited.insert(BB).second; + if (SkipVisited && AlreadyVisited) { + // We already visited this during our renaming, which can happen when + // being asked to rename multiple blocks. Figure out the incoming val, + // which is the last def. + // Incoming value can only change if there is a block def, and in that + // case, it's the last block def in the list. + if (auto *BlockDefs = getWritableBlockDefs(BB)) + IncomingVal = &*BlockDefs->rbegin(); + } else + IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses); + renameSuccessorPhis(BB, IncomingVal, RenameAllUses); + WorkStack.push_back({Child, Child->begin(), IncomingVal}); + } + } +} + +/// \brief This handles unreachable block accesses by deleting phi nodes in +/// unreachable blocks, and marking all other unreachable DetachAccess's as +/// being uses of the live on entry definition. +void DetachSSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { + assert(!DT->isReachableFromEntry(BB) && + "Reachable block found while handling unreachable blocks"); + + // Make sure phi nodes in our reachable successors end up with a + // LiveOnEntryDef for our incoming edge, even though our block is forward + // unreachable. We could just disconnect these blocks from the CFG fully, + // but we do not right now. 
+ for (const BasicBlock *S : successors(BB)) { + if (!DT->isReachableFromEntry(S)) + continue; + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast(&Accesses->front()); + Phi->addIncoming(LiveOnEntryDef.get(), BB); + } + + auto It = PerBlockAccesses.find(BB); + if (It == PerBlockAccesses.end()) + return; + + auto &Accesses = It->second; + for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) { + auto Next = std::next(AI); + // If we have a phi, just remove it. We are going to replace all + // users with live on entry. + if (auto *UseOrDef = dyn_cast(AI)) + UseOrDef->setDefiningAccess(LiveOnEntryDef.get()); + else + Accesses->erase(AI); + AI = Next; + } +} + +DetachSSA::DetachSSA(Function &Func, DominatorTree *DT) + : DT(DT), F(Func), + NextID(INVALID_DETACHACCESS_ID) { + buildDetachSSA(); +} + +DetachSSA::~DetachSSA() { + // Drop all our references + for (const auto &Pair : PerBlockAccesses) + for (DetachAccess &DA : *Pair.second) + DA.dropAllReferences(); +} + +DetachSSA::AccessList *DetachSSA::getOrCreateAccessList(const BasicBlock *BB) { + auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr)); + + if (Res.second) + Res.first->second = make_unique(); + return Res.first->second.get(); +} +DetachSSA::DefsList *DetachSSA::getOrCreateDefsList(const BasicBlock *BB) { + auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr)); + + if (Res.second) + Res.first->second = make_unique(); + return Res.first->second.get(); +} + +// /// This class is a batch walker of all DetachUse's in the program, and points +// /// their defining access at the thing that actually clobbers them. Because it +// /// is a batch walker that touches everything, it does not operate like the +// /// other walkers. This walker is basically performing a top-down SSA renaming +// /// pass, where the version stack is used as the cache. This enables it to be +// /// significantly more time and detach efficient than using the regular walker, +// /// which is walking bottom-up. +// class DetachSSA::OptimizeUses { +// public: +// OptimizeUses(DetachSSA *DSSA, DetachSSAWalker *Walker, AliasAnalysis *AA, +// DominatorTree *DT) +// : DSSA(DSSA), Walker(Walker), AA(AA), DT(DT) { +// Walker = DSSA->getWalker(); +// } + +// void optimizeUses(); + +// private: +// /// This represents where a given detachlocation is in the stack. +// struct MemlocStackInfo { +// // This essentially is keeping track of versions of the stack. Whenever +// // the stack changes due to pushes or pops, these versions increase. +// unsigned long StackEpoch; +// unsigned long PopEpoch; +// // This is the lower bound of places on the stack to check. It is equal to +// // the place the last stack walk ended. +// // Note: Correctness depends on this being initialized to 0, which densemap +// // does +// unsigned long LowerBound; +// const BasicBlock *LowerBoundBlock; +// // This is where the last walk for this detach location ended. 
+// unsigned long LastKill; +// bool LastKillValid; +// }; +// void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &, +// SmallVectorImpl &, +// DenseMap &); +// DetachSSA *DSSA; +// DetachSSAWalker *Walker; +// AliasAnalysis *AA; +// DominatorTree *DT; +// }; + +// /// Optimize the uses in a given block This is basically the SSA renaming +// /// algorithm, with one caveat: We are able to use a single stack for all +// /// DetachUses. This is because the set of *possible* reaching DetachDefs is +// /// the same for every DetachUse. The *actual* clobbering DetachDef is just +// /// going to be some position in that stack of possible ones. +// /// +// /// We track the stack positions that each DetachLocation needs +// /// to check, and last ended at. This is because we only want to check the +// /// things that changed since last time. The same DetachLocation should +// /// get clobbered by the same store (getModRefInfo does not use invariantness or +// /// things like this, and if they start, we can modify DetachLocOrCall to +// /// include relevant data) +// void DetachSSA::OptimizeUses::optimizeUsesInBlock( +// const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch, +// SmallVectorImpl &VersionStack, +// DenseMap &LocStackInfo) { + +// /// If no accesses, nothing to do. +// DetachSSA::AccessList *Accesses = DSSA->getWritableBlockAccesses(BB); +// if (Accesses == nullptr) +// return; + +// // Pop everything that doesn't dominate the current block off the stack, +// // increment the PopEpoch to account for this. +// while (true) { +// assert( +// !VersionStack.empty() && +// "Version stack should have liveOnEntry sentinel dominating everything"); +// BasicBlock *BackBlock = VersionStack.back()->getBlock(); +// if (DT->dominates(BackBlock, BB)) +// break; +// while (VersionStack.back()->getBlock() == BackBlock) +// VersionStack.pop_back(); +// ++PopEpoch; +// } + +// for (DetachAccess &DA : *Accesses) { +// auto *MU = dyn_cast(&DA); +// if (!MU) { +// VersionStack.push_back(&DA); +// ++StackEpoch; +// continue; +// } + +// if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getDetachInst())) { +// MU->setDefiningAccess(DSSA->getLiveOnEntryDef(), true); +// continue; +// } + +// DetachLocOrCall UseMLOC(MU); +// auto &LocInfo = LocStackInfo[UseMLOC]; +// // If the pop epoch changed, it means we've removed stuff from top of +// // stack due to changing blocks. We may have to reset the lower bound or +// // last kill info. +// if (LocInfo.PopEpoch != PopEpoch) { +// LocInfo.PopEpoch = PopEpoch; +// LocInfo.StackEpoch = StackEpoch; +// // If the lower bound was in something that no longer dominates us, we +// // have to reset it. +// // We can't simply track stack size, because the stack may have had +// // pushes/pops in the meantime. +// // XXX: This is non-optimal, but only is slower cases with heavily +// // branching dominator trees. To get the optimal number of queries would +// // be to make lowerbound and lastkill a per-loc stack, and pop it until +// // the top of that stack dominates us. This does not seem worth it ATM. +// // A much cheaper optimization would be to always explore the deepest +// // branch of the dominator tree first. This will guarantee this resets on +// // the smallest set of blocks. +// if (LocInfo.LowerBoundBlock && LocInfo.LowerBoundBlock != BB && +// !DT->dominates(LocInfo.LowerBoundBlock, BB)) { +// // Reset the lower bound of things to check. 
+// // TODO: Some day we should be able to reset to last kill, rather than +// // 0. +// LocInfo.LowerBound = 0; +// LocInfo.LowerBoundBlock = VersionStack[0]->getBlock(); +// LocInfo.LastKillValid = false; +// } +// } else if (LocInfo.StackEpoch != StackEpoch) { +// // If all that has changed is the StackEpoch, we only have to check the +// // new things on the stack, because we've checked everything before. In +// // this case, the lower bound of things to check remains the same. +// LocInfo.PopEpoch = PopEpoch; +// LocInfo.StackEpoch = StackEpoch; +// } +// if (!LocInfo.LastKillValid) { +// LocInfo.LastKill = VersionStack.size() - 1; +// LocInfo.LastKillValid = true; +// } + +// // At this point, we should have corrected last kill and LowerBound to be +// // in bounds. +// assert(LocInfo.LowerBound < VersionStack.size() && +// "Lower bound out of range"); +// assert(LocInfo.LastKill < VersionStack.size() && +// "Last kill info out of range"); +// // In any case, the new upper bound is the top of the stack. +// unsigned long UpperBound = VersionStack.size() - 1; + +// if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) { +// DEBUG(dbgs() << "DetachSSA skipping optimization of " << *MU << " (" +// << *(MU->getDetachInst()) << ")" +// << " because there are " << UpperBound - LocInfo.LowerBound +// << " stores to disambiguate\n"); +// // Because we did not walk, LastKill is no longer valid, as this may +// // have been a kill. +// LocInfo.LastKillValid = false; +// continue; +// } +// bool FoundClobberResult = false; +// while (UpperBound > LocInfo.LowerBound) { +// if (isa(VersionStack[UpperBound])) { +// // For phis, use the walker, see where we ended up, go there +// Instruction *UseInst = MU->getDetachInst(); +// DetachAccess *Result = Walker->getClobberingDetachAccess(UseInst); +// // We are guaranteed to find it or something is wrong +// while (VersionStack[UpperBound] != Result) { +// assert(UpperBound != 0); +// --UpperBound; +// } +// FoundClobberResult = true; +// break; +// } + +// DetachDef *MD = cast(VersionStack[UpperBound]); +// // If the lifetime of the pointer ends at this instruction, it's live on +// // entry. +// if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) { +// // Reset UpperBound to liveOnEntryDef's place in the stack +// UpperBound = 0; +// FoundClobberResult = true; +// break; +// } +// if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) { +// FoundClobberResult = true; +// break; +// } +// --UpperBound; +// } +// // At the end of this loop, UpperBound is either a clobber, or lower bound +// // PHI walking may cause it to be < LowerBound, and in fact, < LastKill. +// if (FoundClobberResult || UpperBound < LocInfo.LastKill) { +// MU->setDefiningAccess(VersionStack[UpperBound], true); +// // We were last killed now by where we got to +// LocInfo.LastKill = UpperBound; +// } else { +// // Otherwise, we checked all the new ones, and now we know we can get to +// // LastKill. +// MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true); +// } +// LocInfo.LowerBound = VersionStack.size() - 1; +// LocInfo.LowerBoundBlock = BB; +// } +// } + +// /// Optimize uses to point to their actual clobbering definitions. +// void DetachSSA::OptimizeUses::optimizeUses() { +// SmallVector VersionStack; +// DenseMap LocStackInfo; +// VersionStack.push_back(DSSA->getLiveOnEntryDef()); + +// unsigned long StackEpoch = 1; +// unsigned long PopEpoch = 1; +// // We perform a non-recursive top-down dominator tree walk. 
+// for (const auto *DomNode : depth_first(DT->getRootNode())) +// optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack, +// LocStackInfo); +// } + +void DetachSSA::placePHINodes( + const SmallPtrSetImpl &DefiningBlocks, + const DenseMap &BBNumbers) { + // Determine where our DetachPhi's should go + ForwardIDFCalculator IDFs(*DT); + IDFs.setDefiningBlocks(DefiningBlocks); + SmallVector IDFBlocks; + IDFs.calculate(IDFBlocks); + + std::sort(IDFBlocks.begin(), IDFBlocks.end(), + [&BBNumbers](const BasicBlock *A, const BasicBlock *B) { + return BBNumbers.lookup(A) < BBNumbers.lookup(B); + }); + + // Now place DetachPhi nodes. + for (auto &BB : IDFBlocks) + createDetachPhi(BB); +} + +void DetachSSA::buildDetachSSA() { + BasicBlock &StartingPoint = F.getEntryBlock(); + LiveOnEntryDef = make_unique(F.getContext(), nullptr, nullptr, + &StartingPoint, NextID++); + DenseMap BBNumbers; + unsigned NextBBNum = 0; + + // We maintain lists of detach accesses per block, trading memory for time. We + // could just look up the detach access for every possible instruction in the + // stream. + SmallPtrSet DefiningBlocks; + // Go through each block, figure out where defs occur, and chain together all + // the accesses. + for (BasicBlock &B : F) { + BBNumbers[&B] = NextBBNum++; + bool InsertIntoDef = false; + AccessList *Accesses = nullptr; + DefsList *Defs = nullptr; + if (isa(B.getTerminator()) || + isa(B.getTerminator())) { + DetachUseOrDef *DUD = new DetachDef(B.getContext(), nullptr, + B.getTerminator(), &B, + NextID++); + ValueToDetachAccess[B.getTerminator()] = DUD; + + if (!Accesses) + Accesses = getOrCreateAccessList(&B); + Accesses->push_back(DUD); + InsertIntoDef = true; + if (!Defs) + Defs = getOrCreateDefsList(&B); + Defs->push_back(*DUD); + } + if (InsertIntoDef) + DefiningBlocks.insert(&B); + } + placePHINodes(DefiningBlocks, BBNumbers); + + // Now do regular SSA renaming on the DetachDef/DetachUse. Visited will get + // filled in with all blocks. + SmallPtrSet Visited; + renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited); + + // CachingWalker *Walker = getWalkerImpl(); + + // // We're doing a batch of updates; don't drop useful caches between them. + // Walker->setAutoResetWalker(false); + // OptimizeUses(this, Walker, AA, DT).optimizeUses(); + // Walker->setAutoResetWalker(true); + // Walker->resetClobberWalker(); + + // Mark the uses in unreachable blocks as live on entry, so that they go + // somewhere. + for (auto &BB : F) + if (!Visited.count(&BB)) + markUnreachableAsLiveOnEntry(&BB); +} + +// This is a helper function used by the creation routines. It places NewAccess +// into the access and defs lists for a given basic block, at the given +// insertion point. +void DetachSSA::insertIntoListsForBlock(DetachAccess *NewAccess, + const BasicBlock *BB, + InsertionPlace Point) { + auto *Accesses = getOrCreateAccessList(BB); + if (Point == Beginning) { + // If it's a phi node, it goes first, otherwise, it goes after any phi + // nodes. 
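buildDetachSSA above creates a DetachDef for every block terminated by a detach or reattach and joins them with DetachPhis on the iterated dominance frontier. The sketch below walks from a detach's def back toward the live-on-entry sentinel; it assumes getDetachAccess, getDefiningAccess, and isLiveOnEntryDef behave like their MemorySSA counterparts and is illustrative only.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

void walkDefsFrom(DetachSSA &DSSA, DetachInst *DI) {
  DetachAccess *DA = DSSA.getDetachAccess(DI);
  while (DA && !DSSA.isLiveOnEntryDef(DA)) {
    errs() << *DA << "\n";
    auto *DUD = dyn_cast<DetachUseOrDef>(DA);
    if (!DUD)
      break;                       // reached a DetachPhi; stop at the merge
    DA = DUD->getDefiningAccess(); // hop to the previous detach/reattach
  }
}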
+ if (isa(NewAccess)) { + Accesses->push_front(NewAccess); + auto *Defs = getOrCreateDefsList(BB); + Defs->push_front(*NewAccess); + } else { + auto AI = find_if_not( + *Accesses, [](const DetachAccess &DA) { return isa(DA); }); + Accesses->insert(AI, NewAccess); + if (!isa(NewAccess)) { + auto *Defs = getOrCreateDefsList(BB); + auto DI = find_if_not( + *Defs, [](const DetachAccess &DA) { return isa(DA); }); + Defs->insert(DI, *NewAccess); + } + } + } else { + Accesses->push_back(NewAccess); + if (!isa(NewAccess)) { + auto *Defs = getOrCreateDefsList(BB); + Defs->push_back(*NewAccess); + } + } + BlockNumberingValid.erase(BB); +} + +void DetachSSA::insertIntoListsBefore(DetachAccess *What, const BasicBlock *BB, + AccessList::iterator InsertPt) { + auto *Accesses = getWritableBlockAccesses(BB); + bool WasEnd = InsertPt == Accesses->end(); + Accesses->insert(AccessList::iterator(InsertPt), What); + if (!isa(What)) { + auto *Defs = getOrCreateDefsList(BB); + // If we got asked to insert at the end, we have an easy job, just shove it + // at the end. If we got asked to insert before an existing def, we also get + // an terator. If we got asked to insert before a use, we have to hunt for + // the next def. + if (WasEnd) { + Defs->push_back(*What); + } else if (isa(InsertPt)) { + Defs->insert(InsertPt->getDefsIterator(), *What); + } else { + while (InsertPt != Accesses->end() && !isa(InsertPt)) + ++InsertPt; + // Either we found a def, or we are inserting at the end + if (InsertPt == Accesses->end()) + Defs->push_back(*What); + else + Defs->insert(InsertPt->getDefsIterator(), *What); + } + } + BlockNumberingValid.erase(BB); +} + +// Move What before Where in the IR. The end result is that What will belong to +// the right lists and have the right Block set, but will not otherwise be +// correct. It will not have the right defining access, and if it is a def, +// things below it will not properly be updated. +void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB, + AccessList::iterator Where) { + // Keep it in the lookup tables, remove from the lists + removeFromLists(What, false); + What->setBlock(BB); + insertIntoListsBefore(What, BB, Where); +} + +void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB, + InsertionPlace Point) { + removeFromLists(What, false); + What->setBlock(BB); + insertIntoListsForBlock(What, BB, Point); +} + +DetachPhi *DetachSSA::createDetachPhi(BasicBlock *BB) { + assert(!getDetachAccess(BB) && "DetachPhi already exists for this BB"); + DetachPhi *Phi = new DetachPhi(BB->getContext(), BB, NextID++); + // Phi's always are placed at the front of the block. 
+ insertIntoListsForBlock(Phi, BB, Beginning); + ValueToDetachAccess[BB] = Phi; + return Phi; +} + +// DetachUseOrDef *DetachSSA::createDefinedAccess(Instruction *I, +// DetachAccess *Definition) { +// assert(!isa(I) && "Cannot create a defined access for a PHI"); +// DetachUseOrDef *NewAccess = createNewAccess(I); +// assert( +// NewAccess != nullptr && +// "Tried to create a detach access for a non-detach touching instruction"); +// NewAccess->setDefiningAccess(Definition); +// return NewAccess; +// } + +// /// \brief Helper function to create new detach accesses +// DetachUseOrDef *DetachSSA::createNewAccess(Instruction *I) { +// bool Def = isa(I); +// bool Use = isa(I); + +// if (!Def && !Use) +// return nullptr; + +// DetachUseOrDef *DUD; +// if (Def) +// DUD = new DetachDef(I->getContext, nullptr, I, +// cast(I)->getContinue(), NextID++); +// else if (Use) +// DUD = new DetachUse(I->getContext, nullptr, I, I->getParent()); +// ValueToDetachAccess[I] = DUD; +// return DUD; +// } + +/// \brief Returns true if \p Replacer dominates \p Replacee . +bool DetachSSA::dominatesUse(const DetachAccess *Replacer, + const DetachAccess *Replacee) const { + if (isa(Replacee)) + return DT->dominates(Replacer->getBlock(), Replacee->getBlock()); + const auto *DP = cast(Replacee); + // For a phi node, the use occurs in the predecessor block of the phi node. + // Since we may occur multiple times in the phi node, we have to check each + // operand to ensure Replacer dominates each operand where Replacee occurs. + for (const Use &Arg : DP->operands()) { + if (Arg.get() != Replacee && + !DT->dominates(Replacer->getBlock(), DP->getIncomingBlock(Arg))) + return false; + } + return true; +} + +/// \brief Properly remove \p DA from all of DetachSSA's lookup tables. +void DetachSSA::removeFromLookups(DetachAccess *DA) { + assert(DA->use_empty() && + "Trying to remove detach access that still has uses"); + BlockNumbering.erase(DA); + if (DetachUseOrDef *MUD = dyn_cast(DA)) + MUD->setDefiningAccess(nullptr); + // // Invalidate our walker's cache if necessary + // if (!isa(DA)) + // Walker->invalidateInfo(DA); + // The call below to erase will destroy DA, so we can't change the order we + // are doing things here + Value *DAInst; + if (DetachUseOrDef *DUD = dyn_cast(DA)) { + DAInst = DUD->getDAInst(); + } else { + DAInst = DA->getBlock(); + } + auto VDA = ValueToDetachAccess.find(DAInst); + if (VDA->second == DA) + ValueToDetachAccess.erase(VDA); +} + +/// \brief Properly remove \p DA from all of DetachSSA's lists. +/// +/// Because of the way the intrusive list and use lists work, it is important to +/// do removal in the right order. +/// ShouldDelete defaults to true, and will cause the detach access to also be +/// deleted, not just removed. +void DetachSSA::removeFromLists(DetachAccess *DA, bool ShouldDelete) { + // The access list owns the reference, so we erase it from the non-owning list + // first. + if (!isa(DA)) { + auto DefsIt = PerBlockDefs.find(DA->getBlock()); + std::unique_ptr &Defs = DefsIt->second; + Defs->remove(*DA); + if (Defs->empty()) + PerBlockDefs.erase(DefsIt); + } + + // The erase call here will delete it. If we don't want it deleted, we call + // remove instead. 
+ auto AccessIt = PerBlockAccesses.find(DA->getBlock()); + std::unique_ptr &Accesses = AccessIt->second; + if (ShouldDelete) + Accesses->erase(DA); + else + Accesses->remove(DA); + + if (Accesses->empty()) + PerBlockAccesses.erase(AccessIt); +} + +void DetachSSA::print(raw_ostream &OS) const { + DetachSSAAnnotatedWriter Writer(this); + F.print(OS, &Writer); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DetachSSA::dump() const { print(dbgs()); } +#endif + +void DetachSSA::verifyDetachSSA() const { + verifyDefUses(F); + verifyDomination(F); + verifyOrdering(F); + // Walker->verify(this); +} + +/// \brief Verify that the order and existence of DetachAccesses matches the +/// order and existence of detach affecting instructions. +void DetachSSA::verifyOrdering(Function &F) const { + // Walk all the blocks, comparing what the lookups think and what the access + // lists think, as well as the order in the blocks vs the order in the access + // lists. + SmallVector ActualAccesses; + SmallVector ActualDefs; + for (BasicBlock &B : F) { + const AccessList *AL = getBlockAccesses(&B); + const auto *DL = getBlockDefs(&B); + DetachAccess *Phi = getDetachAccess(&B); + if (Phi) { + ActualAccesses.push_back(Phi); + ActualDefs.push_back(Phi); + } + + for (Instruction &I : B) { + DetachAccess *DA = getDetachAccess(&I); + assert((!DA || (AL && (isa(DA) || DL))) && + "We have detach affecting instructions " + "in this block but they are not in the " + "access list or defs list"); + if (DA) { + ActualAccesses.push_back(DA); + if (isa(DA)) + ActualDefs.push_back(DA); + } + } + // Either we hit the assert, really have no accesses, or we have both + // accesses and an access list. + // Same with defs. + if (!AL && !DL) + continue; + assert(AL->size() == ActualAccesses.size() && + "We don't have the same number of accesses in the block as on the " + "access list"); + assert((DL || ActualDefs.size() == 0) && + "Either we should have a defs list, or we should have no defs"); + assert((!DL || DL->size() == ActualDefs.size()) && + "We don't have the same number of defs in the block as on the " + "def list"); + auto ALI = AL->begin(); + auto AAI = ActualAccesses.begin(); + while (ALI != AL->end() && AAI != ActualAccesses.end()) { + assert(&*ALI == *AAI && "Not the same accesses in the same order"); + ++ALI; + ++AAI; + } + ActualAccesses.clear(); + if (DL) { + auto DLI = DL->begin(); + auto ADI = ActualDefs.begin(); + while (DLI != DL->end() && ADI != ActualDefs.end()) { + assert(&*DLI == *ADI && "Not the same defs in the same order"); + ++DLI; + ++ADI; + } + } + ActualDefs.clear(); + } +} + +/// \brief Verify the domination properties of DetachSSA by checking that each +/// definition dominates all of its uses. +void DetachSSA::verifyDomination(Function &F) const { +#ifndef NDEBUG + for (BasicBlock &B : F) { + // Phi nodes are attached to basic blocks + if (DetachPhi *DP = getDetachAccess(&B)) + for (const Use &U : DP->uses()) + assert(dominates(DP, U) && "Detach PHI does not dominate it's uses"); + + for (Instruction &I : B) { + DetachAccess *MD = dyn_cast_or_null(getDetachAccess(&I)); + if (!MD) + continue; + + for (const Use &U : MD->uses()) + assert(dominates(MD, U) && "Detach Def does not dominate it's uses"); + } + } +#endif +} + +/// \brief Verify the def-use lists in DetachSSA, by verifying that \p Use +/// appears in the use list of \p Def. 
+ +void DetachSSA::verifyUseInDefs(DetachAccess *Def, DetachAccess *Use) const { +#ifndef NDEBUG + if (!Def) + assert(isLiveOnEntryDef(Use) && + "Null def but use not point to live on entry def"); + else + assert(is_contained(Def->users(), Use) && + "Did not find use in def's use list"); +#endif +} + +/// \brief Verify the immediate use information, by walking all the detach +/// accesses and verifying that, for each use, it appears in the +/// appropriate def's use list +void DetachSSA::verifyDefUses(Function &F) const { + for (BasicBlock &B : F) { + // Phi nodes are attached to basic blocks + if (DetachPhi *Phi = getDetachAccess(&B)) { + assert(Phi->getNumOperands() == static_cast(std::distance( + pred_begin(&B), pred_end(&B))) && + "Incomplete DetachPhi Node"); + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + verifyUseInDefs(Phi->getIncomingValue(I), Phi); + } + + for (Instruction &I : B) { + if (DetachUseOrDef *DA = getDetachAccess(&I)) { + verifyUseInDefs(DA->getDefiningAccess(), DA); + } + } + } +} + +DetachUseOrDef *DetachSSA::getDetachAccess(const Instruction *I) const { + return cast_or_null(ValueToDetachAccess.lookup(I)); +} + +DetachPhi *DetachSSA::getDetachAccess(const BasicBlock *BB) const { + return cast_or_null(ValueToDetachAccess.lookup(cast(BB))); +} + +/// Perform a local numbering on blocks so that instruction ordering can be +/// determined in constant time. +/// TODO: We currently just number in order. If we numbered by N, we could +/// allow at least N-1 sequences of insertBefore or insertAfter (and at least +/// log2(N) sequences of mixed before and after) without needing to invalidate +/// the numbering. +void DetachSSA::renumberBlock(const BasicBlock *B) const { + // The pre-increment ensures the numbers really start at 1. + unsigned long CurrentNumber = 0; + const AccessList *AL = getBlockAccesses(B); + assert(AL != nullptr && "Asking to renumber an empty block"); + for (const auto &I : *AL) + BlockNumbering[&I] = ++CurrentNumber; + BlockNumberingValid.insert(B); +} + +/// \brief Determine, for two detach accesses in the same block, +/// whether \p Dominator dominates \p Dominatee. +/// \returns True if \p Dominator dominates \p Dominatee. +bool DetachSSA::locallyDominates(const DetachAccess *Dominator, + const DetachAccess *Dominatee) const { + + const BasicBlock *DominatorBlock = Dominator->getBlock(); + + assert((DominatorBlock == Dominatee->getBlock()) && + "Asking for local domination when accesses are in different blocks!"); + // A node dominates itself. + if (Dominatee == Dominator) + return true; + + // When Dominatee is defined on function entry, it is not dominated by another + // detach access. + if (isLiveOnEntryDef(Dominatee)) + return false; + + // When Dominator is defined on function entry, it dominates the other detach + // access. 
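A hedged sketch of inspecting the DetachPhi that merges detach state at a join block. getDetachAccess(const BasicBlock*) and getIncomingValue come from this file; the indexed getIncomingBlock(unsigned) accessor is assumed to mirror MemoryPhi and may differ.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

void printPhiIncoming(DetachSSA &DSSA, BasicBlock *Join) {
  if (DetachPhi *Phi = DSSA.getDetachAccess(Join))
    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
      errs() << Phi->getIncomingBlock(I)->getName() << " -> "
             << *Phi->getIncomingValue(I) << "\n";
}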
+ if (isLiveOnEntryDef(Dominator)) + return true; + + if (!BlockNumberingValid.count(DominatorBlock)) + renumberBlock(DominatorBlock); + + unsigned long DominatorNum = BlockNumbering.lookup(Dominator); + // All numbers start with 1 + assert(DominatorNum != 0 && "Block was not numbered properly"); + unsigned long DominateeNum = BlockNumbering.lookup(Dominatee); + assert(DominateeNum != 0 && "Block was not numbered properly"); + return DominatorNum < DominateeNum; +} + +bool DetachSSA::dominates(const DetachAccess *Dominator, + const DetachAccess *Dominatee) const { + if (Dominator == Dominatee) + return true; + + if (isLiveOnEntryDef(Dominatee)) + return false; + + if (Dominator->getBlock() != Dominatee->getBlock()) + return DT->dominates(Dominator->getBlock(), Dominatee->getBlock()); + return locallyDominates(Dominator, Dominatee); +} + +bool DetachSSA::dominates(const DetachAccess *Dominator, + const Use &Dominatee) const { + if (DetachPhi *DP = dyn_cast(Dominatee.getUser())) { + BasicBlock *UseBB = DP->getIncomingBlock(Dominatee); + // The def must dominate the incoming block of the phi. + if (UseBB != Dominator->getBlock()) + return DT->dominates(Dominator->getBlock(), UseBB); + // If the UseBB and the DefBB are the same, compare locally. + return locallyDominates(Dominator, cast(Dominatee)); + } + // If it's not a PHI node use, the normal dominates can already handle it. + return dominates(Dominator, cast(Dominatee.getUser())); +} + +void DetachAccess::print(raw_ostream &OS) const { + switch (getValueID()) { + case DetachPhiVal: return static_cast(this)->print(OS); + case DetachDefVal: return static_cast(this)->print(OS); + case DetachUseVal: return static_cast(this)->print(OS); + } + llvm_unreachable("invalid value id"); +} + +void DetachDef::print(raw_ostream &OS) const { + DetachAccess *UO = getDefiningAccess(); + + OS << getID() << " = DetachDef("; + if (UO && UO->getID()) + OS << UO->getID(); + OS << ')'; +} + +void DetachPhi::print(raw_ostream &OS) const { + bool First = true; + OS << getID() << " = DetachPhi("; + for (const auto &Op : operands()) { + BasicBlock *BB = getIncomingBlock(Op); + DetachAccess *DA = cast(Op); + if (!First) + OS << ','; + else + First = false; + + OS << '{'; + if (BB->hasName()) + OS << BB->getName(); + else + BB->printAsOperand(OS, false); + OS << ','; + if (unsigned ID = DA->getID()) + OS << ID; + OS << '}'; + } + OS << ')'; +} + +void DetachUse::print(raw_ostream &OS) const { + DetachAccess *UO = getDefiningAccess(); + OS << "DetachUse("; + if (UO && UO->getID()) + OS << UO->getID(); + OS << ')'; +} + +void DetachAccess::dump() const { +// Cannot completely remove virtual function even in release mode. 
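The dominance queries above admit a simple "must execute before" check between two detach-relevant instructions; the wrapper below is only a sketch.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

bool mustPrecede(DetachSSA &DSSA, Instruction *A, Instruction *B) {
  DetachAccess *DA = DSSA.getDetachAccess(A);
  DetachAccess *DB = DSSA.getDetachAccess(B);
  return DA && DB && DSSA.dominates(DA, DB);
}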
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + print(dbgs()); + dbgs() << "\n"; +#endif +} + +char DetachSSAPrinterLegacyPass::ID = 0; + +DetachSSAPrinterLegacyPass::DetachSSAPrinterLegacyPass() : FunctionPass(ID) { + initializeDetachSSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry()); +} + +void DetachSSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + AU.addPreserved(); +} + +bool DetachSSAPrinterLegacyPass::runOnFunction(Function &F) { + auto &DSSA = getAnalysis().getDSSA(); + DSSA.print(dbgs()); + if (VerifyDetachSSA) + DSSA.verifyDetachSSA(); + return false; +} + +AnalysisKey DetachSSAAnalysis::Key; + +DetachSSAAnalysis::Result DetachSSAAnalysis::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult(F); + return DetachSSAAnalysis::Result(make_unique(F, &DT)); +} + +PreservedAnalyses DetachSSAPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "DetachSSA for function: " << F.getName() << "\n"; + AM.getResult(F).getDSSA().print(OS); + + return PreservedAnalyses::all(); +} + +PreservedAnalyses DetachSSAVerifierPass::run(Function &F, + FunctionAnalysisManager &AM) { + AM.getResult(F).getDSSA().verifyDetachSSA(); + + return PreservedAnalyses::all(); +} + +char DetachSSAWrapperPass::ID = 0; + +DetachSSAWrapperPass::DetachSSAWrapperPass() : FunctionPass(ID) { + initializeDetachSSAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +void DetachSSAWrapperPass::releaseMemory() { DSSA.reset(); } + +void DetachSSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive(); +} + +bool DetachSSAWrapperPass::runOnFunction(Function &F) { + auto &DT = getAnalysis().getDomTree(); + DSSA.reset(new DetachSSA(F, &DT)); + return false; +} + +void DetachSSAWrapperPass::verifyAnalysis() const { DSSA->verifyDetachSSA(); } + +void DetachSSAWrapperPass::print(raw_ostream &OS, const Module *M) const { + DSSA->print(OS); +} +} // namespace llvm + +void DetachPhi::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} + +void DetachDef::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} + +void DetachUse::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 6a5567ed765bb2..7df5d9a8c03da8 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -137,7 +137,7 @@ class MemoryLocOrCall { IsCall = false; // There is no such thing as a memorylocation for a fence inst, and it is // unique in that regard. 
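A hedged sketch of consuming the analysis from a legacy pass, mirroring how the printer pass above requires the wrapper and calls getDSSA(); the pass name is illustrative and the usual INITIALIZE_PASS registration boilerplate is omitted.

#include "llvm/Analysis/DetachSSA.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct DetachSSAUserPass : public FunctionPass {
  static char ID;
  DetachSSAUserPass() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<DetachSSAWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    DetachSSA &DSSA = getAnalysis<DetachSSAWrapperPass>().getDSSA();
    DSSA.print(errs());
    return false;
  }
};
} // anonymous namespace
char DetachSSAUserPass::ID = 0;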
-    if (!isa<FenceInst>(Inst))
+    if (!isa<FenceInst>(Inst) && !isa<SyncInst>(Inst))
       Loc = MemoryLocation::get(Inst);
   }
 }
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index eab7ec81953609..6b4e0e0207fcf9 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -859,6 +859,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(invoke, Invoke);
   INSTKEYWORD(resume, Resume);
   INSTKEYWORD(unreachable, Unreachable);
+  INSTKEYWORD(detach, Detach);
+  INSTKEYWORD(reattach, Reattach);
+  INSTKEYWORD(sync, Sync);
   INSTKEYWORD(alloca, Alloca);
   INSTKEYWORD(load, Load);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index ee634505581e81..6c4cd4207c61cf 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -5577,6 +5577,9 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
     Inst->setFastMathFlags(FMF);
     return false;
   }
+  case lltok::kw_detach: return ParseDetach(Inst, PFS);
+  case lltok::kw_reattach: return ParseReattach(Inst, PFS);
+  case lltok::kw_sync: return ParseSync(Inst, PFS);
   // Binary Operators.
   case lltok::kw_add:
   case lltok::kw_sub:
@@ -5776,6 +5779,89 @@ bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
   return false;
 }
+
+/// ParseDetach
+///   ::= 'detach' within SyncRegion ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseDetach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc, Loc2;
+  Value *SR;
+  BasicBlock *Op1, *Op2;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after detach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for detach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after detach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op1, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after detached destination") ||
+      ParseTypeAndBasicBlock(Op2, Loc2, PFS))
+    return true;
+
+  Inst = DetachInst::Create(Op1, Op2, SR);
+  return false;
+}
+
+/// ParseReattach
+///   ::= 'reattach' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseReattach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after reattach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for reattach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after reattach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = ReattachInst::Create(Op, SR);
+  return false;
+}
+
+/// ParseSync
+///   ::= 'sync' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseSync(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after sync"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for sync");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after scope in sync"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = SyncInst::Create(Op,
SR); + return false; +} + /// ParseSwitch /// Instruction /// ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']' diff --git a/llvm/lib/AsmParser/LLParser.h b/llvm/lib/AsmParser/LLParser.h index 5a0fc297265d4d..2b53bbea557b4d 100644 --- a/llvm/lib/AsmParser/LLParser.h +++ b/llvm/lib/AsmParser/LLParser.h @@ -571,6 +571,9 @@ namespace llvm { bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseDetach(Instruction *&Inst, PerFunctionState &PFS); + bool ParseReattach(Instruction *&Inst, PerFunctionState &PFS); + bool ParseSync(Instruction *&Inst, PerFunctionState &PFS); bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, unsigned OperandType); diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h index c2e2795a9467be..d21527f347a211 100644 --- a/llvm/lib/AsmParser/LLToken.h +++ b/llvm/lib/AsmParser/LLToken.h @@ -344,6 +344,11 @@ enum Kind { kw_insertvalue, kw_blockaddress, + // Tapir types + kw_detach, + kw_reattach, + kw_sync, + // Metadata types. kw_distinct, diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index fe051e7a91256d..1173e1e8792616 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -4231,6 +4231,59 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = new UnreachableInst(Context); InstructionList.push_back(I); break; + case bitc::FUNC_CODE_INST_DETACH: { // DETACH: [bb#, bb#, val] + if (Record.size() != 3) + return error("Invalid record"); + BasicBlock *Detached = getBasicBlock(Record[0]); + if (!Detached) + return error("Invalid record"); + + BasicBlock *Continue = getBasicBlock(Record[1]); + if (!Continue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 2, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = DetachInst::Create(Detached, Continue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_REATTACH: { // REATTACH: [bb#, val] + if (Record.size() != 2) + return error("Invalid record"); + + BasicBlock *DetachContinue = getBasicBlock(Record[0]); + if (!DetachContinue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 1, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = ReattachInst::Create(DetachContinue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_SYNC: { // Sync: [bb#, val] + if (Record.size() != 1) + return error("Invalid record"); + BasicBlock *Continue = getBasicBlock(Record[0]); + if (!Continue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 1, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = SyncInst::Create(Continue, SyncRegion); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] 
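Putting the parser pieces together, a hedged round-trip example of the textual syntax accepted by ParseDetach, ParseReattach, and ParseSync above. The IR body is an illustrative minimal spawn/sync; the intrinsic name corresponds to Intrinsic::syncregion_start referenced later in the patch.

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
using namespace llvm;

bool roundTripTapir() {
  const char *IR =
      "define void @spawn_one() {\n"
      "entry:\n"
      "  %sr = call token @llvm.syncregion.start()\n"
      "  detach within %sr, label %body, label %cont\n"
      "body:\n"
      "  reattach within %sr, label %cont\n"
      "cont:\n"
      "  sync within %sr, label %exit\n"
      "exit:\n"
      "  ret void\n"
      "}\n"
      "declare token @llvm.syncregion.start()\n";
  LLVMContext Ctx;
  SMDiagnostic Err;
  return parseAssemblyString(IR, Err, Ctx) != nullptr;
}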
if (Record.size() < 1 || ((Record.size()-1)&1)) return error("Invalid record"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index ba4f932e2e6db8..26d032ffe47c1d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2780,6 +2780,31 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_UNREACHABLE; AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; break; + case Instruction::Detach: + { + Code = bitc::FUNC_CODE_INST_DETACH; + const DetachInst &DI = cast(I); + Vals.push_back(VE.getValueID(DI.getSuccessor(0))); + Vals.push_back(VE.getValueID(DI.getSuccessor(1))); + pushValue(DI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Reattach: + { + Code = bitc::FUNC_CODE_INST_REATTACH; + const ReattachInst &RI = cast(I); + Vals.push_back(VE.getValueID(RI.getSuccessor(0))); + pushValue(RI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Sync: + { + Code = bitc::FUNC_CODE_INST_SYNC; + const SyncInst &SI = cast(I); + Vals.push_back(VE.getValueID(SI.getSuccessor(0))); + pushValue(SI.getSyncRegion(), InstID, Vals); + } + break; case Instruction::PHI: { const PHINode &PN = cast(I); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 95f6274aa068be..a451527c5bb472 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -476,6 +476,62 @@ bool IRTranslator::translateIndirectBr(const User &U, return true; } +bool IRTranslator::translateDetach(const User &U, + MachineIRBuilder &MIRBuilder) { + const DetachInst &DetInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Detach like an unconditional branch to the detached successor. + const BasicBlock &DetTgt = *cast(DetInst.getDetached()); + MachineBasicBlock &TgtBB = getMBB(DetTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the detached successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link detached successor. + CurBB.addSuccessor(&getMBB(*cast(DetInst.getDetached()))); + return true; +} + +bool IRTranslator::translateReattach(const User &U, + MachineIRBuilder &MIRBuilder) { + const ReattachInst &ReatInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Reattach like an unconditional branch to its successor. + const BasicBlock &ReatTgt = *cast(ReatInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(ReatTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the reattach successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Reattach instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(ReatInst.getSuccessor(0)))); + return true; +} + +bool IRTranslator::translateSync(const User &U, MachineIRBuilder &MIRBuilder) { + const SyncInst &SInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Sync like an unconditional branch to its successor. + const BasicBlock &STgt = *cast(SInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(STgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the sync successor is the layout successor, fallthrough. 
+ if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Sync instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(SInst.getSuccessor(0)))); + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 542491eabbf29c..a6fef51aa3098d 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -299,6 +299,16 @@ class SSAUpdaterTraits { return NewDef->getOperand(0).getReg(); } + static bool BlockReattaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + + static bool BlockDetaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + /// CreateEmptyPHI - Create a PHI instruction that defines a new register. /// Add it into the specified block and return the register. static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds, @@ -344,6 +354,12 @@ class SSAUpdaterTraits { static unsigned GetPHIValue(MachineInstr *PHI) { return PHI->getOperand(0).getReg(); } + + static void MarkDetachedDef(unsigned Val, MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return; + } + }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index cdc597db640166..43e4fd352c6ddb 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -555,6 +555,28 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr &MI, return false; } +static inline bool hasSetJmpPred( MachineBasicBlock *bl0 ) { + +// llvm::errs() << "\n"; +// bl0->dump(); +// llvm::errs() << "\n"; + + for( auto bl : bl0->predecessors() ) { +// llvm::errs() << " \n"; + auto term = bl->getFirstTerminator(); + while( term != bl->end() ) { + auto mc = (*term).getDesc(); +// if (mc.Opcode != 777) continue; + if (mc.Opcode == 777) { return true; } +// llvm::errs() << " flags:" << mc.Flags << " opc:" << mc.Opcode << "\n"; +// term->dump(); + term++; + } +// llvm::errs() << " \n"; + } + return false; +} + /// Get the sorted sequence of successors for this MachineBasicBlock, possibly /// computing it if it was not already cached. SmallVector & @@ -565,7 +587,7 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (Succs != AllSuccessors.end()) return Succs->second; - SmallVector AllSuccs(MBB->succ_begin(), + SmallPtrSet AllSuccs0(MBB->succ_begin(), MBB->succ_end()); // Handle cases where sinking can happen but where the sink point isn't a @@ -582,7 +604,43 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (DTChild->getIDom()->getBlock() == MI.getParent() && // Skip MBBs already added to the AllSuccs vector above. 
!MBB->isSuccessor(DTChild->getBlock())) - AllSuccs.push_back(DTChild->getBlock()); + AllSuccs0.insert(DTChild->getBlock()); + + ///* + bool unstable = true; + while(unstable) { + unstable = false; + SmallPtrSet toRemove; + for( auto bl0 : AllSuccs0 ) { + //if (hasSetJmpPred(bl0)) assert(bl0->hasAddressTaken()); + if (toRemove.count(bl0) == 0 && (hasSetJmpPred(bl0) || bl0->hasAddressTaken()) ) { + SmallVector Q; + Q.push_back(bl0); + toRemove.insert(bl0); + while( Q.size() > 0 ) { + auto f = Q.back(); + Q.pop_back(); + //llvm::errs() << "saw and removing: " << f->getFullName() << "$BB#" << f->getNumber() << "\n"; + for( auto a : f->successors() ) { + if ( toRemove.count(a) > 0 || AllSuccs0.count(a) == 0 ) continue; + toRemove.insert(a); + Q.push_back(a); + } + } + unstable = true; + } + } + for (auto b : toRemove) { + AllSuccs0.erase(b); + } + } // */ + + //MBB->dump(); + //llvm::errs() << "CHECK CHILDREN FOR " << MBB->getFullName() << "$BB#" << MBB->getNumber() << ": " << "|{"; + //for( auto a : AllSuccs0 ) llvm::errs() << a->getFullName() << "$BB#" << a->getNumber() << ","; + //llvm::errs() << "}\n"; + SmallVector AllSuccs(AllSuccs0.begin(), + AllSuccs0.end()); // Sort Successors according to their loop depth or block frequency info. std::stable_sort( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bfeb3d1bc2b91f..7fa157cc1bac4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2671,6 +2671,66 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } +void SelectionDAGBuilder::visitDetach(const DetachInst &I) { + MachineBasicBlock *DetachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Detached = FuncInfo.MBBMap[I.getSuccessor(0)]; + //MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(1)]; + + // Update machine-CFG edges. + DetachMBB->addSuccessor(Detached); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Detached != NextBlock(DetachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Detached))); + + return; + +} + +void SelectionDAGBuilder::visitReattach(const ReattachInst &I) { + MachineBasicBlock *ReattachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + ReattachMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Continue != NextBlock(ReattachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + +void SelectionDAGBuilder::visitSync(const SyncInst &I) { + MachineBasicBlock *SyncMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + SyncMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. 
+ if (Continue != NextBlock(SyncMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + + void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); @@ -6375,6 +6435,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely // delete it now. return nullptr; + // Tapir intrinsics + // Lower the starting point of a sync region to a no-op. + case Intrinsic::syncregion_start: + return nullptr; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 5f9cdb69daf72d..b0cc4725884aa8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -57,6 +57,7 @@ class ConstantInt; class ConstrainedFPIntrinsic; class DbgValueInst; class DataLayout; +class DetachInst; class DIExpression; class DILocalVariable; class DILocation; @@ -72,11 +73,13 @@ class LLVMContext; class LoadInst; class MachineBasicBlock; class PHINode; +class ReattachInst; class ResumeInst; class ReturnInst; class SDDbgValue; class StoreInst; class SwitchInst; +class SyncInst; class TargetLibraryInfo; class TargetMachine; class Type; @@ -825,6 +828,9 @@ class SelectionDAGBuilder { void visitCatchRet(const CatchReturnInst &I); void visitCatchPad(const CatchPadInst &I); void visitCleanupPad(const CleanupPadInst &CPI); + void visitDetach(const DetachInst& I); + void visitReattach(const ReattachInst& I); + void visitSync(const SyncInst& I); BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index e8619037564245..4edebace9622ea 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1455,6 +1455,9 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case CatchSwitch: return 0; case CleanupPad: return 0; case FNeg: return ISD::FNEG; + case Detach: return 0; + case Reattach: return 0; + case Sync: return 0; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a5dc623e1a30fe..adead5e5dc1d62 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3637,6 +3637,29 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(BI.getSuccessor(0), true); Out << ", "; writeOperand(BI.getSuccessor(1), true); + } else if (isa(I)) { + // Special case detach instruction to get formatting nice and correct + const DetachInst &DI(cast(I)); + Out << " within "; + writeOperand(DI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(DI.getDetached(), true); + Out << ", "; + writeOperand(DI.getContinue(), true); + } else if (isa(I)) { + // Special case reattach instruction to get formatting nice and correct + const ReattachInst &RI(cast(I)); + Out << " within "; + writeOperand(RI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(RI.getSuccessor(0), true); + } else if (isa(I)) { + // Special case sync instruction to get formatting nice and correct + const SyncInst &SI(cast(I)); + Out << " within "; + writeOperand(SI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + 
writeOperand(SI.getSuccessor(0), true); } else if (isa(I)) { const SwitchInst& SI(cast(I)); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 375924360dda83..213c8deedc0bd7 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -443,6 +443,48 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { return New; } +BasicBlock *BasicBlock::splitBasicBlockWithTerminator(const Twine &BBName) { + auto term = getTerminator(); + assert(term && "Can't use splitBasicBlock on degenerate BB!"); + assert(term->getNumSuccessors() == 1 && "Number of successors must be 1"); + + BasicBlock *New = BasicBlock::Create(getContext(), BBName, getParent(), + this->getNextNode()); + + // Save DebugLoc of split point before invalidating iterator. + DebugLoc Loc = term->getDebugLoc(); + // Move all of the specified instructions from the original basic block into + // the new basic block. + auto suc = term->getSuccessor(0); + term->setSuccessor(0, New); + + // Add a branch instruction to the newly formed basic block. + BranchInst *BI = BranchInst::Create(suc, New); + BI->setDebugLoc(Loc); + + // Now we must loop through all of the successors of the New block (which + // _were_ the successors of the 'this' block), and update any PHI nodes in + // successors. If there were PHI nodes in the successors, then they need to + // know that incoming branches will be from New, not from Old. + // + for (succ_iterator I = succ_begin(New), E = succ_end(New); I != E; ++I) { + // Loop over any phi nodes in the basic block, updating the BB field of + // incoming values... + BasicBlock *Successor = *I; + PHINode *PN; + for (BasicBlock::iterator II = Successor->begin(); + (PN = dyn_cast(II)); ++II) { + int IDX = PN->getBasicBlockIndex(this); + while (IDX != -1) { + PN->setIncomingBlock((unsigned)IDX, New); + IDX = PN->getBasicBlockIndex(this); + } + } + } + + return New; +} + void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { Instruction *TI = getTerminator(); if (!TI) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index d861b5288592ca..57d3923622991b 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -302,6 +302,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case CatchRet: return "catchret"; case CatchPad: return "catchpad"; case CatchSwitch: return "catchswitch"; + case Detach: return "detach"; + case Reattach: return "reattach"; + case Sync: return "sync"; // Standard unary operators... 
case FNeg: return "fneg"; @@ -510,6 +513,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::VAArg: case Instruction::Load: case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: case Instruction::CatchPad: @@ -528,6 +532,7 @@ bool Instruction::mayWriteToMemory() const { switch (getOpcode()) { default: return false; case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::Store: case Instruction::VAArg: case Instruction::AtomicCmpXchg: diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 06b46724a87f80..81bb40423e8234 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -973,6 +973,180 @@ UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd) : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, 0, InsertAtEnd) {} +//===----------------------------------------------------------------------===// +// DetachInst Implementation +//===----------------------------------------------------------------------===// + +void DetachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertBefore) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertAtEnd) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +DetachInst::DetachInst(const DetachInst &DI) + : TerminatorInst(Type::getVoidTy(DI.getContext()), Instruction::Detach, + OperandTraits::op_end(this) - + DI.getNumOperands(), + DI.getNumOperands()) { + Op<-1>() = DI.Op<-1>(); + Op<-2>() = DI.Op<-2>(); + Op<-3>() = DI.Op<-3>(); + assert(DI.getNumOperands() == 3 && "Detach must have 3 operands!"); + SubclassOptionalData = DI.SubclassOptionalData; +} + +BasicBlock *DetachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned DetachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void DetachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// ReattachInst Implementation +//===----------------------------------------------------------------------===// + +void ReattachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock 
*InsertAtEnd) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(const ReattachInst &RI) + : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Reattach, + OperandTraits::op_end(this) - + RI.getNumOperands(), + RI.getNumOperands()) { + Op<-1>() = RI.Op<-1>(); + Op<-2>() = RI.Op<-2>(); + assert(RI.getNumOperands() == 2 && "Reattach must have 2 operands!"); + SubclassOptionalData = RI.SubclassOptionalData; +} + +unsigned ReattachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} + +BasicBlock *ReattachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} + +void ReattachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// SyncInst Implementation +//===----------------------------------------------------------------------===// + +void SyncInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +SyncInst::SyncInst(const SyncInst &SI) : + TerminatorInst(Type::getVoidTy(SI.getContext()), Instruction::Sync, + OperandTraits::op_end(this) - SI.getNumOperands(), + SI.getNumOperands()) { + Op<-1>() = SI.Op<-1>(); + Op<-2>() = SI.Op<-2>(); + assert(SI.getNumOperands() == 2 && "Sync must have 2 operands!"); + SubclassOptionalData = SI.SubclassOptionalData; +} + +BasicBlock *SyncInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned SyncInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void SyncInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + //===----------------------------------------------------------------------===// // BranchInst Implementation //===----------------------------------------------------------------------===// @@ -4000,3 +4174,15 @@ UnreachableInst *UnreachableInst::cloneImpl() const { LLVMContext &Context = getContext(); return new UnreachableInst(Context); } + +DetachInst *DetachInst::cloneImpl() const { + return new(getNumOperands()) DetachInst(*this); +} + +ReattachInst *ReattachInst::cloneImpl() const { + return new(getNumOperands()) ReattachInst(*this); +} + +SyncInst *SyncInst::cloneImpl() const { + return new(getNumOperands()) SyncInst(*this); +} diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 0fb079c5ab7395..6c7255f4319e50 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -366,6 +366,13 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef ETypes, return ST; } +StructType *StructType::getOrCreate(LLVMContext &Context, StringRef Name) { + StructType *Ty = Context.pImpl->NamedStructTypes.lookup(Name); + if (!Ty) + Ty = 
StructType::create(Context, Name); + return Ty; +} + void StructType::setBody(ArrayRef Elements, bool isPacked) { assert(isOpaque() && "Struct body already set!"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 30e77b92009f0f..00fdc08b066e2c 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -105,6 +105,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +//#include "llvm/Transforms/Tapir/CilkABI.h" #include #include #include @@ -342,6 +343,12 @@ class Verifier : public InstVisitor, VerifierSupport { BB.printAsOperand(*OS, true, MST); *OS << "\n"; } + // if (const DetachInst* Det = dyn_cast(&I->back())) { + // if (!cilk::verifyDetachedCFG(*Det, DT)) { + // OS << "Invalid end to detached CFG\n"; + // return true; + // } + // } return false; } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5ec94ea6f40ab0..2d935a1074dd88 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/DetachSSA.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVUsers.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 771d2f5b212ae9..c24cf8e33375d1 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -112,6 +112,7 @@ FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) +FUNCTION_ANALYSIS("detachssa", DetachSSAAnalysis()) FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) @@ -202,6 +203,7 @@ FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print", BlockFrequencyPrinterPass(dbgs())) FUNCTION_PASS("print", BranchProbabilityPrinterPass(dbgs())) FUNCTION_PASS("print", DependenceAnalysisPrinterPass(dbgs())) +FUNCTION_PASS("print", DetachSSAPrinterPass(dbgs())) FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) @@ -224,6 +226,7 @@ FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("verify", VerifierPass()) +FUNCTION_PASS("verify", DetachSSAVerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) FUNCTION_PASS("verify", MemorySSAVerifierPass()) diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 74db9e53304da9..c39bc7e368d379 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) +add_subdirectory(Tapir) diff --git a/llvm/lib/Transforms/IPO/LLVMBuild.txt b/llvm/lib/Transforms/IPO/LLVMBuild.txt index 54ce23876e66b4..e0d6b8353fc3a7 100644 --- a/llvm/lib/Transforms/IPO/LLVMBuild.txt +++ b/llvm/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO 
parent = Transforms library_name = ipo -required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TapirOpts TransformUtils Vectorize Instrumentation diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 9764944dc3329e..6f0c86f64fd304 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -41,6 +41,8 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -100,6 +102,10 @@ static cl::opt EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); +static cl::opt EnableLoopFuse( + "enable-loop-fuse", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopFusion Pass")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -161,8 +167,11 @@ static cl::opt cl::desc("Enable control height reduction optimization (CHR)")); PassManagerBuilder::PassManagerBuilder() { + InstrumentCilk = false; OptLevel = 2; SizeLevel = 0; + ParallelLevel = 0; + Rhino = false; LibraryInfo = nullptr; Inliner = nullptr; DisableUnrollLoops = false; @@ -423,6 +432,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createControlHeightReductionLegacyPass()); } +// void PassManagerBuilder::prepopulateModulePassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { if (!PGOSampleUse.empty()) { @@ -442,6 +452,15 @@ void PassManagerBuilder::populateModulePassManager( Inliner = nullptr; } + if (ParallelLevel > 0) { + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + } + // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly // creates a CGSCC pass manager, but we don't want to add extensions into // that pass manager. To prevent this we insert a no-op module pass to reset @@ -498,6 +517,15 @@ void PassManagerBuilder::populateModulePassManager( if (PrepareForThinLTOUsingPGOSampleProfile) DisableUnrollLoops = true; + bool RerunAfterTapirLowering = false; + bool TapirHasBeenLowered = (ParallelLevel == 0); + if (ParallelLevel == 3) // -fdetach + MPM.add(createLowerTapirToCilkPass(false, InstrumentCilk)); + + do { + RerunAfterTapirLowering = + !TapirHasBeenLowered && (ParallelLevel > 0) && !PrepareForThinLTO; + // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); @@ -745,6 +773,45 @@ void PassManagerBuilder::populateModulePassManager( // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. MPM.add(createCFGSimplificationPass()); + if (RerunAfterTapirLowering || (ParallelLevel == 0)) + // Add passes to run just before Tapir lowering. 
+ addExtensionsToPM(EP_TapirLate, MPM); + + if (!TapirHasBeenLowered) { + // First handle Tapir loops. + MPM.add(createIndVarSimplifyPass()); + + // Re-rotate loops in all our loop nests. These may have fallout out of + // rotated form due to GVN or other transformations, and loop spawning + // relies on the rotated form. Disable header duplication at -Oz. + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + MPM.add(createLoopSpawningPass()); + + // The LoopSpawning pass may leave cruft around. Clean it up. + MPM.add(createLoopDeletionPass()); + MPM.add(createCFGSimplificationPass()); + addInstructionCombiningPass(MPM); + addExtensionsToPM(EP_Peephole, MPM); + + // Now lower Tapir to Cilk runtime calls. + // + // TODO: Make this sequence of passes check the library info for the Cilk + // RTS. + + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + MPM.add(createMergeFunctionsPass()); + MPM.add(createBarrierNoopPass()); + + TapirHasBeenLowered = true; + } + } while (RerunAfterTapirLowering); + addExtensionsToPM(EP_OptimizerLast, MPM); if (PrepareForLTO) { @@ -754,6 +821,58 @@ void PassManagerBuilder::populateModulePassManager( } } +// void PassManagerBuilder::populateModulePassManager(legacy::PassManagerBase& MPM) { +// if (ParallelLevel != 0) { +// switch (ParallelLevel) { +// case 1: //fcilkplus +// case 2: //ftapir +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 3: //fdetach +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 0: llvm_unreachable("invalid"); +// } + +// MPM.add(createBarrierNoopPass()); + +// if (OptLevel > 0) { +// MPM.add(createIndVarSimplifyPass()); + +// // Re-rotate loops in all our loop nests. These may have fallout out of +// // rotated form due to GVN or other transformations, and loop spawning +// // relies on the rotated form. Disable header duplication at -Oz. +// MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + +// MPM.add(createLoopSpawningPass()); + +// // The LoopSpawning pass may leave cruft around. Clean it up. +// MPM.add(createLoopDeletionPass()); +// MPM.add(createCFGSimplificationPass()); +// addInstructionCombiningPass(MPM); +// addExtensionsToPM(EP_Peephole, MPM); +// } + +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createUnifyFunctionExitNodesPass()); +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// // The lowering pass may leave cruft around. Clean it up. +// MPM.add(createCFGSimplificationPass()); +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// if (OptLevel != 0) MPM.add(createMergeFunctionsPass()); +// MPM.add(createBarrierNoopPass()); +// } +// prepopulateModulePassManager(MPM); +// if (ParallelLevel == 0) +// addExtensionsToPM(EP_TapirLate, MPM); +// addExtensionsToPM(EP_OptimizerLast, MPM); +// } + void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Load sample profile before running the LTO optimization pipeline. 
if (!PGOSampleUse.empty()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index aeb25d530d71b3..3f15930c467c57 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3908,6 +3908,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Intrinsic::lifetime_end, *this)) return nullptr; break; + case Intrinsic::syncregion_start: { + int NumUsers = 0; + for (User *U : II->users()) + if (isa(U) || isa(U) || isa(U)) + ++NumUsers; + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); // Remove an assume if it is followed by an identical assume. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 76ab614090faa8..c1fe6ff1c54ae3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1539,6 +1539,7 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { if (StoreBB == DestBB || OtherBB == DestBB) return false; + assert(OtherBB); // Verify that the other block ends in a branch and is not otherwise empty. BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast(BBI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fef051aa1b7c35..421d4346c4593c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3085,6 +3085,11 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { + // We can't generally move an instruction that reads from memory past a + // detach or reattach. + if (isa(I->getParent()->getTerminator()) || + isa(I->getParent()->getTerminator())) + return false; for (BasicBlock::iterator Scan = I->getIterator(), E = I->getParent()->end(); Scan != E; ++Scan) @@ -3185,8 +3190,10 @@ bool InstCombiner::run() { // If the user is one of our immediate successors, and if that successor // only has us as a predecessors (we'd have to split the critical edge - // otherwise), we can keep going. - if (UserIsSuccessor && UserParent->getUniquePredecessor()) { + // otherwise), we can keep going. Don't do this if the successor + // follows through a sync instruction, because that's a pessimization. + if (UserIsSuccessor && UserParent->getUniquePredecessor() && + !isa(BB->getTerminator())) { // Okay, the CFG is simple enough, try to sink this instruction. 
if (TryToSinkInstruction(I, UserParent)) { LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index f1558c75cb90bf..dd57e8a31e9587 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1194,6 +1194,11 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) return PreviouslySeenAllocaInfo->getSecond(); + bool FunctionContainsDetach = false; + { + for (const BasicBlock &BB : *(AI.getParent()->getParent())) + FunctionContainsDetach |= isa(BB.getTerminator()); + } bool IsInteresting = (AI.getAllocatedType()->isSized() && // alloca() may be called with 0 size, ignore it. @@ -1201,6 +1206,8 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + (!ClSkipPromotableAllocas || + (!FunctionContainsDetach || !isAllocaParallelPromotable(&AI, *DT))) && // inalloca allocas are not treated as static, and we don't want // dynamic alloca instrumentation for them as well. !AI.isUsedWithInAlloca() && diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 94461849d5094e..0b41031ae18280 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMInstrumentation BoundsChecking.cpp CGProfile.cpp ControlHeightReduction.cpp + CilkSanitizer.cpp DataFlowSanitizer.cpp GCOVProfiling.cpp MemorySanitizer.cpp @@ -15,6 +16,7 @@ add_llvm_library(LLVMInstrumentation ThreadSanitizer.cpp EfficiencySanitizer.cpp HWAddressSanitizer.cpp + ComprehensiveStaticInstrumentation.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp new file mode 100644 index 00000000000000..62b3e0b1ed5710 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp @@ -0,0 +1,1164 @@ +//===- CilkSanitizer.cpp - determinacy race detector for Cilk/Tapir -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of CilkSan, a determinacy race detector for Cilk +// programs. +// +// This instrumentation pass inserts calls to the runtime library before +// appropriate memory accesses. 
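+// In addition to loads and stores, the pass instruments detach, task entry
+// and exit, detach continuation, and sync points so that the runtime can
+// track the parallel structure of the execution.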
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/CSI.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilksan" + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumOmittedReadsFromConstants, + "Number of reads from constant data"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); +STATISTIC(NumInstrumentedDetaches, "Number of instrumented detaches"); +STATISTIC(NumInstrumentedDetachExits, "Number of instrumented detach exits"); +STATISTIC(NumInstrumentedSyncs, "Number of instrumented syncs"); + +static const char *const CsanDetachBaseIdName = "__csan_unit_detach_base_id"; +static const char *const CsanTaskBaseIdName = "__csan_unit_task_base_id"; +static const char *const CsanTaskExitBaseIdName = + "__csan_unit_task_exit_base_id"; +static const char *const CsanDetachContinueBaseIdName = + "__csan_unit_detach_continue_base_id"; +static const char *const CsanSyncBaseIdName = "__csan_unit_sync_base_id"; +static const char *const CsiUnitObjTableName = "__csi_unit_obj_table"; +static const char *const CsiUnitObjTableArrayName = "__csi_unit_obj_tables"; + +/// Maintains a mapping from CSI ID of a load or store to the source information +/// of the object accessed by that load or store. +class ObjectTable : public ForensicTable { +public: + ObjectTable() : ForensicTable() {} + ObjectTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given instruction to this table. + /// \returns The local ID of the Instruction. + uint64_t add(Instruction &I, Value *Addr, const DataLayout &DL); + + /// Get the Type for a pointer to a table entry. + /// + /// A table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this table into the given Module. + /// + /// The table is constructed as a ConstantArray indexed by local IDs. The + /// runtime is responsible for performing the mapping that allows the table to + /// be indexed by global ID. 
+ Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the line and file information to the table. + void add(uint64_t ID, int32_t Line = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +namespace { + +struct CilkSanitizerImpl : public CSIImpl { + // CilkSanitizerImpl(Module &M, CallGraph *CG, + // function_ref GetDSSA, + // function_ref GetMSSA) + // : CSIImpl(M, CG), GetDSSA(GetDSSA), GetMSSA(GetMSSA) { + CilkSanitizerImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + const TargetLibraryInfo *TLI) + : CSIImpl(M, CG), GetDomTree(GetDomTree), TLI(TLI), + CsanFuncEntry(nullptr), CsanFuncExit(nullptr), CsanRead(nullptr), + CsanWrite(nullptr), CsanDetach(nullptr), CsanDetachContinue(nullptr), + CsanTaskEntry(nullptr), CsanTaskExit(nullptr), CsanSync(nullptr) { + // Even though we're doing our own instrumentation, we want the CSI setup + // for the instrumentation of function entry/exit, memory accesses (i.e., + // loads and stores), atomics, memory intrinsics. We also want call sites, + // for extracting debug information. + Options.InstrumentBasicBlocks = false; + // Options.InstrumentCalls = false; + Options.InstrumentMemoryAccesses = false; + Options.InstrumentMemIntrinsics = false; + } + bool run(); + + static StructType *getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *objTableToUnitObjTable(Module &M, + StructType *UnitObjTableType, + ObjectTable &ObjTable); + + // Methods for handling FED tables + void initializeCsanFEDTables(); + void collectUnitFEDTables(); + + // Methods for handling object tables + void initializeCsanObjectTables(); + void collectUnitObjectTables(); + + CallInst *createRTUnitInitCall(IRBuilder<> &IRB) override; + + // Initialize custom hooks for CilkSanitizer + void initializeCsanHooks(); + + // Insert hooks at relevant program points + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I, const DataLayout &DL); + bool instrumentCallsite(Instruction *I, DominatorTree *DT); + bool instrumentDetach(DetachInst *DI, DominatorTree *DT); + bool instrumentSync(SyncInst *SI); + bool instrumentFunction(Function &F); + void chooseInstructionsToInstrument( + SmallVectorImpl &Local, + SmallVectorImpl &All, + const DataLayout &DL); + +private: + // Analysis results + // function_ref GetDSSA; + // function_ref GetMSSA; + function_ref GetDomTree; + const TargetLibraryInfo *TLI; + + // Instrumentation hooks + Function *CsanFuncEntry, *CsanFuncExit; + Function *CsanRead, *CsanWrite; + Function *CsanLargeRead, *CsanLargeWrite; + Function *CsanDetach, *CsanDetachContinue; + Function *CsanTaskEntry, *CsanTaskExit; + Function *CsanSync; + + // CilkSanitizer FED tables + FrontEndDataTable DetachFED, TaskFED, TaskExitFED, DetachContinueFED, + SyncFED; + + // CilkSanitizer custom forensic tables + ObjectTable LoadObj, StoreObj; + + SmallVector UnitObjTables; + +}; + +/// CilkSanitizer: instrument the code in module to find races. 
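+/// Legacy pass-manager wrapper for the CilkSanitizer instrumentation.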
+struct CilkSanitizer : public ModulePass { + static char ID; // Pass identification, replacement for typeid. + CilkSanitizer() : ModulePass(ID) { + initializeCilkSanitizerPass(*PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "CilkSanitizer"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M); +}; +} // namespace + +char CilkSanitizer::ID = 0; + +INITIALIZE_PASS_BEGIN( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_END( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) + +void CilkSanitizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + // AU.addRequired(); + // AU.addRequired(); +} + +ModulePass *llvm::createCilkSanitizerPass() { + return new CilkSanitizer(); +} + +uint64_t ObjectTable::add(Instruction &I, + Value *Addr, + const DataLayout &DL) { + uint64_t ID = getId(&I); + Value *Obj = GetUnderlyingObject(Addr, DL); + + // First, if the underlying object is a global variable, get that variable's + // debug information. + if (GlobalVariable *GV = dyn_cast(Obj)) { + SmallVector DbgGVExprs; + GV->getDebugInfo(DbgGVExprs); + for (auto *GVE : DbgGVExprs) { + auto *DGV = GVE->getVariable(); + if (DGV->getName() != "") { + add(ID, DGV->getLine(), DGV->getFilename(), DGV->getDirectory(), + DGV->getName()); + return ID; + } + } + add(ID); + return ID; + } + + // Next, if this is an alloca instruction, look for a llvm.dbg.declare + // intrinsic. + if (isa(Obj)) { + if (auto *DDI = FindAllocaDbgDeclare(Obj)) { + auto *LV = DDI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + } + + // Otherwise just examine the llvm.dbg.value intrinsics for this object. 
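+  // This path covers objects that are neither globals nor allocas with a
+  // dbg.declare, such as pointers that reach this function as arguments.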
+ SmallVector DbgValues; + findDbgValues(DbgValues, Obj); + for (auto *DVI : DbgValues) { + auto *LV = DVI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + + add(ID); + return ID; +} + +PointerType *ObjectTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *ObjectTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void ObjectTable::add(uint64_t ID, int32_t Line, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Filename, Directory}; +} + +Constant *ObjectTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *TableType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector TableEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_object_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_object_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. 
+ TableEntries.push_back(ConstantStruct::get(TableType, Name, Line, File)); + } + + ArrayType *TableArrayType = ArrayType::get(TableType, TableEntries.size()); + Constant *Table = ConstantArray::get(TableArrayType, TableEntries); + GlobalVariable *GV = + new GlobalVariable(M, TableArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitObjTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +bool CilkSanitizerImpl::run() { + initializeCsi(); + initializeCsanFEDTables(); + initializeCsanObjectTables(); + initializeCsanHooks(); + + for (Function &F : M) { + DEBUG(dbgs() << "Instrumenting " << F.getName() << "\n"); + instrumentFunction(F); + } + + collectUnitFEDTables(); + collectUnitObjectTables(); + finalizeCsi(); + return true; +} + +void CilkSanitizerImpl::initializeCsanFEDTables() { + DetachFED = FrontEndDataTable(M, CsanDetachBaseIdName); + TaskFED = FrontEndDataTable(M, CsanTaskBaseIdName); + TaskExitFED = FrontEndDataTable(M, CsanTaskExitBaseIdName); + DetachContinueFED = FrontEndDataTable(M, CsanDetachContinueBaseIdName); + SyncFED = FrontEndDataTable(M, CsanSyncBaseIdName); +} + +void CilkSanitizerImpl::initializeCsanObjectTables() { + LoadObj = ObjectTable(M, CsiLoadBaseIdName); + StoreObj = ObjectTable(M, CsiStoreBaseIdName); +} + +void CilkSanitizerImpl::collectUnitFEDTables() { + CSIImpl::collectUnitFEDTables(); + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csanrt.c and the + // csan_instrumentation_counts_t in csan.h. + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachContinueFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, SyncFED)); +} + +// Create a struct type to match the unit_obj_entry_t type in csanrt.c. 
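+// Each unit object table pairs the number of entries with a pointer to the
+// entry array itself.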
+StructType *CilkSanitizerImpl::getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + EntryPointerType); +} + +Constant *CilkSanitizerImpl::objTableToUnitObjTable( + Module &M, StructType *UnitObjTableType, ObjectTable &ObjTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), ObjTable.size()); + // Constant *BaseIdPtr = + // ConstantExpr::getPointerCast(FedTable.baseId(), + // Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = ObjTable.insertIntoModule(M); + return ConstantStruct::get(UnitObjTableType, NumEntries, + InsertedTable); +} + +void CilkSanitizerImpl::collectUnitObjectTables() { + LLVMContext &C = M.getContext(); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, LoadObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, StoreObj)); +} + +CallInst *CilkSanitizerImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + PointerType::get(UnitObjTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *FEDTable = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *FEDGV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, FEDTable, + CsiUnitFedTableArrayName); + + ArrayType *UnitObjTableArrayType = + ArrayType::get(UnitObjTableType, UnitObjTables.size()); + Constant *ObjTable = ConstantArray::get(UnitObjTableArrayType, UnitObjTables); + GlobalVariable *ObjGV = new GlobalVariable(M, UnitObjTableArrayType, false, + GlobalValue::InternalLinkage, ObjTable, + CsiUnitObjTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(FEDGV->getValueType(), FEDGV, GepArgs), + ConstantExpr::getGetElementPtr(ObjGV->getValueType(), ObjGV, GepArgs), + InitCallsiteToFunction}); +} + +void CilkSanitizerImpl::initializeCsanHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + Type *LargeNumBytesType = IntptrTy; + Type *IDType = IRB.getInt64Ty(); + + CsanFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_func_entry", RetType, + /* func_id */ IDType, + /* stack_ptr */ AddrType, + FuncPropertyTy)); + CsanFuncExit = checkCsiInterfaceFunction( + 
M.getOrInsertFunction("__csan_func_exit", RetType, + /* func_exit_id */ IDType, + /* func_id */ IDType, + FuncExitPropertyTy)); + + CsanRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_load", RetType, IDType, + AddrType, NumBytesType, LoadPropertyTy)); + CsanWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_store", RetType, IDType, + AddrType, NumBytesType, StorePropertyTy)); + CsanLargeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_load", RetType, IDType, + AddrType, LargeNumBytesType, LoadPropertyTy)); + CsanLargeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_store", RetType, IDType, + AddrType, LargeNumBytesType, StorePropertyTy)); + // CsanWrite = checkCsiInterfaceFunction( + // M.getOrInsertFunction("__csan_atomic_exchange", RetType, IDType, + // AddrType, NumBytesType, StorePropertyTy)); + + CsanDetach = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach", RetType, + /* detach_id */ IDType)); + CsanTaskEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task", RetType, + /* task_id */ IDType, + /* detach_id */ IDType, + /* stack_ptr */ AddrType)); + CsanTaskExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task_exit", RetType, + /* task_exit_id */ IDType, + /* task_id */ IDType, + /* detach_id */ IDType)); + CsanDetachContinue = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach_continue", RetType, + /* detach_continue_id */ IDType, + /* detach_id */ IDType)); + CsanSync = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_sync", RetType, IDType)); +} + +// Do not instrument known races/"benign races" that come from compiler +// instrumentatin. The user has no way of suppressing them. +static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) { + // Peel off GEPs and BitCasts. + Addr = Addr->stripInBoundsOffsets(); + + if (GlobalVariable *GV = dyn_cast(Addr)) { + if (GV->hasSection()) { + StringRef SectionName = GV->getSection(); + // Check if the global is in the PGO counters section. + auto OF = Triple(M->getTargetTriple()).getObjectFormat(); + if (SectionName.endswith( + getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false))) + return false; + } + + // Check if the global is private gcov data. + if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) + return false; + } + + // Do not instrument acesses from different address spaces; we cannot deal + // with them. + if (Addr) { + Type *PtrTy = cast(Addr->getType()->getScalarType()); + if (PtrTy->getPointerAddressSpace() != 0) + return false; + } + + return true; +} + +// Examine the uses of a given AllocaInst to determine if some use is detached. 
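+// The address of such an alloca can be visible to a spawned task, so accesses
+// to it may participate in a race and must not be dropped from instrumentation.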
+static bool MightHaveDetachedUse(const AllocaInst *AI) { + const BasicBlock *AllocaCtx = GetDetachedCtx(AI->getParent()); + SmallVector Worklist; + SmallSet Visited; + + for (const Use &U : AI->uses()) { + Visited.insert(&U); + Worklist.push_back(&U); + } + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + Instruction *I = cast(U->getUser()); + if (AllocaCtx != GetDetachedCtx(I->getParent())) + return true; + + switch (I->getOpcode()) { + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + for (Use &UU : I->uses()) + if (Visited.insert(&UU).second) + Worklist.push_back(&UU); + break; + default: + break; + } + } + return false; +} + +void CilkSanitizerImpl::chooseInstructionsToInstrument( + SmallVectorImpl &Local, SmallVectorImpl &All, + const DataLayout &DL) { + SmallSet WriteTargets; + // Iterate from the end. + for (Instruction *I : reverse(Local)) { + if (StoreInst *Store = dyn_cast(I)) { + Value *Addr = Store->getPointerOperand(); + if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr)) + continue; + WriteTargets.insert(Addr); + } else { + LoadInst *Load = cast(I); + Value *Addr = Load->getPointerOperand(); + if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr)) + continue; + if (WriteTargets.count(Addr)) { + // We will write to this temp, so no reason to analyze the read. + NumOmittedReadsBeforeWrite++; + continue; + } + if (addrPointsToConstantData(Addr)) { + // Addr points to some constant data -- it can not race with any writes. + NumOmittedReadsFromConstants++; + continue; + } + } + Value *Addr = isa(*I) + ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + continue; + } + All.push_back(I); + } + Local.clear(); +} + +bool CilkSanitizerImpl::instrumentFunction(Function &F) { + if (F.empty() || shouldNotInstrumentFunction(F)) + return false; + + DominatorTree *DT = &GetDomTree(F); + // DetachSSA &DSSA = GetDSSA(F); + // MemorySSA &MSSA = GetMSSA(F); + + SmallVector AllLoadsAndStores; + SmallVector LocalLoadsAndStores; + SmallVector AtomicAccesses; + SmallVector MemIntrinCalls; + SmallVector Callsites; + SmallVector Detaches; + SmallVector Syncs; + bool Res = false; + bool HasCalls = false; + bool MaySpawn = false; + + // TODO: Consider modifying this to choose instrumentation to insert based on + // fibrils, not basic blocks. 
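+  // Single pass over the function: record the detaches and syncs, and collect
+  // the memory accesses that are worth instrumenting.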
+ for (BasicBlock &BB : F) { + // Record the Tapir instructions found + if (DetachInst *DI = dyn_cast(BB.getTerminator())) { + MaySpawn = true; + Detaches.push_back(DI); + } else if (SyncInst *SI = dyn_cast(BB.getTerminator())) + Syncs.push_back(SI); + + // Record the memory accesses in the basic block + for (Instruction &Inst : BB) { + if (isa(Inst) || isa(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) { + if (CallInst *CI = dyn_cast(&Inst)) + maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI); + if (isa(Inst)) + MemIntrinCalls.push_back(&Inst); + if (!isa(Inst)) { + if (!isa(Inst)) + Callsites.push_back(&Inst); + HasCalls = true; + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + DL); + } + } + } + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL); + } + + uint64_t LocalId = getLocalFunctionID(F); + + for (auto Inst : AllLoadsAndStores) + Res |= instrumentLoadOrStore(Inst, DL); + + for (auto Inst : AtomicAccesses) + Res |= instrumentAtomic(Inst, DL); + + for (auto Inst : MemIntrinCalls) + Res |= instrumentMemIntrinsic(Inst, DL); + + for (auto Inst : Callsites) + Res |= instrumentCallsite(Inst, DT); + + for (auto Inst : Detaches) + Res |= instrumentDetach(Inst, DT); + + for (auto Inst : Syncs) + Res |= instrumentSync(Inst); + + if ((Res || HasCalls)) { + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + CsiFuncProperty FuncEntryProp; + FuncEntryProp.setMaySpawn(MaySpawn); + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + // TODO: Determine if we actually want the frame pointer, not the stack + // pointer. + // Value *StackSave = IRB.CreateCall( + // Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + // IRB.CreateCall(CsanFuncEntry, {FuncId, StackSave, FuncEntryProp.getValue(IRB)}); + Value *FrameAddr = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + {IRB.getInt32(0)}); + IRB.CreateCall(CsanFuncEntry, {FuncId, FrameAddr, FuncEntryProp.getValue(IRB)}); + + EscapeEnumerator EE(F, "csan_cleanup", true); + while (IRBuilder<> *AtExit = EE.Next()) { + // uint64_t ExitLocalId = FunctionExitFED.add(F); + uint64_t ExitLocalId = FunctionExitFED.add(*AtExit->GetInsertPoint()); + Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, *AtExit); + CsiFuncExitProperty FuncExitProp; + FuncExitProp.setMaySpawn(MaySpawn); + AtExit->CreateCall(CsanFuncExit, + {ExitCsiId, FuncId, FuncExitProp.getValue(*AtExit)}); + } + } + return Res; +} + +bool CilkSanitizerImpl::instrumentLoadOrStore(Instruction *I, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa(*I); + Value *Addr = IsWrite + ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + + // swifterror memory addresses are mem2reg promoted by instruction selection. + // As such they cannot have regular uses like an instrumentation function and + // it makes no sense to track them as memory. + if (Addr->isSwiftError()) + return false; + + int NumBytesAccessed = getNumBytesAccessed(Addr, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + const unsigned Alignment = IsWrite + ? 
cast(I)->getAlignment() + : cast(I)->getAlignment(); + CsiLoadStoreProperty Prop; + Prop.setAlignment(Alignment); + if (IsWrite) { + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + } else { + uint64_t LocalId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, Addr, DL); + assert(LocalId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanRead, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedReads++; + } + return true; +} + +bool CilkSanitizerImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) { + IRBuilder<> IRB(I); + CsiLoadStoreProperty Prop; + Value *Addr; + if (AtomicRMWInst *RMWI = dyn_cast(I)) { + Addr = RMWI->getPointerOperand(); + } else if (AtomicCmpXchgInst *CASI = dyn_cast(I)) { + Addr = CASI->getPointerOperand(); + } else { + return false; + } + + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + return false; + } + + int NumBytesAccessed = getNumBytesAccessed(Addr, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + return true; +} + +bool CilkSanitizerImpl::instrumentMemIntrinsic(Instruction *I, + const DataLayout &DL) { + CsiLoadStoreProperty Prop; + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + // Check if we need to instrument the memset. + Value *Addr = M->getArgOperand(0); + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). 
+ NumOmittedNonCaptured++; + return false; + } + + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + return true; + + } else if (MemTransferInst *M = dyn_cast(I)) { + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + Value *StoreAddr = M->getArgOperand(0); + Value *LoadAddr = M->getArgOperand(1); + bool Instrumented = false; + + // First check if we need to instrument the store. + Value *SObj = GetUnderlyingObject(StoreAddr, DL); + if (isa(SObj) && + !PointerMayBeCaptured(StoreAddr, true, true) && + !MightHaveDetachedUse(cast(SObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + } else { + // Instrument the store + uint64_t StoreId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, StoreAddr, DL); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *StoreCsiId = StoreFED.localToGlobalId(StoreId, IRB); + Value *StoreArgs[] = {StoreCsiId, + IRB.CreatePointerCast(StoreAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *WriteCall = IRB.CreateCall(CsanLargeWrite, StoreArgs); + IRB.SetInstDebugLocation(WriteCall); + Instrumented = true; + } + Value *LObj = GetUnderlyingObject(LoadAddr, DL); + if (isa(LObj) && + !PointerMayBeCaptured(LoadAddr, true, true) && + !MightHaveDetachedUse(cast(LObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + } else { + // Instrument the load + uint64_t LoadId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, LoadAddr, DL); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *LoadCsiId = StoreFED.localToGlobalId(LoadId, IRB); + Value *LoadArgs[] = {LoadCsiId, + IRB.CreatePointerCast(LoadAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *ReadCall = IRB.CreateCall(CsanLargeRead, LoadArgs); + IRB.SetInstDebugLocation(ReadCall); + Instrumented = true; + } + return Instrumented; + } + return false; +} + +bool CilkSanitizerImpl::instrumentCallsite(Instruction *I, DominatorTree *DT) { + // Exclude calls to the syncregion.start intrinsic. 
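+  // Lifetime markers are skipped for the same reason: neither they nor
+  // syncregion.start perform any work at run time, so there is no call to
+  // instrument.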
+ if (IntrinsicInst *II = dyn_cast(I)) + if (Intrinsic::syncregion_start == II->getIntrinsicID() || + Intrinsic::lifetime_start == II->getIntrinsicID() || + Intrinsic::lifetime_end == II->getIntrinsicID()) + return false; + + bool IsInvoke = isa(I); + + Function *Called = NULL; + if (CallInst *CI = dyn_cast(I)) + Called = CI->getCalledFunction(); + else if (InvokeInst *II = dyn_cast(I)) + Called = II->getCalledFunction(); + + IRBuilder<> IRB(I); + uint64_t LocalId = CallsiteFED.add(*I); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = NULL; + GlobalVariable *FuncIdGV = NULL; + if (Called) { + Module *M = I->getParent()->getParent()->getParent(); + std::string GVName = + CsiFuncIdVariablePrefix + Called->getName().str(); + FuncIdGV = dyn_cast(M->getOrInsertGlobal(GVName, + IRB.getInt64Ty())); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + FuncId = IRB.CreateLoad(FuncIdGV); + } else { + // Unknown targets (i.e. indirect calls) are always unknown. + FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId); + } + assert(FuncId != NULL); + CsiCallProperty Prop; + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, CsiBeforeCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block + // and the exception block. This also means we have to recompute + // the callsite and function IDs in each basic block so that we + // can use it for the after hook. + + // TODO: Do we want the "after" hook for this callsite to come + // before or after the BB entry hook? Currently it is inserted + // before BB entry because instrumentCallsite is called after + // instrumentBasicBlock. + + // TODO: If a destination of an invoke has multiple predecessors, then we + // must split that destination. + InvokeInst *II = dyn_cast(I); + BasicBlock *NormalBB = II->getNormalDest(); + unsigned SuccNum = GetSuccessorNumber(II->getParent(), NormalBB); + if (isCriticalEdge(II, SuccNum)) + NormalBB = SplitCriticalEdge(II, SuccNum, + CriticalEdgeSplittingOptions(DT)); + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock *UnwindBB = II->getUnwindDest(); + IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } else { + // Simple call instruction; there is only one "after" position. 
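//===--- Sketch (not part of the patch): why invokes need two after-hooks -===//
// A call inside a try block lowers to an invoke with a normal and an unwind
// successor, and control may resume along either edge, so the after-call hook
// emitted above has to appear in both destinations (splitting a critical edge
// on the normal path when necessary). may_throw() is a stand-in callee.
void may_throw();
void caller() {
  try {
    may_throw();   // invoke: after-hook in the normal destination...
  } catch (...) {  // ...and again on the unwind path before handler code runs
  }
}
//===--------------------------------------------------------------------===//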
+ Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*Iter, CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } + + return true; +} + +bool CilkSanitizerImpl::instrumentDetach(DetachInst *DI, + DominatorTree *DT) { + // Instrument the detach instruction itself + Value *DetachID; + { + IRBuilder<> IRB(DI); + uint64_t LocalID = DetachFED.add(*DI); + DetachID = DetachFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanDetach, {DetachID}); + IRB.SetInstDebugLocation(Call); + } + NumInstrumentedDetaches++; + + // Find the detached block, continuation, and associated reattaches. + BasicBlock *DetachedBlock = DI->getDetached(); + BasicBlock *ContinueBlock = DI->getContinue(); + SmallVector TaskExits; + // TODO: Extend this loop to find EH exits of the detached task. + for (BasicBlock *Pred : predecessors(ContinueBlock)) + if (isa(Pred->getTerminator())) + TaskExits.push_back(Pred); + + // Instrument the entry and exit points of the detached task. + { + // Instrument the entry point of the detached task. + IRBuilder<> IRB(&*DetachedBlock->getFirstInsertionPt()); + uint64_t LocalID = TaskFED.add(*DetachedBlock); + Value *TaskID = TaskFED.localToGlobalId(LocalID, IRB); + // TODO: Determine if we actually want the frame pointer, not the stack + // pointer. + // Value *StackSave = IRB.CreateCall( + // Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + // Instruction *Call = IRB.CreateCall(CsanTaskEntry, + // {TaskID, DetachID, StackSave}); + Value *FrameAddr = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + {IRB.getInt32(0)}); + Instruction *Call = IRB.CreateCall(CsanTaskEntry, + {TaskID, DetachID, FrameAddr}); + IRB.SetInstDebugLocation(Call); + + // Instrument the exit points of the detached tasks. + for (BasicBlock *TaskExit : TaskExits) { + IRBuilder<> IRB(TaskExit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*TaskExit->getTerminator()); + Value *TaskExitID = TaskExitFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanTaskExit, + {TaskExitID, TaskID, DetachID}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedDetachExits++; + } + } + + // Instrument the continuation of the detach. + { + if (isCriticalContinueEdge(DI, 1)) + ContinueBlock = SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(DT).setSplitDetachContinue()); + + IRBuilder<> IRB(&*ContinueBlock->getFirstInsertionPt()); + uint64_t LocalID = DetachContinueFED.add(*ContinueBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanDetachContinue, + {ContinueID, DetachID}); + IRB.SetInstDebugLocation(Call); + } + return true; +} + +bool CilkSanitizerImpl::instrumentSync(SyncInst *SI) { + IRBuilder<> IRB(SI); + // Get the ID of this sync. + uint64_t LocalID = SyncFED.add(*SI); + Value *SyncID = SyncFED.localToGlobalId(LocalID, IRB); + // Insert instrumentation before the sync. 
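//===--- Sketch (not part of the patch): hook placement for one detach ----===//
// Summary of what instrumentDetach() above emits, using the hook handles named
// in this file (the concrete runtime symbol names are defined elsewhere):
//
//   spawner:       CsanDetach(detach_id)                  just before the detach
//   detached task: CsanTaskEntry(task_id, detach_id, frame_address)
//                  ... task body ...
//                  CsanTaskExit(exit_id, task_id, detach_id) before each reattach
//   continuation:  CsanDetachContinue(continue_id, detach_id) at the continue
//                  block, after splitting a critical continue edge if needed
//===--------------------------------------------------------------------===//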
+ Instruction *Call = IRB.CreateCall(CsanSync, {SyncID}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedSyncs++; + return true; +} + +bool CilkSanitizer::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + // auto GetDSSA = [this](Function &F) -> DetachSSA & { + // return this->getAnalysis(F).getDSSA(); + // }; + // auto GetMSSA = [this](Function &F) -> MemorySSA & { + // return this->getAnalysis(F).getMSSA(); + // }; + + CallGraph *CG = &getAnalysis().getCallGraph(); + const TargetLibraryInfo *TLI = + &getAnalysis().getTLI(); + auto GetDomTree = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + + // return CilkSanitizerImpl(M, CG, GetDSSA, GetMSSA).run(); + return CilkSanitizerImpl(M, CG, GetDomTree, TLI).run(); +} diff --git a/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp new file mode 100644 index 00000000000000..1446eb4b8e7dd3 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp @@ -0,0 +1,982 @@ +//===-- ComprehensiveStaticInstrumentation.cpp - instrumentation hooks ----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: License +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/CSI.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "csi" + +static cl::opt ClInstrumentFuncEntryExit( + "csi-instrument-func-entry-exit", cl::init(true), + cl::desc("Instrument function entry and exit"), cl::Hidden); +static cl::opt ClInstrumentBasicBlocks( + "csi-instrument-basic-blocks", cl::init(true), + cl::desc("Instrument basic blocks"), cl::Hidden); +static cl::opt ClInstrumentMemoryAccesses( + "csi-instrument-memory-accesses", cl::init(true), + cl::desc("Instrument memory accesses"), cl::Hidden); +static cl::opt ClInstrumentCalls( + "csi-instrument-function-calls", cl::init(true), + cl::desc("Instrument function calls"), cl::Hidden); +static cl::opt ClInstrumentAtomics( + "csi-instrument-atomics", cl::init(true), + cl::desc("Instrument atomics"), cl::Hidden); +static cl::opt ClInstrumentMemIntrinsics( + "csi-instrument-memintrinsics", cl::init(true), + cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden); + +namespace { + +static CSIOptions OverrideFromCL(CSIOptions Options) { + Options.InstrumentFuncEntryExit |= ClInstrumentFuncEntryExit; + Options.InstrumentBasicBlocks |= ClInstrumentBasicBlocks; + Options.InstrumentMemoryAccesses |= ClInstrumentMemoryAccesses; + Options.InstrumentCalls |= ClInstrumentCalls; + Options.InstrumentAtomics |= ClInstrumentAtomics; + Options.InstrumentMemIntrinsics |= ClInstrumentMemIntrinsics; + return Options; +} + +/// The Comprehensive Static Instrumentation pass. 
+/// Inserts calls to user-defined hooks at predefined points in the IR. +struct ComprehensiveStaticInstrumentation : public ModulePass { + static char ID; // Pass identification, replacement for typeid. + + ComprehensiveStaticInstrumentation( + const CSIOptions &Options = CSIOptions()) + : ModulePass(ID), Options(OverrideFromCL(Options)) { + initializeComprehensiveStaticInstrumentationPass( + *PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "ComprehensiveStaticInstrumentation"; + } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + CSIOptions Options; +}; // struct ComprehensiveStaticInstrumentation +} // anonymous namespace + +char ComprehensiveStaticInstrumentation::ID = 0; + +INITIALIZE_PASS(ComprehensiveStaticInstrumentation, "csi", + "ComprehensiveStaticInstrumentation pass", false, false) + +ModulePass *llvm::createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options) { + return new ComprehensiveStaticInstrumentation(Options); +} + +/// Return the first DILocation in the given basic block, or nullptr +/// if none exists. +static const DILocation *getFirstDebugLoc(const BasicBlock &BB) { + for (const Instruction &Inst : BB) + if (const DILocation *Loc = Inst.getDebugLoc()) + return Loc; + + return nullptr; +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Instruction *Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented->getFunction()->getSubprogram(); + if (Subprog) { + if (Instrumented->getDebugLoc()) { + Call->setDebugLoc(Instrumented->getDebugLoc()); + } else { + LLVMContext &C = Instrumented->getFunction()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(BasicBlock &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getParent()->getSubprogram(); + if (Subprog) { + if (const DILocation *FirstDebugLoc = getFirstDebugLoc(Instrumented)) + Call->setDebugLoc(FirstDebugLoc); + else { + LLVMContext &C = Instrumented.getParent()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Function &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getSubprogram(); + if (Subprog) { + LLVMContext &C = Instrumented.getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } +} + +bool CSIImpl::run() { + initializeCsi(); + + for (Function &F : M) + instrumentFunction(F); + + collectUnitFEDTables(); + finalizeCsi(); + return true; // We always insert the unit constructor. 
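//===--- Sketch (not part of the patch): scheduling CSI from a driver -----===//
// One way a driver might add the pass; the CSIOptions field names are the ones
// consumed by OverrideFromCL() above. Note that OverrideFromCL() ORs the
// cl::opt values (which default to true) into the caller's request, so a
// category disabled in CSIOptions stays disabled only if the matching
// -csi-instrument-* flag is also passed as false.
void addCSIPass(legacy::PassManagerBase &PM) {
  CSIOptions Options;                 // every hook category defaults to enabled
  Options.InstrumentBasicBlocks = false;  // honored only with the flag set false
  PM.add(createComprehensiveStaticInstrumentationPass(Options));
}
//===--------------------------------------------------------------------===//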
+} + +ForensicTable::ForensicTable(Module &M, StringRef BaseIdName) { + LLVMContext &C = M.getContext(); + IntegerType *Int64Ty = IntegerType::get(C, 64); + IdCounter = 0; + BaseId = new GlobalVariable(M, Int64Ty, false, GlobalValue::InternalLinkage, + ConstantInt::get(Int64Ty, 0), BaseIdName); + assert(BaseId); +} + +uint64_t ForensicTable::getId(const Value *V) { + if (!ValueToLocalIdMap.count(V)) + ValueToLocalIdMap[V] = IdCounter++; + assert(ValueToLocalIdMap.count(V) && "Value not in ID map."); + return ValueToLocalIdMap[V]; +} + +Value *ForensicTable::localToGlobalId(uint64_t LocalId, + IRBuilder<> &IRB) const { + assert(BaseId); + LLVMContext &C = IRB.getContext(); + LoadInst *Base = IRB.CreateLoad(BaseId); + MDNode *MD = llvm::MDNode::get(C, None); + Base->setMetadata(LLVMContext::MD_invariant_load, MD); + Value *Offset = IRB.getInt64(LocalId); + return IRB.CreateAdd(Base, Offset); +} + +uint64_t FrontEndDataTable::add(const Function &F) { + uint64_t ID = getId(&F); + add(ID, F.getSubprogram()); + return ID; +} + +uint64_t FrontEndDataTable::add(const BasicBlock &BB) { + uint64_t ID = getId(&BB); + add(ID, getFirstDebugLoc(BB)); + return ID; +} + +uint64_t FrontEndDataTable::add(const Instruction &I) { + uint64_t ID = getId(&I); + add(ID, I.getDebugLoc()); + return ID; +} + +PointerType *FrontEndDataTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *FrontEndDataTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* Column */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void FrontEndDataTable::add(uint64_t ID, const DILocation *Loc) { + if (Loc) { + // TODO: Add location information for inlining + const DISubprogram *Subprog = Loc->getScope()->getSubprogram(); + add(ID, (int32_t)Loc->getLine(), (int32_t)Loc->getColumn(), + Loc->getFilename(), Loc->getDirectory(), Subprog->getName()); + } else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, const DISubprogram *Subprog) { + if (Subprog) + add(ID, (int32_t)Subprog->getLine(), -1, Subprog->getFilename(), + Subprog->getDirectory(), Subprog->getName()); + else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, int32_t Line, int32_t Column, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Column, Filename, Directory}; +} + +Constant *FrontEndDataTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *FedType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector FEDEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *Column = ConstantInt::get(Int32Ty, E.Column); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new 
GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_function_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_function_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. + FEDEntries.push_back(ConstantStruct::get(FedType, Name, Line, Column, + File)); + } + + ArrayType *FedArrayType = ArrayType::get(FedType, FEDEntries.size()); + Constant *Table = ConstantArray::get(FedArrayType, FEDEntries); + GlobalVariable *GV = + new GlobalVariable(M, FedArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitFedTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +void CSIImpl::initializeFuncHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + CsiFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_entry", IRB.getVoidTy(), + IRB.getInt64Ty(), FuncPropertyTy)); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + CsiFuncExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_exit", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), + FuncExitPropertyTy)); +} + +void CSIImpl::initializeBasicBlockHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiBBProperty::getType(C); + CsiBBEntry = checkCsiInterfaceFunction(M.getOrInsertFunction( + "__csi_bb_entry", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); + CsiBBExit = checkCsiInterfaceFunction(M.getOrInsertFunction( + "__csi_bb_exit", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeCallsiteHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiCallProperty::getType(C); + CsiBeforeCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); + CsiAfterCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeLoadStoreHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + + CsiBeforeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + CsiAfterRead = checkCsiInterfaceFunction( + 
M.getOrInsertFunction("__csi_after_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + + CsiBeforeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); + CsiAfterWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); +} + +void CSIImpl::initializeMemIntrinsicsHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + + MemmoveFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemcpyFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemsetFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy)); +} + +int CSIImpl::getNumBytesAccessed(Value *Addr, const DataLayout &DL) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + if (TypeSize % 8 != 0) { + // if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && + // TypeSize != 128 && TypeSize != 256 && TypeSize != 512) { + return -1; + } + return TypeSize / 8; +} + +void CSIImpl::addLoadStoreInstrumentation( + Instruction *I, Function *BeforeFn, Function *AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, CsiLoadStoreProperty &Prop) { + IRBuilder<> IRB(I); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, BeforeFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); + + BasicBlock::iterator Iter(I); + Iter++; + IRB.SetInsertPoint(&*Iter); + insertConditionalHookCall(&*Iter, AfterFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); +} + +void CSIImpl::instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa(I); + Value *Addr = IsWrite ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + int NumBytes = getNumBytesAccessed(Addr, DL); + Type *AddrType = IRB.getInt8PtrTy(); + + if (NumBytes == -1) + return; // size that we don't recognize + + if (IsWrite) { + uint64_t LocalId = StoreFED.add(*I); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeWrite, CsiAfterWrite, CsiId, + AddrType, Addr, NumBytes, Prop); + } else { // is read + uint64_t LocalId = LoadFED.add(*I); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeRead, CsiAfterRead, CsiId, AddrType, + Addr, NumBytes, Prop); + } +} + +void CSIImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) { + // For now, print a message that this code contains atomics. + dbgs() << "WARNING: Uninstrumented atomic operations in program-under-test!\n"; +} + +// If a memset intrinsic gets inlined by the code gen, we will miss races on it. +// So, we either need to ensure the intrinsic is not inlined, or instrument it. +// We do not instrument memset/memmove/memcpy intrinsics (too complicated), +// instead we simply replace them with regular function calls, which are then +// intercepted by the run-time. 
+// Since our pass runs after everyone else, the calls should not be +// replaced back with intrinsics. If that becomes wrong at some point, +// we will need to call e.g. __csi_memset to avoid the intrinsics. +bool CSIImpl::instrumentMemIntrinsic(Instruction *I) { + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + MemsetFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } else if (MemTransferInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + isa(M) ? MemcpyFn : MemmoveFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } + return false; +} + +void CSIImpl::instrumentBasicBlock(BasicBlock &BB) { + IRBuilder<> IRB(&*BB.getFirstInsertionPt()); + //LLVMContext &C = IRB.getContext(); + uint64_t LocalId = BasicBlockFED.add(BB); + Value *CsiId = BasicBlockFED.localToGlobalId(LocalId, IRB); + CsiBBProperty Prop; + TerminatorInst *TI = BB.getTerminator(); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiBBEntry, + {CsiId, PropVal}); + insertConditionalHookCall(TI, CsiBBExit, + {CsiId, PropVal}); +} + +void CSIImpl::instrumentCallsite(Instruction *I) { + // Ignore calls to debug intrinsics + if (isa(I)) + return; + + bool IsInvoke = false; + Function *Called = NULL; + if (CallInst *CI = dyn_cast(I)) { + Called = CI->getCalledFunction(); + } else if (InvokeInst *II = dyn_cast(I)) { + Called = II->getCalledFunction(); + IsInvoke = true; + } + + // if (Called && Called->getName().startswith("llvm.dbg")) { + // return; + // } + + IRBuilder<> IRB(I); + uint64_t LocalId = CallsiteFED.add(*I); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = NULL; + GlobalVariable *FuncIdGV = NULL; + if (Called) { + Module *M = I->getParent()->getParent()->getParent(); + std::string GVName = + CsiFuncIdVariablePrefix + Called->getName().str(); + FuncIdGV = dyn_cast(M->getOrInsertGlobal(GVName, + IRB.getInt64Ty())); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + FuncId = IRB.CreateLoad(FuncIdGV); + } else { + // Unknown targets (i.e. indirect calls) are always unknown. + FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId); + } + assert(FuncId != NULL); + CsiCallProperty Prop; + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, CsiBeforeCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block + // and the exception block. This also means we have to recompute + // the callsite and function IDs in each basic block so that we + // can use it for the after hook. + + // TODO: Do we want the "after" hook for this callsite to come + // before or after the BB entry hook? Currently it is inserted + // before BB entry because instrumentCallsite is called after + // instrumentBasicBlock. 
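//===--- Sketch (not part of the patch): effect of instrumentMemIntrinsic -===//
// CSIImpl::instrumentMemIntrinsic() above rewrites the intrinsic form into an
// ordinary libc call so a tool runtime can interpose on it, roughly:
//   before:  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 %n, i1 false)
//   after:   call i8* @memset(i8* %p, i32 0, i64 %n)
// (value operand widened to i32, length cast to the target's intptr type).
//===--------------------------------------------------------------------===//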
+ InvokeInst *II = dyn_cast(I); + BasicBlock *NormalBB = II->getNormalDest(); + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock *UnwindBB = II->getUnwindDest(); + IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*Iter, CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } +} + +void CSIImpl::insertConditionalHookCall(Instruction *I, Function *HookFunction, + ArrayRef HookArgs) { + IRBuilder<> IRB(I); + // Value *Cond = IRB.CreateICmpEQ(IRB.CreateLoad(DisableInstrGV), IRB.getInt1(false)); + // TerminatorInst *TI = SplitBlockAndInsertIfThen(Cond, I, false); + // IRB.SetInsertPoint(TI); + // IRB.CreateStore(IRB.getInt1(true), DisableInstrGV); + Instruction *Call = IRB.CreateCall(HookFunction, HookArgs); + setInstrumentationDebugLoc(I, Call); + // IRB.CreateStore(IRB.getInt1(false), DisableInstrGV); +} + + +void CSIImpl::initializeFEDTables() { + FunctionFED = FrontEndDataTable(M, CsiFunctionBaseIdName); + FunctionExitFED = FrontEndDataTable(M, CsiFunctionExitBaseIdName); + BasicBlockFED = FrontEndDataTable(M, CsiBasicBlockBaseIdName); + CallsiteFED = FrontEndDataTable(M, CsiCallsiteBaseIdName); + LoadFED = FrontEndDataTable(M, CsiLoadBaseIdName); + StoreFED = FrontEndDataTable(M, CsiStoreBaseIdName); +} + +uint64_t CSIImpl::getLocalFunctionID(Function &F) { + uint64_t LocalId = FunctionFED.add(F); + FuncOffsetMap[F.getName()] = LocalId; + return LocalId; +} + +void CSIImpl::generateInitCallsiteToFunction() { + LLVMContext &C = M.getContext(); + BasicBlock *EntryBB = BasicBlock::Create(C, "", InitCallsiteToFunction); + IRBuilder<> IRB(ReturnInst::Create(C, EntryBB)); + + GlobalVariable *Base = FunctionFED.baseId(); + LoadInst *LI = IRB.CreateLoad(Base); + // Traverse the map of function name -> function local id. Generate + // a store of each function's global ID to the corresponding weak + // global variable. 
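//===--- Sketch (not part of the patch): the weak function-ID globals -----===//
// Shape of the linkage this routine establishes for a defined function foo
// (the __csi_func_id_ prefix comes from CsiFuncIdVariablePrefix):
//   * every unit that calls foo references a weak i64 global
//       @__csi_func_id_foo = weak global i64 <CsiCallsiteUnknownTargetId>
//     and each callsite hook loads it at run time;
//   * the defining unit's init routine, generated by the loop below, performs
//     in effect
//       @__csi_func_id_foo = load(FunctionFED base) + foo's local ID
//     so once every unit has initialized, direct callsites observe foo's
//     global ID while indirect callsites keep the "unknown" value.
//===--------------------------------------------------------------------===//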
+ for (const auto &it : FuncOffsetMap) { + std::string GVName = CsiFuncIdVariablePrefix + it.first.str(); + GlobalVariable *GV = nullptr; + if ((GV = M.getGlobalVariable(GVName)) == nullptr) { + GV = new GlobalVariable(M, IRB.getInt64Ty(), false, + GlobalValue::WeakAnyLinkage, + IRB.getInt64(CsiCallsiteUnknownTargetId), GVName); + } + assert(GV); + IRB.CreateStore(IRB.CreateAdd(LI, IRB.getInt64(it.second)), GV); + } +} + +void CSIImpl::initializeCsi() { + IntptrTy = DL.getIntPtrType(M.getContext()); + + initializeFEDTables(); + if (Options.InstrumentFuncEntryExit) + initializeFuncHooks(); + if (Options.InstrumentMemoryAccesses) + initializeLoadStoreHooks(); + if (Options.InstrumentBasicBlocks) + initializeBasicBlockHooks(); + if (Options.InstrumentCalls) + initializeCallsiteHooks(); + if (Options.InstrumentMemIntrinsics) + initializeMemIntrinsicsHooks(); + + FunctionType *FnType = + FunctionType::get(Type::getVoidTy(M.getContext()), {}, false); + InitCallsiteToFunction = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiInitCallsiteToFunctionName, FnType)); + assert(InitCallsiteToFunction); + InitCallsiteToFunction->setLinkage(GlobalValue::InternalLinkage); + + /* + The runtime declares this as a __thread var --- need to change this decl generation + or the tool won't compile + DisableInstrGV = new GlobalVariable(M, IntegerType::get(M.getContext(), 1), false, + GlobalValue::ExternalLinkage, nullptr, + CsiDisableInstrumentationName, nullptr, + GlobalValue::GeneralDynamicTLSModel, 0, true); + */ +} + +// Create a struct type to match the unit_fed_entry_t type in csirt.c. +StructType *CSIImpl::getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + Type::getInt8PtrTy(C, 0), + EntryPointerType); +} + +Constant *CSIImpl::fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), FedTable.size()); + Constant *BaseIdPtr = + ConstantExpr::getPointerCast(FedTable.baseId(), + Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = FedTable.insertIntoModule(M); + return ConstantStruct::get(UnitFedTableType, NumEntries, BaseIdPtr, + InsertedTable); +} + +void CSIImpl::collectUnitFEDTables() { + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csirt.c and the + // instrumentation_counts_t in csi.h. 
+ UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, BasicBlockFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, CallsiteFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, LoadFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, StoreFED)); +} + +CallInst *CSIImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *Table = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *GV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, Table, + CsiUnitFedTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs), + InitCallsiteToFunction}); +} + +void CSIImpl::finalizeCsi() { + LLVMContext &C = M.getContext(); + + // Add CSI global constructor, which calls unit init. + Function *Ctor = + Function::Create(FunctionType::get(Type::getVoidTy(C), false), + GlobalValue::InternalLinkage, CsiRtUnitCtorName, &M); + BasicBlock *CtorBB = BasicBlock::Create(C, "", Ctor); + IRBuilder<> IRB(ReturnInst::Create(C, CtorBB)); + + // Insert __csi_func_id_ weak symbols for all defined functions and + // generate the runtime code that stores to all of them. + generateInitCallsiteToFunction(); + + CallInst *Call = createRTUnitInitCall(IRB); + + // Add the constructor to the global list + appendToGlobalCtors(M, Ctor, CsiUnitCtorPriority); + + CallGraphNode *CNCtor = CG->getOrInsertFunction(Ctor); + CallGraphNode *CNFunc = CG->getOrInsertFunction(RTUnitInit); + CNCtor->addCalledFunction(Call, CNFunc); +} + +bool CSIImpl::shouldNotInstrumentFunction(Function &F) { + Module &M = *F.getParent(); + // Never instrument the CSI ctor. + if (F.hasName() && F.getName() == CsiRtUnitCtorName) + return true; + + // Don't instrument functions that will run before or + // simultaneously with CSI ctors. + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (GV == nullptr) + return false; + ConstantArray *CA = cast(GV->getInitializer()); + for (Use &OP : CA->operands()) { + if (isa(OP)) + continue; + ConstantStruct *CS = cast(OP); + + if (Function *CF = dyn_cast(CS->getOperand(1))) { + uint64_t Priority = + dyn_cast(CS->getOperand(0))->getLimitedValue(); + if (Priority <= CsiUnitCtorPriority && CF->getName() == F.getName()) { + // Do not instrument F. + return true; + } + } + } + // false means do instrument it. 
+ return false; +} + +bool CSIImpl::isVtableAccess(Instruction *I) { + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) + return Tag->isTBAAVtableAccess(); + return false; +} + +bool CSIImpl::addrPointsToConstantData(Value *Addr) { + // If this is a GEP, just analyze its pointer operand. + if (GetElementPtrInst *GEP = dyn_cast(Addr)) + Addr = GEP->getPointerOperand(); + + if (GlobalVariable *GV = dyn_cast(Addr)) { + if (GV->isConstant()) { + return true; + } + } else if (LoadInst *L = dyn_cast(Addr)) { + if (isVtableAccess(L)) { + return true; + } + } + return false; +} + +bool CSIImpl::isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread; + if (StoreInst *SI = dyn_cast(I)) + return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread; + if (isa(I)) + return true; + if (isa(I)) + return true; + if (isa(I)) + return true; + return false; +} + +void CSIImpl::computeLoadAndStoreProperties( + SmallVectorImpl> &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores, + const DataLayout &DL) { + SmallSet WriteTargets; + + for (SmallVectorImpl::reverse_iterator + It = BBLoadsAndStores.rbegin(), + E = BBLoadsAndStores.rend(); + It != E; ++It) { + Instruction *I = *It; + unsigned Alignment; + if (StoreInst *Store = dyn_cast(I)) { + Value *Addr = Store->getPointerOperand(); + WriteTargets.insert(Addr); + CsiLoadStoreProperty Prop; + // Update alignment property data + Alignment = Store->getAlignment(); + Prop.setAlignment(Alignment); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Store)); + // Set constant-data-access property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = GetUnderlyingObject(Addr, DL); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } else { + LoadInst *Load = cast(I); + Value *Addr = Load->getPointerOperand(); + CsiLoadStoreProperty Prop; + // Update alignment property data + Alignment = Load->getAlignment(); + Prop.setAlignment(Alignment); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Load)); + // Set constant-data-access-property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = GetUnderlyingObject(Addr, DL); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + // Set load-read-before-write-in-bb property + bool HasBeenSeen = WriteTargets.count(Addr) > 0; + Prop.setLoadReadBeforeWriteInBB(HasBeenSeen); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } + } + BBLoadsAndStores.clear(); +} + +void CSIImpl::instrumentFunction(Function &F) { + // This is required to prevent instrumenting the call to + // __csi_module_init from within the module constructor. + if (F.empty() || shouldNotInstrumentFunction(F)) { + return; + } + + SmallVector, 8> + LoadAndStoreProperties; + SmallVector ReturnInstructions; + SmallVector MemIntrinsics; + SmallVector Callsites; + SmallVector BasicBlocks; + SmallVector AtomicAccesses; + + // Compile lists of all instrumentation points before anything is modified. 
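//===--- Sketch (not part of the patch): load-read-before-write property --===//
// Worked example for computeLoadAndStoreProperties() above: the block's
// accesses are walked in reverse while recording store targets, so a load is
// flagged exactly when a later store in the same block writes through the same
// pointer value:
//   %v = load i32, i32* %p     ; flagged: %p is stored to below in this block
//   store i32 %w, i32* %p
// A load with no later same-pointer store in the block leaves the property
// unset.
//===--------------------------------------------------------------------===//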
+ for (BasicBlock &BB : F) { + SmallVector BBLoadsAndStores; + for (Instruction &I : BB) { + if (isAtomic(&I)) + AtomicAccesses.push_back(&I); + else if (isa(I) || isa(I)) { + BBLoadsAndStores.push_back(&I); + } else if (isa(I)) { + ReturnInstructions.push_back(&I); + } else if (isa(I) || isa(I)) { + if (isa(I)) { + MemIntrinsics.push_back(&I); + } else { + Callsites.push_back(&I); + } + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores, + DL); + } + } + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores, DL); + BasicBlocks.push_back(&BB); + } + + uint64_t LocalId = getLocalFunctionID(F); + + // Instrument basic blocks. Note that we do this before other instrumentation + // so that we put this at the beginning of the basic block, and then the + // function entry call goes before the call to basic block entry. + if (Options.InstrumentBasicBlocks) + for (BasicBlock *BB : BasicBlocks) + instrumentBasicBlock(*BB); + + // Do this work in a separate loop after copying the iterators so that we + // aren't modifying the list as we're iterating. + if (Options.InstrumentMemoryAccesses) + for (std::pair p : + LoadAndStoreProperties) + instrumentLoadOrStore(p.first, p.second, DL); + + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). + if (Options.InstrumentAtomics) + for (Instruction *I : AtomicAccesses) + instrumentAtomic(I, DL); + + if (Options.InstrumentMemIntrinsics) + for (Instruction *I : MemIntrinsics) + instrumentMemIntrinsic(I); + + if (Options.InstrumentCalls) + for (Instruction *I : Callsites) + instrumentCallsite(I); + + // Instrument function entry/exit points. + if (Options.InstrumentFuncEntryExit) { + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + CsiFuncProperty FuncEntryProp; + CsiFuncExitProperty FuncExitProp; + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + Value *PropVal = FuncEntryProp.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiFuncEntry, + {FuncId, PropVal}); + + for (Instruction *I : ReturnInstructions) { + IRBuilder<> IRBRet(I); + // uint64_t ExitLocalId = FunctionExitFED.add(F); + uint64_t ExitLocalId = FunctionExitFED.add(*I); + Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, IRBRet); + PropVal = FuncExitProp.getValue(IRBRet); + insertConditionalHookCall(I, CsiFuncExit, + {ExitCsiId, FuncId, PropVal}); + } + } +} + +void ComprehensiveStaticInstrumentation::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired(); +} + +bool ComprehensiveStaticInstrumentation::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + CallGraph *CG = &getAnalysis().getCallGraph(); + + return CSIImpl(M, CG, Options).run(); +} diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index c3e323613c7079..f9ba37987a61e9 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -105,6 +105,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); + initializeCilkSanitizerPass(Registry); initializeGCOVProfilerLegacyPassPass(Registry); initializePGOInstrumentationGenLegacyPassPass(Registry); initializePGOInstrumentationUseLegacyPassPass(Registry); @@ -117,6 +118,7 @@ void 
llvm::initializeInstrumentation(PassRegistry &Registry) { initializeSanitizerCoverageModulePass(Registry); initializeDataFlowSanitizerPass(Registry); initializeEfficiencySanitizerPass(Registry); + initializeComprehensiveStaticInstrumentationPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/llvm/lib/Transforms/LLVMBuild.txt b/llvm/lib/Transforms/LLVMBuild.txt index f061c6d9285e3e..ae57c40a946255 100644 --- a/llvm/lib/Transforms/LLVMBuild.txt +++ b/llvm/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Tapir Utils Vectorize ObjCARC [component_0] type = Group diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index e3548ce5cd0afd..688365dfae4676 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_library(LLVMScalarOpts LoopDeletion.cpp LoopDataPrefetch.cpp LoopDistribute.cpp + LoopFuse.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 9861948c8297a9..fcc11e0716f9b5 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1123,8 +1123,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } + if (isa(Pred->getTerminator())) { + continue; + } - if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (Pred->getTerminator()->getNumSuccessors() != 1 && + !isa(Pred->getTerminator())) { if (isa(Pred->getTerminator())) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" @@ -1327,6 +1331,20 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If we depend on a detach instruction, reject. + for (unsigned i = 0, e = NumDeps; i != e; ++i) { + MemDepResult DepInfo = Deps[i].getResult(); + if (!(DepInfo.getInst())) + continue; + if (isa(DepInfo.getInst())|| + isa(DepInfo.getInst())) { + DEBUG(dbgs() << "GVN: Cannot process" << *LI << + " due to dependency on" << + *(DepInfo.getInst()) << "\n"); + return false; + } + } + // If this load follows a GEP, see if we can PRE the indices before analyzing. if (GetElementPtrInst *GEP = dyn_cast(LI->getOperand(0))) { for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), @@ -2184,6 +2202,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { unsigned NumWithout = 0; BasicBlock *PREPred = nullptr; BasicBlock *CurrentBlock = CurInst->getParent(); + BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr; + Value *DetachV = nullptr, *ReattachV = nullptr; // Update the RPO numbers for this function. if (InvalidBlockRPONumbers) @@ -2212,18 +2232,36 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } + // Ignore reattach predecessors for determining whether to perform + // PRE. These predecessors have the same available values as + // their corresponding detach predecessors. 
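//===--- Sketch (not part of the patch): detach/reattach predecessors -----===//
// The continuation block of a Tapir detach has two kinds of predecessors:
//
//   entry:  detach ... label %task, label %cont
//   task:   ...        reattach ... label %cont
//   cont:   ; preds = %entry (detach), %task (reattach)
//
// Under the serial semantics the reattach edge makes available exactly the
// values the detach edge does, so a reattach predecessor that is missing the
// value is not counted as needing an insertion (its detach counterpart covers
// it), and if the values available on the two edges ever disagree
// (ReattachV != DetachV below) the candidate is conservatively rejected.
//===--------------------------------------------------------------------===//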
+ if (isa(P->getTerminator())) + ReattachPred = P; + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); Value *predV = findLeader(P, TValNo); + + if (isa(P->getTerminator())) { + assert(nullptr == DetachPred && "Multiple detach predecessors found!"); + DetachPred = P; + } + if (!predV) { - predMap.push_back(std::make_pair(static_cast(nullptr), P)); - PREPred = P; - ++NumWithout; + if (!isa(P->getTerminator())) { + predMap.push_back(std::make_pair(static_cast(nullptr), P)); + PREPred = P; + ++NumWithout; + } } else if (predV == CurInst) { /* CurInst dominates this predecessor. */ NumWithout = 2; break; } else { predMap.push_back(std::make_pair(predV, P)); + if (isa(P->getTerminator())) + DetachV = predV; + if (isa(P->getTerminator())) + ReattachV = predV; ++NumWith; } } @@ -2233,6 +2271,15 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (NumWithout > 1 || NumWith == 0) return false; + // If the reattach predecessor has a value that does not match the + // detach predecessor's value, assume that this is not a redundant + // instruction. + if (ReattachV && ReattachV != DetachV) + return false; + + assert((!ReattachPred || DetachPred) && + "Reattach predecessor found with no detach predecessor"); + // We may have a case where all predecessors have the instruction, // and we just need to insert a phi node. Otherwise, perform // insertion. @@ -2256,7 +2303,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // the edge to be split and perform the PRE the next time we iterate // on the function. unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + if (isCriticalEdge(PREPred->getTerminator(), SuccNum) && + !isa(PREPred->getTerminator())) { toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); return false; } @@ -2267,6 +2315,9 @@ bool GVN::performScalarPRE(Instruction *CurInst) { LLVM_DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); return false; + } else if (DetachPred == PREPred && ReattachPred) { + assert(nullptr == DetachV && "Detach predecessor already had a value"); + predMap.push_back(std::make_pair(PREInstr, ReattachPred)); } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 48de56a02834d5..bf2865332ce880 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -987,8 +987,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && + !isa(SinglePred->getTerminator()) && // Can't remove syncs + TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -1373,7 +1375,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { } } - if (!PredAvailable) { + if (!PredAvailable || + isa(PredBB->getTerminator())) { OneUnavailablePred = PredBB; continue; } @@ -1416,6 +1419,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // unconditional branch, we know that it isn't a critical edge. 
if (PredsScanned.size() == AvailablePreds.size()+1 && OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) { + // If the predecessor is a reattach, we can't split the edge + if (isa(OneUnavailablePred->getTerminator())) + return false; UnavailablePred = OneUnavailablePred; } else if (PredsScanned.size() != AvailablePreds.size()) { // Otherwise, we had multiple unavailable predecessors or we had a critical @@ -1428,8 +1434,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { - // If the predecessor is an indirect goto, we can't split the edge. - if (isa(P->getTerminator())) + // If the predecessor is an indirect goto or a reattach, we + // can't split the edge. + if (isa(P->getTerminator()) || + isa(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d204654c39157d..d598ec917d8932 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -72,6 +72,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include using namespace llvm; @@ -1775,6 +1776,18 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + // We cannot speculate loads to values that are stored in a detached + // context within the loop. Precompute whether or not there is a + // detach within this loop. + bool DetachWithinLoop = + isa(CurLoop->getHeader()->getTerminator()); + if (!DetachWithinLoop) + for (BasicBlock *BB : CurLoop->getBlocks()) + if (isa(BB->getTerminator())) { + DetachWithinLoop = true; + break; + } + SmallVector LoopUses; // We start with an alignment of one and try to find instructions that allow @@ -1838,6 +1851,23 @@ bool llvm::promoteLoopAccessesToScalars( if (!Store->isUnordered()) return false; + // We conservatively avoid promoting stores that are detached + // within the loop. Technically it can be legal to move these + // stores -- the program already contains a determinacy race + // -- but to preserve the serial execution, we have to avoid + // moving stores that are loaded. For now, we simply avoid + // moving these stores. + // + // TODO: The call to GetDetachedCtx can potentially be + // expensive. Optimize this analysis in the future. + if (DetachWithinLoop && + CurLoop->contains(GetDetachedCtx(Store->getParent()))) + return false; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + SawUnorderedAtomic |= Store->isAtomic(); SawNotAtomic |= !Store->isAtomic(); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp new file mode 100644 index 00000000000000..4c90ace351c603 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -0,0 +1,561 @@ +//===------------- LoopFuse.cpp - Loop Fusion Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. 
Loops are multi-versioned +/// and unconditionally fused along one version to check for dependence +/// legality. Legality decides whether to keep the original version or the fused +/// version or both versions with runtime checks. LoopAccessLegacyAnalysis is used to +/// check dependence legality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFuse.h" + +#define DEBUG_TYPE "loop-fuse" + +using namespace llvm; + +static cl::opt + LFuseVerify("loop-fuse-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Fusion"), + cl::init(false)); + +STATISTIC(NumLoopsFused, "Number of loops fused"); + +// Replace IncomingBlocks in PHI nodes of @Br successors from Br's parent to +// @To. +void LoopFuse::RewritePHI(BranchInst *Br, BasicBlock *To) { + assert((Br && To)); + for (auto *S : Br->successors()) { + auto I = S->begin(); + while (PHINode *P = dyn_cast(&*I)) { + P->setIncomingBlock(P->getBasicBlockIndex(Br->getParent()), To); + ++I; + } + } +} + +//===----------------------------------------------------------------------===// +// Loop Fusion Implementation. +// The idea to check fusion legality is by first fusing the loops and then look +// for fusion preventing dependences. This is done by versioning the loops +// first. The check is done on versioned loops and one of the version is +// discarded based on legality's success. +//===----------------------------------------------------------------------===// + +/* Fuse loops @L1 and @L2. Remove ConnectingBlock (CB) and connect L1Latch to + L2Header. Loop from L2Latch to L1Header. Make L1's indvar as indvar for the + fused loop. Update LI by moving L2Blocks into L1 and call L1 as FusedLoop. + Return FusedLoop. + L1 + | L1Blocks + CB --> | \ + | L2Blocks/ + L2 |/ +*/ +Loop *LoopFuse::FuseLoops(Loop &L1, Loop &L2) { + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + + BranchInst *Br1 = dyn_cast(L1.getLoopLatch()->getTerminator()); + BranchInst *Br2 = dyn_cast(L2.getLoopLatch()->getTerminator()); + + // Make Br2 to branch to L1 header based on Br1's condition. + unsigned LoopBack = 0; + if (Br2->getSuccessor(1) == L2.getHeader()) + LoopBack = 1; + assert((Br2->getSuccessor(LoopBack) == L2.getHeader())); + Br2->setSuccessor(LoopBack, L1.getHeader()); + Br2->setCondition(Br1->getCondition()); + RewritePHI(Br1, Br2->getParent()); + + // Zap L2 preheader and unconditionally branch from L1 latch to L2 header. + // L2 preheader is a connecting block and it is known to contain only an + // unconditional branch to L2 header. + BasicBlock *L2PH = L2.getLoopPreheader(), *L2Header = L2.getHeader(); + BranchInst *L2PHBr = dyn_cast(L2PH->getTerminator()); + RewritePHI(L2PHBr, Br1->getParent()); + DT->changeImmediateDominator(L2Header, L1.getLoopLatch()); + + BranchInst::Create(L2Header, Br1); + Br1->eraseFromParent(); + L2PH->dropAllReferences(); + L2PHBr->eraseFromParent(); + L2PH->eraseFromParent(); + DT->eraseNode(L2PH); + LI->removeBlock(L2PH); + + P2->replaceAllUsesWith(P1); + P2->eraseFromParent(); + + // Update LI. + // Move all blocks from L2 to L1. + SmallVector L2BBs; + for (auto bb = L2.block_begin(), bbe = L2.block_end(); bb != bbe; ++bb) + L2BBs.push_back(*bb); + for (auto *bb : L2BBs) { + LI->removeBlock(bb); + L1.addBasicBlockToLoop(bb, *LI); + } + // Remove L2. 
+ SE->forgetLoop(&L2); + LI->markAsRemoved(&L2); + + // Update DT: DT changed only at L2PH zap and was updated during zapping. + + return &L1; +} + +/* Version the given loops along a parallel path and fuse the cloned loops. + Check the dependence legality of the fused loop. + + L1PH BooleanBB BooleanBB + | /\ /\ + L1 L1PH L1PH.clone L1PH FusedPH + | version | | Fuse along | | + CB (L1Exit/L2PH) ----> L1 L1.clone --------> L1 L1Blocks + | | | versioned | | \ + L2 CB CB.clone path CB L2Blocks | + | | | | | |/ + L2Exit L2 L2.clone L2 | + \ / \ / + L2Exit CommonExit + CB is ConnectingBlock. +*/ +bool LoopFuse::DependenceLegal(Loop &L1, Loop &L2) { + + // Version to fuse. LoopVersioning is not used here because: + // a. Runtime checks are inserted later. + // b. Intermediate VMap updates are required. + // Moreover it is convenient for now to just clone and remap. + BasicBlock *BooleanBB = L1.getLoopPreheader(); + BasicBlock *L1PH = SplitEdge(BooleanBB, L1.getHeader(), DT, LI); + + ValueToValueMapTy VMap1; + SmallVector ClonedBBs1; + Loop *ClonedLoop1 = + cloneLoopWithPreheader(L1.getExitBlock(), BooleanBB, &L1, VMap1, + Twine(".L1clone"), LI, DT, ClonedBBs1); + + ValueToValueMapTy VMap2; + SmallVector ClonedBBs2; + Loop *ClonedLoop2 = + cloneLoopWithPreheader(L2.getExitBlock(), L1.getExitBlock(), &L2, VMap2, + Twine(".L2clone"), LI, DT, ClonedBBs2); + remapInstructionsInBlocks(ClonedBBs2, VMap2); + VMap1[L1.getExitBlock()] = ClonedLoop2->getLoopPreheader(); + remapInstructionsInBlocks(ClonedBBs1, VMap1); + + // Build the custom VMap by concatenating VMap1 and VMap2. + for (auto V : VMap1) + VMap[V->first] = V->second; + for (auto V : VMap2) + VMap[V->first] = V->second; + + // VMap.size() != VMap1.size() + VMap2.size() because of redundants and + // L1Exit update in VMap1 above. + + // Branch to either of the versions - using a boolean flag. + Instruction *Term = BooleanBB->getTerminator(); + FusionSwitcher = + BranchInst::Create(L1PH, ClonedLoop1->getLoopPreheader(), + ConstantInt::getTrue(L1PH->getContext()), Term); + Term->eraseFromParent(); + + // The two versions join back at L2 exit. Update DT. + if (DT->dominates(L2.getLoopLatch(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), BooleanBB); + + DEBUG(dbgs() << "ClonedLoop1: " << *ClonedLoop1 << "\n"); + DEBUG(dbgs() << "ClonedLoop2: " << *ClonedLoop2 << "\n"); + + FusedLoop = FuseLoops(*ClonedLoop1, *ClonedLoop2); + DEBUG(dbgs() << "FusedLoop: " << *FusedLoop << "\n"); + + // Check dependences. + DEBUG(dbgs() << "Loop fused on versioned path. Checking dependences...\n"); + LAI = &LAA->getInfo(FusedLoop); + DEBUG(LAI->print(dbgs())); + + auto Dependences = LAI->getDepChecker().getDependences(); + // TODO@jiahao: Investigate. + // if (!Dependences || Dependences->empty()) { + // DEBUG(dbgs() << "Failed to get dependences to check fusion legality!" + // << " Skipping...\n"); + // return false; + // } + + // Fusion is illegal if there is a backward dependence between memory accesses + // whose source was in L1 and sink was in L2. ClonedBBs1 and ClonedBBs2 + // contain cloned BBs from L1 and L2 respectively. They are used to check the + // containment of srouce and sink. 
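//===--- Sketch (not part of the patch): a fusion-preventing dependence ---===//
// Example of the backward dependence the loop below rejects: with
//   L1:  A[i]   = ...;
//   L2:  ...    = A[i+1];
// the original pair is fine (all of L1 runs before L2), but in the fused body
// iteration i of the L2 statement would read A[i+1] before iteration i+1 of
// the L1 statement writes it: a backward dependence whose source lies in the
// cloned L1 blocks and whose sink lies in the cloned L2 blocks.
//===--------------------------------------------------------------------===//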
+ for (auto &Dep : *Dependences) { + if (Dep.isPossiblyBackward()) { + Instruction *Source = Dep.getSource(*LAI); + Instruction *Sink = Dep.getDestination(*LAI); + if (std::find(ClonedBBs1.begin(), ClonedBBs1.end(), + Source->getParent()) == ClonedBBs1.end()) + continue; + if (std::find(ClonedBBs2.begin(), ClonedBBs2.end(), Sink->getParent()) == + ClonedBBs2.end()) + continue; + DEBUG(dbgs() << "Loop carried backward dependence prevents fusion!\n"); + return false; + } + } + DEBUG(dbgs() << "Loops are dependence legal to fuse!\n"); + return true; +} + +// Return true if any of the defs made in @L1 is used inside @L2. +bool LoopFuse::DefsUsedAcrossLoops(Loop &L1, Loop &L2) { + auto DefsUsedOutsideL1 = findDefsUsedOutsideOfLoop(&L1); + for (auto *D : DefsUsedOutsideL1) { + for (auto *U : D->users()) { + if (L2.contains(dyn_cast(U)->getParent())) + return true; + } + } + return false; +} + +bool LoopFuse::IsLegalAndProfitable(Loop &L1, Loop &L2) { + // Basic legality. + if (!L1.empty() || !L2.empty()) { + // TODO: Update cloneLoopWithPreheader() to update LoopInfo for subloops + // too and LoopFusion can be done for loops at any depth. + DEBUG(dbgs() << "Not innermost loops! Skipping...\n"); + return false; + } + + if (L1.getLoopDepth() != L2.getLoopDepth()) { + DEBUG(dbgs() << "Loops not at same depth! Skipping...\n"); + return false; + } + + if (!L1.getLoopPreheader() || !L2.getLoopPreheader()) { + DEBUG(dbgs() << "No preheader! Skipping...\n"); + return false; + } + + if (!L1.getExitBlock() || !L2.getExitBlock()) { + DEBUG(dbgs() << "Single exit block not found! Skipping...\n"); + return false; + } + + // Can fuse only bottom-tested loops and loops with latch being the single + // exiting block. + if ((L1.getExitingBlock() != L1.getLoopLatch()) || + (L2.getExitingBlock() != L2.getLoopLatch())) { + DEBUG(dbgs() << "Not a bottom-tested loop! Skipping...\n"); + return false; + } + + // Can fuse only adjacent loops. Adjacency is defined by: + // a. L1Exit has single entry only from L1Latch. + // b. L1Exit and L2Preheader are same i.e the block forms the ConnectingBlock. + // c. ConnectingBlock just branches unconditionally to L2Header. + auto *Br = dyn_cast(L1.getExitBlock()->begin()); + if ((L1.getExitBlock()->getSinglePredecessor() != L1.getLoopLatch()) || + (L1.getExitBlock() != L2.getLoopPreheader()) || + (!Br || Br->isConditional())) { + DEBUG(dbgs() << "Loops not adjacent! Skipping...\n"); + return false; + } + + // Indvars of both loops is known and canonicalized. + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + if (!P1 || !P2) { + DEBUG(dbgs() << "Unknown induction variables! Skipping...\n"); + return false; + } + + // P1 and P2 are canonical indvars. Backedge taken count check is enough to + // ascertain both loops have same iteration space. + if (SE->getBackedgeTakenCount(&L1) != SE->getBackedgeTakenCount(&L2)) + return false; + + // Cannot fuse if there are uses of L1 defs in L2. + if (DefsUsedAcrossLoops(L1, L2)) + return false; + + // Dependene based legality. + if (!DependenceLegal(L1, L2)) + return false; + + // TODO: Add profitability measures. + + return true; +} + +// Remove Loop @L completely by deleting the BBs and also from @LI, @DT and @SE +// including preheader. Finally connect the single predecessor (the BooleanBB +// that contains FusionSwitcher) of preheader to loop exit. 
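+// It is used both to roll back the versioned fused loop when legality fails
+// and to delete the original loops once pure fusion succeeds.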
+void LoopFuse::RemoveLoopCompletelyWithPreheader(Loop &L) { + DEBUG(dbgs() << "Removing loop: " << L << "\n"); + BasicBlock *PH = L.getLoopPreheader(); + BasicBlock *Exit = L.getExitBlock(); + assert(Exit && "Expected Exit bb and single pred to preheader!"); + + // No need to RewritePHIs of Exit block given the Loop is deleted because the + // uses remain same if FusedLoop is removed OR uses are already replaced if + // original loops are deleted. + + // Branch to Exit block from FusionSwitcher. + unsigned SuccNum = 0; + if (FusionSwitcher->getSuccessor(1) == PH) + SuccNum = 1; + assert((FusionSwitcher->getSuccessor(SuccNum) == PH)); + FusionSwitcher->setSuccessor(SuccNum, Exit); + if (DT->dominates(L.getLoopLatch(), Exit)) // L1 removal case. + // Exit blocks iDom is FusionSwitcher's block due to versioning. + DT->changeImmediateDominator(Exit, FusionSwitcher->getParent()); + + // Erase each of the loop blocks. Update SE, DT and LI. + SE->forgetLoop(&L); + PH->dropAllReferences(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + DT->changeImmediateDominator(*bb, PH); + (*bb)->dropAllReferences(); + } + + PH->eraseFromParent(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + // Now nuke bb and its DT. + (*bb)->eraseFromParent(); + DT->eraseNode(*bb); + } + DT->eraseNode(PH); + + SmallVector LBBs; + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) + LBBs.push_back(*bb); + for (auto *bb : LBBs) + LI->removeBlock(bb); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); + + LI->markAsRemoved(&L); +} + +// Remove FusionSwitcher and branch directly to given loop @L's header. This +// removes loop's preheader and make FusionSwitcher's block as preheader. +void LoopFuse::RemoveFusionSwitcher(Loop &L) { + assert(FusionSwitcher->isConditional()); + DEBUG(dbgs() << "Removing FusionSwitcher: " << *FusionSwitcher << "\n"); + + BasicBlock *PH = L.getLoopPreheader(); + assert((PH->size() == 1)); + + BranchInst *PHBr = dyn_cast(PH->getTerminator()); + assert(PHBr->isUnconditional()); + + RewritePHI(PHBr, FusionSwitcher->getParent()); + + PHBr->removeFromParent(); + PHBr->insertBefore(FusionSwitcher); + DT->changeImmediateDominator(L.getHeader(), FusionSwitcher->getParent()); + + FusionSwitcher->eraseFromParent(); + PH->eraseFromParent(); + DT->eraseNode(PH); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); +} + +// Update the uses of defs that reach outside original loop with the defs made +// made in fused loop. +void LoopFuse::UpdateUsesOutsideLoop(Loop &L) { + for (auto *D : findDefsUsedOutsideOfLoop(&L)) { + auto VI = VMap.find(D); + if (VI == VMap.end()) + continue; + + for (auto *U : D->users()) { + if (!L.contains(dyn_cast(U)->getParent())) { + if (auto *P = dyn_cast(U)) { + // Replace U in PHI with + for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) { + if (P->getIncomingValue(i) == U) { + P->removeIncomingValue(i); + P->addIncoming(VI->second, FusedLoop->getLoopLatch()); + } + } + } else + U->replaceUsesOfWith(D, VI->second); + } + } + } +} + +// Add/update phi for defs that reach uses outside the loop from original loop +// @L and from fused loop. Insert the phis into fused loop's exit block, which +// is also the exit block of original L2 loop. @OrigIncomingBlock refers to the +// block from where a def is reached outside of loop - L2 latch. +// TODO: This routine is similar to LoopVersioning's addPHINodes(), but +// rewritten here as access to internal data structures differ. 
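+// Only the versioned-fusion path needs these phis: both the original loops and
+// the fused loop stay live, so their exit values must merge at the common exit.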
+void LoopFuse::AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock) { + BasicBlock *PHIBlock = FusedLoop->getExitBlock(); + assert(PHIBlock && "Unable to find FusedLoop's ExitBlock!"); + + for (auto *Inst : findDefsUsedOutsideOfLoop(&L)) { + PHINode *PN = nullptr; + auto FusedInst = VMap.find(Inst); + assert((FusedInst != VMap.end()) && + "Expected an equivalent instruction in fused loop!"); + // Update/add phi node for this Inst. + bool FoundInst = false; + for (auto I = PHIBlock->begin(); !FoundInst && (PN = dyn_cast(I)); + ++I) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); !FoundInst && i != e; + ++i) + if (PN->getIncomingValue(i) == Inst) + FoundInst = true; + } + if (!PN) { + PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lfuse", + &PHIBlock->front()); + + for (auto *U : Inst->users()) + if (!L.contains(dyn_cast(U)->getParent())) + U->replaceUsesOfWith(Inst, PN); + + PN->addIncoming(Inst, OrigIncomingBlock); + } + // Add incoming value from fused loop. + PN->addIncoming(FusedInst->second, FusedLoop->getLoopLatch()); + } +} + +bool LoopFuse::run(Loop &L1, Loop &L2) { + assert((LI && LAA && DT && SE)); + DEBUG(dbgs() << "\nTrying to fuse:\n" << L1 << "AND\n" << L2 << "\n"); + + FusionSwitcher = nullptr; + FusedLoop = nullptr; + VMap.clear(); + bool Changed = false; + if (IsLegalAndProfitable(L1, L2)) { + assert((FusedLoop && FusionSwitcher)); + auto *RuntimePtrChecks = LAI->getRuntimePointerChecking(); + if (RuntimePtrChecks->Need) { + // Add runtime checks and add/update phis in exit block for the defs + // reaching from two versions. + Instruction *FirstCheck, *LastCheck; + std::tie(FirstCheck, LastCheck) = LAI->addRuntimeChecks(FusionSwitcher); + // TODO: Add SCEVRuntime checks? + FusionSwitcher->setCondition(LastCheck); + + AddPHIsOutsideLoop(L1, L2.getLoopLatch()); + AddPHIsOutsideLoop(L2, L2.getLoopLatch()); + FusionKind = VERSIONED_FUSION; + + } else { + // Remove original loops and retain FusedLoop. Also update the uses of + // defs from original loops with the defs from fused loop. + UpdateUsesOutsideLoop(L1); + UpdateUsesOutsideLoop(L2); + RemoveLoopCompletelyWithPreheader(L1); + RemoveLoopCompletelyWithPreheader(L2); + + // Remove FusionSwitcher and directly point to FusedLoop header. + if (DT->dominates(FusionSwitcher->getParent(), FusedLoop->getExitBlock())) + DT->changeImmediateDominator(FusedLoop->getExitBlock(), + FusedLoop->getLoopLatch()); + RemoveFusionSwitcher(*FusedLoop); + FusionKind = PURE_FUSION; + } + ++NumLoopsFused; + Changed = true; + + } else { + if (FusedLoop) { + // Loops were versioned to check legality. Rollback to original state. + RemoveLoopCompletelyWithPreheader(*FusedLoop); + + // Remove FusionSwitcher and directly point to L1 header. + if (DT->dominates(FusionSwitcher->getParent(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), L2.getLoopLatch()); + RemoveFusionSwitcher(L1); + FusionKind = REVERTED_FUSION; + } + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + } + + return Changed; +} + +void PopulateInnermostLoopsOf(Loop &L, SmallVectorImpl &Loops) { + if (L.empty()) + Loops.push_back(&L); + for (auto I = L.begin(), E = L.end(); I != E; ++I) + PopulateInnermostLoopsOf(**I, Loops); +} + +bool LoopFuse::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + LAA = &getAnalysis(); + DT = &getAnalysis().getDomTree(); + SE = &getAnalysis().getSE(); + + // Populate innermost loops and try a n^2 combination of loop fusion. 
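+  // Whenever a pair is fused, that pair is dropped from the candidate list,
+  // the fused loop is appended, and the scan restarts from the beginning.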
+ bool Changed = false; + SmallVector Loops; + for (auto L = LI->begin(), Le = LI->end(); L != Le; ++L) + PopulateInnermostLoopsOf(**L, Loops); + + auto L1 = Loops.begin(), L1e = Loops.end(); + while (L1 != L1e) { + auto L2 = Loops.begin(), L2e = Loops.end(); + while (L2 != L2e) { + if (L1 == L2) { + ++L2; + continue; + } + if (run(**L1, **L2)) { + // Remove L1 and L2 from Loops and add FusedLoop. + Loops.erase(L1); + Loops.erase(L2); + Loops.push_back(FusedLoop); + L1 = L2 = Loops.begin(); + L1e = L2e = Loops.end(); + Changed = true; + } else + ++L2; + } + ++L1; + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + assert((!verifyFunction(F, &dbgs())) && "Function verification failed!"); + } + + return Changed; +} + +char LoopFuse::ID; + +INITIALIZE_PASS_BEGIN(LoopFuse, "loop-fuse", "Loop Fusion", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopFuse, "loop-fuse", "Loop Fusion", false, false) + +namespace llvm { +FunctionPass *createLoopFusePass() { return new LoopFuse(); } +} diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index fd22128f7fe6b8..34773d906e0481 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -32,6 +32,603 @@ static cl::opt DefaultRotationThreshold( "rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); +STATISTIC(NumRotated, "Number of loops rotated"); + +namespace { +/// A simple loop rotation transformation. +class LoopRotate { + const unsigned MaxHeaderSize; + LoopInfo *LI; + const TargetTransformInfo *TTI; + AssumptionCache *AC; + DominatorTree *DT; + ScalarEvolution *SE; + const SimplifyQuery &SQ; + +public: + LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), + SQ(SQ) {} + bool processLoop(Loop *L); + +private: + bool rotateLoop(Loop *L, bool SimplifiedLatch); + bool simplifyLoopLatch(Loop *L); +}; +} // end anonymous namespace + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap, + SmallVectorImpl *InsertedPHIs) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. + SSAUpdater SSA(InsertedPHIs); + for (I = OrigHeader->begin(); I != E; ++I) { + Value *OrigHeaderVal = &*I; + + // If there are no uses of the value (e.g. because it returns void), there + // is nothing to rewrite. 
+ if (OrigHeaderVal->use_empty()) + continue; + + Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); + + // The value now exits in two versions: the initial value in the preheader + // and the loop "next" value in the original header. + SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); + SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); + SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); + + // Visit each use of the OrigHeader instruction. + for (Value::use_iterator UI = OrigHeaderVal->use_begin(), + UE = OrigHeaderVal->use_end(); + UI != UE;) { + // Grab the use before incrementing the iterator. + Use &U = *UI; + + // Increment the iterator before removing the use from the list. + ++UI; + + // SSAUpdater can't handle a non-PHI use in the same block as an + // earlier def. We can easily handle those cases manually. + Instruction *UserInst = cast(U.getUser()); + if (!isa(UserInst)) { + BasicBlock *UserBB = UserInst->getParent(); + + // The original users in the OrigHeader are already using the + // original definitions. + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped. + if (UserBB == OrigPreheader) { + U = OrigPreHeaderVal; + continue; + } + } + + // Anything else can be handled by SSAUpdater. + SSA.RewriteUse(U); + } + + // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug + // intrinsics. + SmallVector DbgValues; + llvm::findDbgValues(DbgValues, OrigHeaderVal); + for (auto &DbgValue : DbgValues) { + // The original users in the OrigHeader are already using the original + // definitions. + BasicBlock *UserBB = DbgValue->getParent(); + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. + Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + DbgValue->setOperand(0, + MetadataAsValue::get(OrigHeaderVal->getContext(), + ValueAsMetadata::get(NewVal))); + } + } +} + +/// Propagate dbg.value intrinsics through the newly inserted Phis. +static void insertDebugValues(BasicBlock *OrigHeader, + SmallVectorImpl &InsertedPHIs) { + ValueToValueMapTy DbgValueMap; + + // Map existing PHI nodes to their dbg.values. + for (auto &I : *OrigHeader) { + if (auto DbgII = dyn_cast(&I)) { + if (auto *Loc = dyn_cast_or_null(DbgII->getVariableLocation())) + DbgValueMap.insert({Loc, DbgII}); + } + } + + // Then iterate through the new PHIs and look to see if they use one of the + // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will + // propagate the info through the new PHI. + LLVMContext &C = OrigHeader->getContext(); + for (auto PHI : InsertedPHIs) { + for (auto VI : PHI->operand_values()) { + auto V = DbgValueMap.find(VI); + if (V != DbgValueMap.end()) { + auto *DbgII = cast(V->second); + Instruction *NewDbgII = DbgII->clone(); + auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); + NewDbgII->setOperand(0, PhiMAV); + BasicBlock *Parent = PHI->getParent(); + NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); + } + } + } +} + +/// Rotate loop LP. Return true if the loop is rotated. 
+/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { + // If the loop has only one block then there is not much to rotate. + if (L->getBlocks().size() == 1) + return false; + + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return false; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return false; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) + return false; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); + return false; + } + if (Metrics.convergent) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); + return false; + } + if (Metrics.NumInsts > MaxHeaderSize) + return false; + } + + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader) + return false; + + if (isa(OrigPreheader->getTerminator())) { + DEBUG(dbgs() << "LoopRotation: Splitting header due to sync terminator.\n"); + BasicBlock *NewPreheader = SplitEdge(OrigPreheader, OrigHeader, DT, LI); + // SyncInst::Create(NewPreheader, OrigPreheader->getTerminator()); + // OrigPreheader->getTerminator()->eraseFromParent(); + OrigPreheader = NewPreheader; + } + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. + if (SE) + SE->forgetLoop(L); + + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast(I); ++I) + ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair, DIExpression *>; + auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !isa(Inst) && + !isa(Inst) && !isa(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Avoid inserting the same intrinsic twice. + if (auto *DII = dyn_cast(C)) + if (DbgIntrinsics.count(makeHash(DII))) { + C->deleteValue(); + continue; + } + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + ValueMap[Inst] = V; + if (!C->mayHaveSideEffects()) { + C->deleteValue(); + C = nullptr; + } + } else { + ValueMap[Inst] = C; + } + if (C) { + // Otherwise, stick the new instruction into the new block! 
+ C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + + if (auto *II = dyn_cast(C)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); + } + } + + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + TerminatorInst *TI = OrigHeader->getTerminator(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + + + SmallVector InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, + &InsertedPHIs); + + // Attach dbg.value intrinsics to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValues(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector Updates; + Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); + Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); + Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); + DT->applyUpdates(Updates); + } + + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa(PHBI->getCondition()) || + PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != + NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. 
+ SmallVector ExitPreds(pred_begin(Exit), pred_end(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. + Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit)) + continue; + if (isa(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) DT->deleteEdge(OrigPreheader, Exit); + } + + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + MergeBlockIntoPredecessor(OrigHeader, DT, LI); + + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + + ++NumRotated; + return true; +} + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + LLVM_FALLTHROUGH; + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = + !isa(I->getOperand(0)) + ? I->getOperand(0) + : !isa(I->getOperand(1)) ? I->getOperand(1) : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. 
In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +bool LoopRotate::simplifyLoopLatch(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. + LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L, and return true if any modification was made. +bool LoopRotate::processLoop(Loop *L) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L); + + bool MadeChange = rotateLoop(L, SimplifiedLatch); + assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && + "Loop latch should be exiting after loop-rotate."); + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange || SimplifiedLatch; +} + LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) : EnableHeaderDuplication(EnableHeaderDuplication) {} diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 2f6ed05c023b1e..c4dccc91b54056 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -611,6 +611,14 @@ class SCCPSolver : public InstVisitor { void visitReturnInst(ReturnInst &I); void visitTerminator(Instruction &TI); + void visitReattachInst(ReattachInst &I) { + markOverdefined(&I); + visitTerminator(I); + } + void visitSyncInst(SyncInst &I) { + markOverdefined(&I); + visitTerminator(I); + } void visitCastInst(CastInst &I); void visitSelectInst(SelectInst &I); @@ -734,6 +742,13 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, return; } + if (isa(&TI) || + isa(&TI) || + isa(&TI)) { + // All destinations are executable. 
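+    // (A detach has two successors -- the detached block and the
+    // continuation -- while reattach and sync each have exactly one.)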
+ Succs.assign(TI.getNumSuccessors(), true); + return; + } LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n'); llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } @@ -745,6 +760,66 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) return KnownFeasibleEdges.count(Edge(From, To)); + assert(BBExecutable.count(To) && "Dest should always be alive!"); + + // Make sure the source basic block is executable!! + if (!BBExecutable.count(From)) return false; + + // Check to make sure this edge itself is actually feasible now. + TerminatorInst *TI = From->getTerminator(); + if (auto *BI = dyn_cast(TI)) { + if (BI->isUnconditional()) + return true; + + LatticeVal BCValue = getValueState(BI->getCondition()); + + // Overdefined condition variables mean the branch could go either way, + // undef conditions mean that neither edge is feasible yet. + ConstantInt *CI = BCValue.getConstantInt(); + if (!CI) + return !BCValue.isUnknown(); + + // Constant condition variables mean the branch can only go a single way. + return BI->getSuccessor(CI->isZero()) == To; + } + + // Unwinding instructions successors are always executable. + if (TI->isExceptional()) + return true; + + if (auto *SI = dyn_cast(TI)) { + if (SI->getNumCases() < 1) + return true; + + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (!CI) + return !SCValue.isUnknown(); + + return SI->findCaseValue(CI)->getCaseSuccessor() == To; + } + + // In case of indirect branch and its address is a blockaddress, we mark + // the target as executable. + if (auto *IBR = dyn_cast(TI)) { + LatticeVal IBRValue = getValueState(IBR->getAddress()); + BlockAddress *Addr = IBRValue.getBlockAddress(); + + if (!Addr) + return !IBRValue.isUnknown(); + + // At this point, the indirectbr is branching on a blockaddress. + return Addr->getBasicBlock() == To; + } + + if (isa(TI) || + isa(TI) || + isa(TI)) + return true; + + LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n'); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 68ca6c47c8f1a4..cef9cac89db330 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3867,6 +3867,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // a direct store) as needing to be resplit because it is no longer // promotable. 
if (AllocaInst *OtherAI = dyn_cast(StoreBasePtr)) { + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(OtherAI, *DT)) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -3983,6 +3986,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (!SplitLoads) { if (AllocaInst *OtherAI = dyn_cast(LoadBasePtr)) { assert(OtherAI != &AI && "We can't re-split our own alloca!"); + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(OtherAI, *DT)) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -4152,9 +4158,16 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, break; } + // Check if any detaches block promotion. + Promotable &= (!FunctionContainsDetach || + isAllocaParallelPromotable(NewAI, *DT)); + if (Promotable) { if (PHIUsers.empty() && SelectUsers.empty()) { // Promote the alloca. + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(NewAI, *DT)) && + "Alloca must be promotable"); PromotableAllocas.push_back(NewAI); } else { // If we have either PHIs or Selects to speculate, add them to those @@ -4496,11 +4509,28 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, DT = &RunDT; AC = &RunAC; - BasicBlock &EntryBB = F.getEntryBlock(); - for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) { - if (AllocaInst *AI = dyn_cast(I)) - Worklist.insert(AI); + // BasicBlock &EntryBB = F.getEntryBlock(); + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. 
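+  // A block is the entry of a detached sub-CFG when its unique predecessor is
+  // terminated by a detach whose detached successor is that block.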
+ SmallVector EntryBlocks; + FunctionContainsDetach = false; + EntryBlocks.push_back(&F.getEntryBlock()); + for (BasicBlock &BB : F) + if (BasicBlock *Pred = BB.getUniquePredecessor()) + if (DetachInst *DI = dyn_cast(Pred->getTerminator())) { + FunctionContainsDetach = true; + if (DI->getDetached() == &BB) + EntryBlocks.push_back(&BB); + } + + for (BasicBlock *BB : EntryBlocks) { + BasicBlock &EntryBB = *BB; + for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); + I != E; ++I) { + if (AllocaInst *AI = dyn_cast(I)) + Worklist.insert(AI); + } } bool Changed = false; diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index 976daf4c78c2fd..67571aeeaf12c6 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLegacyLoopSinkPassPass(Registry); initializeLoopDataPrefetchLegacyPassPass(Registry); initializeLoopDeletionLegacyPassPass(Registry); + initializeLoopFusePass(Registry); initializeLoopAccessLegacyAnalysisPass(Registry); initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index b7b1db76b49237..f60e856a4d4285 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -38,6 +38,8 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include @@ -143,6 +145,71 @@ static bool mergeEmptyReturnBlocks(Function &F) { return Changed; } +static bool removeUselessSyncs(Function &F) { + bool Changed = false; + // Scan all the blocks in the function + check: + for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { + BasicBlock *BB = &*BBI++; + if (SyncInst *Sync = dyn_cast(BB->getTerminator())) { + // Walk the CFG backwards to try to find a reaching detach instruction. + bool ReachingDetach = false; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(BB); + while (!WorkList.empty()) { + BasicBlock *PBB = WorkList.pop_back_val(); + if (!Visited.insert(PBB).second) + continue; + + for (pred_iterator PI = pred_begin(PBB), PE = pred_end(PBB); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PT = Pred->getTerminator(); + // Stop the traversal at the entry block of a detached CFG. + if (DetachInst *DI = dyn_cast(PT)) { + if (DI->getDetached() == PBB) + continue; + else // DI->getContinue() == PBB + // This detach reaches the sync through the continuation edge. + ReachingDetach = true; + } + if (ReachingDetach) + break; + + // Ignore predecessors via a reattach, which belong to child detached + // contexts. + if (isa(PT)) + continue; + + // For a predecessor terminated by a sync instruction, check the sync + // region it belongs to. If the sync belongs to a different sync + // region, add the block that starts that region. Otherwise, ignore + // the predecessor. 
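+          // (A sync in the same region already waits on every detach that
+          // reaches it, so those detaches cannot make this sync necessary.)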
+ if (SyncInst *SI = dyn_cast(PT)) { + if (SI->getSyncRegion() != Sync->getSyncRegion()) + for (User *U : SI->getSyncRegion()->users()) + if (isa(U)) + WorkList.push_back(cast(U)->getParent()); + continue; + } + + WorkList.push_back(Pred); + } + } + + // If no detach reaches this sync, then this sync can be removed. + if (!ReachingDetach) { + BasicBlock* Succ = Sync->getSuccessor(0); + ReplaceInstWithInst(Sync, BranchInst::Create(Succ)); + Changed = true; + if (MergeBlockIntoPredecessor(Succ)) goto check; + } + } + } + return Changed; +} + /// Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, @@ -176,6 +243,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); EverChanged |= iterativelySimplifyCFG(F, TTI, Options); + EverChanged |= removeUselessSyncs(F); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -191,6 +259,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, do { EverChanged = iterativelySimplifyCFG(F, TTI, Options); EverChanged |= removeUnreachableBlocks(F); + EverChanged |= removeUselessSyncs(F); } while (EverChanged); return true; diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 0f6db21f73b60e..7a24ab744b4b75 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -684,6 +684,38 @@ static bool eliminateRecursiveTailCall( return true; } +static void getReturnBlocksToSync( + BasicBlock *Entry, SyncInst *Sync, + SmallVectorImpl &ReturnBlocksToSync) { + // Walk the CFG from the entry block, stopping traversal at any sync within + // the same region. Record all blocks found that are terminated by a return + // instruction. + Value *SyncRegion = Sync->getSyncRegion(); + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(Entry); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Skip paths that are synced within the same region. + if (SyncInst *SI = dyn_cast(BB->getTerminator())) + if (SI->getSyncRegion() == SyncRegion) + continue; + + // If we find a return, we must add a sync before it if we eliminate a + // recursive tail call. + if (isa(BB->getTerminator())) + ReturnBlocksToSync.push_back(BB); + + // Queue up successors to search. + for (BasicBlock *Succ : successors(BB)) + if (Succ != Sync->getParent()) + WorkList.push_back(Succ); + } +} + static bool foldReturnAndProcessPred( BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl &ArgumentPHIs, @@ -700,13 +732,17 @@ static bool foldReturnAndProcessPred( // predecessors and perform TRE there. Look for predecessors that end // in unconditional branch and recursive call(s). 
SmallVector UncondBranchPreds; + SmallVector SyncPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *Pred = *PI; Instruction *PTI = Pred->getTerminator(); if (BranchInst *BI = dyn_cast(PTI)) if (BI->isUnconditional()) UncondBranchPreds.push_back(BI); + if (SyncInst *SI = dyn_cast(PTI)) + SyncPreds.push_back(SI); } + BasicBlock *OldEntryBlock = &BB->getParent()->getEntryBlock(); while (!UncondBranchPreds.empty()) { BranchInst *BI = UncondBranchPreds.pop_back_val(); @@ -730,6 +766,68 @@ static bool foldReturnAndProcessPred( } } + // If this loop runs, then the previous one could not have erased BB, because + // BB has a predecessor that is not an unconditional branch. + while (!SyncPreds.empty()) { + SyncInst *SI = SyncPreds.pop_back_val(); + BasicBlock *Pred = SI->getParent(); + if (CallInst *CI = + findTRECandidate(SI, CannotTailCallElimCallsMarkedTail, TTI)) { + // Check that all instructions between the candidate tail call and the + // sync can be moved above the call. In particular, we disallow + // accumulator recursion elimination for tail calls before a sync. + BasicBlock::iterator BBI(CI); + for (++BBI; &*BBI != SI; ++BBI) + if (!canMoveAboveCall(&*BBI, CI, AA)) + break; + if (&*BBI != SI) + continue; + + // Get the sync region for this sync. + Value *SyncRegion = SI->getSyncRegion(); + + // Check that the sync region begins in the entry block of the function. + if (cast(SyncRegion)->getParent() != OldEntryBlock) { + DEBUG(dbgs() << "Cannot eliminate tail call " << *CI << + ": sync region does not start in entry block."); + continue; + } + + // Get returns reachable from newly created loop. + SmallVector ReturnBlocksToSync; + getReturnBlocksToSync(OldEntryBlock, SI, ReturnBlocksToSync); + + // Remove the sync. + ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred); + + // Cleanup: if all predecessors of BB have been eliminated by + // FoldReturnIntoUncondBranch, delete it. It is important to empty it, + // because the ret instruction in there is still using a value which + // eliminateRecursiveTailCall will attempt to remove. + if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) + BB->eraseFromParent(); + + bool EliminatedTail = + eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, AA); + + // If a recursive tail was eliminated, fix up the syncs and sync region in + // the CFG. + if (EliminatedTail) { + // Move the sync region start to the new entry block. + BasicBlock *NewEntry = &OldEntry->getParent()->getEntryBlock(); + cast(SyncRegion)->moveBefore(&*(NewEntry->begin())); + // Insert syncs before relevant return blocks. 
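+        // Each such return block is split so that the sync becomes its new
+        // terminator and the original return moves to the split-off block.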
+ for (BasicBlock *RetBlock : ReturnBlocksToSync) { + BasicBlock *NewRetBlock = SplitBlock(RetBlock, + RetBlock->getTerminator()); + ReplaceInstWithInst(RetBlock->getTerminator(), + SyncInst::Create(NewRetBlock, SyncRegion)); + } + Change = true; + } + } + } return Change; } diff --git a/llvm/lib/Transforms/Tapir/CMakeLists.txt b/llvm/lib/Transforms/Tapir/CMakeLists.txt new file mode 100644 index 00000000000000..568558d64e84ae --- /dev/null +++ b/llvm/lib/Transforms/Tapir/CMakeLists.txt @@ -0,0 +1,18 @@ +add_llvm_library(LLVMTapirOpts + CilkABI.cpp + SmallBlock.cpp + RedundantSpawn.cpp + SpawnRestructure.cpp + SpawnUnswitch.cpp + SyncElimination.cpp + LowerToCilk.cpp + LoopSpawning.cpp + Outline.cpp + Tapir.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Tapir + ) + +add_dependencies(LLVMTapirOpts intrinsics_gen) diff --git a/llvm/lib/Transforms/Tapir/CilkABI.cpp b/llvm/lib/Transforms/Tapir/CilkABI.cpp new file mode 100644 index 00000000000000..bf679d2e0c5377 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/CilkABI.cpp @@ -0,0 +1,1344 @@ +//===- CilkABI.cpp - Lower Tapir into Cilk runtime system calls -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the CilkABI interface, which is used to convert Tapir +// instructions -- detach, reattach, and sync -- to calls into the Cilk +// runtime system. This interface does the low-level dirty work of passes +// such as LowerToCilk. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilkabi" + +/// Helper typedefs for cilk struct TypeBuilders. +typedef llvm::TypeBuilder<__cilkrts_stack_frame, false> StackFrameBuilder; +typedef llvm::TypeBuilder<__cilkrts_worker, false> WorkerBuilder; +typedef llvm::TypeBuilder<__cilkrts_pedigree, false> PedigreeBuilder; + +/// Helper methods for storing to and loading from struct fields. +static Value *GEP(IRBuilder<> &B, Value *Base, int field) { + // return B.CreateStructGEP(cast(Base->getType()), + // Base, field); + return B.CreateConstInBoundsGEP2_32(nullptr, Base, 0, field); +} + +static void StoreField(IRBuilder<> &B, Value *Val, Value *Dst, int field, + bool isVolatile = false) { + B.CreateStore(Val, GEP(B, Dst, field), isVolatile); +} + +static Value *LoadField(IRBuilder<> &B, Value *Src, int field, + bool isVolatile = false) { + return B.CreateLoad(GEP(B, Src, field), isVolatile); +} + +/// \brief Emit inline assembly code to save the floating point +/// state, for x86 Only. 
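+/// The two values are stored into the frame's mxcsr and fpcsr fields;
+/// EmitCilkSetJmp calls this before saving the jump buffer in the same frame.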
+static void EmitSaveFloatingPointState(IRBuilder<> &B, Value *SF) { + typedef void (AsmPrototype)(uint32_t*, uint16_t*); + llvm::FunctionType *FTy = + TypeBuilder::get(B.getContext()); + + Value *Asm = InlineAsm::get(FTy, + "stmxcsr $0\n\t" "fnstcw $1", + "*m,*m,~{dirflag},~{fpsr},~{flags}", + /*sideeffects*/ true); + + Value * args[2] = { + GEP(B, SF, StackFrameBuilder::mxcsr), + GEP(B, SF, StackFrameBuilder::fpcsr) + }; + + B.CreateCall(Asm, args); +} + +/// \brief Helper to find a function with the given name, creating it if it +/// doesn't already exist. If the function needed to be created then return +/// false, signifying that the caller needs to add the function body. +template +static bool GetOrCreateFunction(const char *FnName, Module& M, + Function *&Fn, + Function::LinkageTypes Linkage = + Function::InternalLinkage, + bool DoesNotThrow = true) { + LLVMContext &Ctx = M.getContext(); + + Fn = M.getFunction(FnName); + + // if the function already exists then let the + // caller know that it is complete + if (Fn) + return true; + + // Otherwise we have to create it + FunctionType *FTy = TypeBuilder::get(Ctx); + Fn = Function::Create(FTy, Linkage, FnName, &M); + + // Set nounwind if it does not throw. + if (DoesNotThrow) + Fn->setDoesNotThrow(); + + // and let the caller know that the function is incomplete + // and the body still needs to be added + return false; +} + +/// \brief Emit a call to the CILK_SETJMP function. +static CallInst *EmitCilkSetJmp(IRBuilder<> &B, Value *SF, Module& M) { + LLVMContext &Ctx = M.getContext(); + + // We always want to save the floating point state too + EmitSaveFloatingPointState(B, SF); + + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int8PtrTy = Type::getInt8PtrTy(Ctx); + + // Get the buffer to store program state + // Buffer is a void**. + Value *Buf = GEP(B, SF, StackFrameBuilder::ctx); + + // Store the frame pointer in the 0th slot + Value *FrameAddr = + B.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + ConstantInt::get(Int32Ty, 0)); + + Value *FrameSaveSlot = GEP(B, Buf, 0); + B.CreateStore(FrameAddr, FrameSaveSlot, /*isVolatile=*/true); + + // Store stack pointer in the 2nd slot + Value *StackAddr = B.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + Value *StackSaveSlot = GEP(B, Buf, 2); + B.CreateStore(StackAddr, StackSaveSlot, /*isVolatile=*/true); + + Buf = B.CreateBitCast(Buf, Int8PtrTy); + + // Call LLVM's EH setjmp, which is lightweight. + Value* F = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); + + CallInst *SetjmpCall = B.CreateCall(F, Buf); + SetjmpCall->setCanReturnTwice(); + + return SetjmpCall; +} + +/// \brief Get or create a LLVM function for __cilkrts_pop_frame. 
+/// It is equivalent to the following C code +/// +/// __cilkrts_pop_frame(__cilkrts_stack_frame *sf) { +/// sf->worker->current_stack_frame = sf->call_parent; +/// sf->call_parent = 0; +/// } +static Function *Get__cilkrts_pop_frame(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_pop_frame", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // sf->worker->current_stack_frame = sf.call_parent; + StoreField(B, + LoadField(B, SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true), + LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true), + WorkerBuilder::current_stack_frame, + /*isVolatile=*/true); + + // sf->call_parent = 0; + StoreField(B, + Constant::getNullValue( + TypeBuilder<__cilkrts_stack_frame*, false>::get(Ctx)), + SF, StackFrameBuilder::call_parent, /*isVolatile=*/true); + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_detach. +/// It is equivalent to the following C code +/// +/// void __cilkrts_detach(struct __cilkrts_stack_frame *sf) { +/// struct __cilkrts_worker *w = sf->worker; +/// struct __cilkrts_stack_frame *volatile *tail = w->tail; +/// +/// sf->spawn_helper_pedigree = w->pedigree; +/// sf->call_parent->parent_pedigree = w->pedigree; +/// +/// w->pedigree.rank = 0; +/// w->pedigree.next = &sf->spawn_helper_pedigree; +/// +/// *tail++ = sf->call_parent; +/// w->tail = tail; +/// +/// sf->flags |= CILK_FRAME_DETACHED; +/// } +static Function *Get__cilkrts_detach(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_detach", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // struct __cilkrts_worker *w = sf->worker; + Value *W = LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true); + + // __cilkrts_stack_frame *volatile *tail = w->tail; + Value *Tail = LoadField(B, W, WorkerBuilder::tail, + /*isVolatile=*/true); + + // sf->spawn_helper_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + SF, StackFrameBuilder::parent_pedigree); + + // sf->call_parent->parent_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + LoadField(B, SF, StackFrameBuilder::call_parent), + StackFrameBuilder::parent_pedigree); + + // w->pedigree.rank = 0; + { + StructType *STy = PedigreeBuilder::get(Ctx); + llvm::Type *Ty = STy->getElementType(PedigreeBuilder::rank); + StoreField(B, + ConstantInt::get(Ty, 0), + GEP(B, W, WorkerBuilder::pedigree), + PedigreeBuilder::rank); + } + + // w->pedigree.next = &sf->spawn_helper_pedigree; + StoreField(B, + GEP(B, SF, StackFrameBuilder::parent_pedigree), + GEP(B, W, WorkerBuilder::pedigree), + PedigreeBuilder::next); + + // *tail++ = sf->call_parent; + B.CreateStore(LoadField(B, SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true), + Tail, /*isVolatile=*/true); + Tail = B.CreateConstGEP1_32(Tail, 1); + + // w->tail = tail; + StoreField(B, Tail, W, WorkerBuilder::tail, /*isVolatile=*/true); + + // sf->flags |= CILK_FRAME_DETACHED; + { + Value *F = LoadField(B, SF, 
StackFrameBuilder::flags, /*isVolatile=*/true); + F = B.CreateOr(F, ConstantInt::get(F->getType(), CILK_FRAME_DETACHED)); + StoreField(B, F, SF, StackFrameBuilder::flags, /*isVolatile=*/true); + } + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilk_sync. +/// Calls to this function is always inlined, as it saves +/// the current stack/frame pointer values. This function must be marked +/// as returns_twice to allow it to be inlined, since the call to setjmp +/// is marked returns_twice. +/// +/// It is equivalent to the following C code +/// +/// void __cilk_sync(struct __cilkrts_stack_frame *sf) { +/// if (sf->flags & CILK_FRAME_UNSYNCHED) { +/// sf->parent_pedigree = sf->worker->pedigree; +/// SAVE_FLOAT_STATE(*sf); +/// if (!CILK_SETJMP(sf->ctx)) +/// __cilkrts_sync(sf); +/// else if (sf->flags & CILK_FRAME_EXCEPTING) +/// __cilkrts_rethrow(sf); +/// } +/// ++sf->worker->pedigree.rank; +/// } +/// +/// With exceptions disabled in the compiler, the function +/// does not call __cilkrts_rethrow() +static Function *GetCilkSyncFn(Module &M, bool instrument = false) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilk_sync", M, Fn, + Function::InternalLinkage, + /*doesNotThrow*/false)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "cilk.sync.test", Fn); + BasicBlock *SaveState = BasicBlock::Create(Ctx, "cilk.sync.savestate", Fn); + BasicBlock *SyncCall = BasicBlock::Create(Ctx, "cilk.sync.runtimecall", Fn); + BasicBlock *Excepting = BasicBlock::Create(Ctx, "cilk.sync.excepting", Fn); + // TODO: Detect whether exceptions are needed. 
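The blocks being built here implement the bit tests from the C sketch above: __cilk_sync does real work only when CILK_FRAME_UNSYNCHED is set, and rethrows only when CILK_FRAME_EXCEPTING is set. The actual bit values come from the Cilk runtime definitions this file includes; the constants in the following standalone C++ sketch are placeholders chosen only to make the two predicates concrete.

    #include <cassert>
    #include <cstdint>

    // Placeholder bit assignments for illustration; the real values are
    // defined by the Cilk runtime ABI, not by this sketch.
    constexpr uint32_t kFrameUnsynched = 0x02;
    constexpr uint32_t kFrameDetached  = 0x04;
    constexpr uint32_t kFrameExcepting = 0x10;

    // __cilk_sync only calls into the runtime when children may be outstanding.
    bool needsRuntimeSync(uint32_t flags) { return (flags & kFrameUnsynched) != 0; }

    // After returning from the runtime via setjmp, rethrow a pending exception.
    bool needsRethrow(uint32_t flags) { return (flags & kFrameExcepting) != 0; }

    int main() {
      uint32_t flags = kFrameUnsynched | kFrameDetached;
      assert(needsRuntimeSync(flags) && !needsRethrow(flags));
      return 0;
    }

In the IR generated below, the same tests appear as an 'and' with the flag constant followed by a compare against zero, with the zero case branching straight to the exit block.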
+ BasicBlock *Rethrow = BasicBlock::Create(Ctx, "cilk.sync.rethrow", Fn); + BasicBlock *Exit = BasicBlock::Create(Ctx, "cilk.sync.end", Fn); + + // Entry + { + IRBuilder<> B(Entry); + + if (instrument) + // cilk_sync_begin + B.CreateCall(CILK_CSI_FUNC(sync_begin, M), SF); + + // if (sf->flags & CILK_FRAME_UNSYNCHED) + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = B.CreateAnd(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_UNSYNCHED)); + Value *Zero = ConstantInt::get(Flags->getType(), 0); + Value *Unsynced = B.CreateICmpEQ(Flags, Zero); + B.CreateCondBr(Unsynced, Exit, SaveState); + } + + // SaveState + { + IRBuilder<> B(SaveState); + + // sf.parent_pedigree = sf.worker->pedigree; + StoreField(B, + LoadField(B, LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true), + WorkerBuilder::pedigree), + SF, StackFrameBuilder::parent_pedigree); + + // if (!CILK_SETJMP(sf.ctx)) + Value *C = EmitCilkSetJmp(B, SF, M); + C = B.CreateICmpEQ(C, ConstantInt::get(C->getType(), 0)); + B.CreateCondBr(C, SyncCall, Excepting); + } + + // SyncCall + { + IRBuilder<> B(SyncCall); + + // __cilkrts_sync(&sf); + B.CreateCall(CILKRTS_FUNC(sync, M), SF); + B.CreateBr(Exit); + } + + // Excepting + { + IRBuilder<> B(Excepting); + if (Rethrow) { + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = B.CreateAnd(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_EXCEPTING)); + Value *Zero = ConstantInt::get(Flags->getType(), 0); + Value *CanExcept = B.CreateICmpEQ(Flags, Zero); + B.CreateCondBr(CanExcept, Exit, Rethrow); + } else { + B.CreateBr(Exit); + } + } + + // Rethrow + if (Rethrow) { + IRBuilder<> B(Rethrow); + B.CreateCall(CILKRTS_FUNC(rethrow, M), SF)->setDoesNotReturn(); + B.CreateUnreachable(); + } + + // Exit + { + IRBuilder<> B(Exit); + + // ++sf.worker->pedigree.rank; + Value *Rank = LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true); + Rank = GEP(B, Rank, WorkerBuilder::pedigree); + Rank = GEP(B, Rank, PedigreeBuilder::rank); + B.CreateStore(B.CreateAdd( + B.CreateLoad(Rank), + ConstantInt::get(Rank->getType()->getPointerElementType(), + 1)), + Rank); + if (instrument) + // cilk_sync_end + B.CreateCall(CILK_CSI_FUNC(sync_end, M), SF); + + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::AlwaysInline); + Fn->addFnAttr(Attribute::ReturnsTwice); + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_enter_frame. 
+/// It is equivalent to the following C code +/// +/// void __cilkrts_enter_frame_1(struct __cilkrts_stack_frame *sf) +/// { +/// struct __cilkrts_worker *w = __cilkrts_get_tls_worker(); +/// if (w == 0) { /* slow path, rare */ +/// w = __cilkrts_bind_thread_1(); +/// sf->flags = CILK_FRAME_LAST | CILK_FRAME_VERSION; +/// } else { +/// sf->flags = CILK_FRAME_VERSION; +/// } +/// sf->call_parent = w->current_stack_frame; +/// sf->worker = w; +/// /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */ +/// w->current_stack_frame = sf; +/// } +static Function *Get__cilkrts_enter_frame_1(Module &M) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilkrts_enter_frame_1", M, Fn)) + return Fn; + + LLVMContext &Ctx = M.getContext(); + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + BasicBlock *SlowPath = BasicBlock::Create(Ctx, "slowpath", Fn); + BasicBlock *FastPath = BasicBlock::Create(Ctx, "fastpath", Fn); + BasicBlock *Cont = BasicBlock::Create(Ctx, "cont", Fn); + + llvm::PointerType *WorkerPtrTy = + TypeBuilder<__cilkrts_worker*, false>::get(Ctx); + StructType *SFTy = StackFrameBuilder::get(Ctx); + + // Block (Entry) + CallInst *W = nullptr; + { + IRBuilder<> B(Entry); + if (fastCilk) + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M)); + else + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M)); + + Value *Cond = B.CreateICmpEQ(W, ConstantPointerNull::get(WorkerPtrTy)); + B.CreateCondBr(Cond, SlowPath, FastPath); + } + // Block (SlowPath) + CallInst *Wslow = nullptr; + { + IRBuilder<> B(SlowPath); + Wslow = B.CreateCall(CILKRTS_FUNC(bind_thread_1, M)); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_LAST | CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + B.CreateBr(Cont); + } + // Block (FastPath) + { + IRBuilder<> B(FastPath); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + B.CreateBr(Cont); + } + // Block (Cont) + { + IRBuilder<> B(Cont); + Value *Wfast = W; + PHINode *W = B.CreatePHI(WorkerPtrTy, 2); + W->addIncoming(Wslow, SlowPath); + W->addIncoming(Wfast, FastPath); + + StoreField(B, + LoadField(B, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true), + SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true); + + StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true); + StoreField(B, SF, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true); + + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_enter_frame_fast. 
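Get__cilkrts_enter_frame_1 above emits the rare slow path that binds a worker to the current thread the first time Cilk code runs on it, and the common fast path that simply reuses the worker returned by the TLS lookup. The following standalone C++ analogy uses a thread_local cache to show the same fast/slow split; Worker, getWorker and bindThreadSlow are invented names that stand in for the runtime's __cilkrts_get_tls_worker and __cilkrts_bind_thread_1, which this pass only declares and calls.

    #include <cstdio>

    struct Worker { int id; };

    Worker *bindThreadSlow() {              // stands in for __cilkrts_bind_thread_1
      static thread_local Worker w{0};
      std::puts("slow path: binding a worker to this thread");
      return &w;
    }

    Worker *getWorker() {                   // stands in for the TLS worker lookup
      static thread_local Worker *cached = nullptr;
      if (!cached)                          // slow path, taken once per thread
        cached = bindThreadSlow();
      return cached;                        // fast path on every later call
    }

    int main() {
      Worker *a = getWorker();              // binds
      Worker *b = getWorker();              // reuses the bound worker
      return a == b ? 0 : 1;
    }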
+/// It is equivalent to the following C code +/// +/// void __cilkrts_enter_frame_fast_1(struct __cilkrts_stack_frame *sf) +/// { +/// struct __cilkrts_worker *w = __cilkrts_get_tls_worker(); +/// sf->flags = CILK_FRAME_VERSION; +/// sf->call_parent = w->current_stack_frame; +/// sf->worker = w; +/// /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */ +/// w->current_stack_frame = sf; +/// } +static Function *Get__cilkrts_enter_frame_fast_1(Module &M) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilkrts_enter_frame_fast_1", M, Fn)) + return Fn; + + LLVMContext &Ctx = M.getContext(); + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + + IRBuilder<> B(Entry); + Value *W; + + if (fastCilk) + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M)); + else + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M)); + + StructType *SFTy = StackFrameBuilder::get(Ctx); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + StoreField(B, + LoadField(B, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true), + SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true); + StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true); + StoreField(B, SF, W, WorkerBuilder::current_stack_frame, /*isVolatile=*/true); + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +// /// \brief Get or create a LLVM function for __cilk_parent_prologue. +// /// It is equivalent to the following C code +// /// +// /// void __cilk_parent_prologue(__cilkrts_stack_frame *sf) { +// /// __cilkrts_enter_frame_1(sf); +// /// } +// static Function *GetCilkParentPrologue(Module &M) { +// Function *Fn = 0; + +// if (GetOrCreateFunction("__cilk_parent_prologue", M, Fn)) +// return Fn; + +// // If we get here we need to add the function body +// LLVMContext &Ctx = M.getContext(); + +// Function::arg_iterator args = Fn->arg_begin(); +// Value *SF = &*args; + +// BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); +// IRBuilder<> B(Entry); + +// // __cilkrts_enter_frame_1(sf) +// B.CreateCall(CILKRTS_FUNC(enter_frame_1, M), SF); + +// B.CreateRetVoid(); + +// Fn->addFnAttr(Attribute::InlineHint); + +// return Fn; +// } + +/// \brief Get or create a LLVM function for __cilk_parent_epilogue. 
+/// It is equivalent to the following C code +/// +/// void __cilk_parent_epilogue(__cilkrts_stack_frame *sf) { +/// __cilkrts_pop_frame(sf); +/// if (sf->flags != CILK_FRAME_VERSION) +/// __cilkrts_leave_frame(sf); +/// } +static Function *GetCilkParentEpilogue(Module &M, bool instrument = false) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilk_parent_epilogue", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn), + *B1 = BasicBlock::Create(Ctx, "body", Fn), + *Exit = BasicBlock::Create(Ctx, "exit", Fn); + + // Entry + { + IRBuilder<> B(Entry); + + if (instrument) + // cilk_leave_begin + B.CreateCall(CILK_CSI_FUNC(leave_begin, M), SF); + + // __cilkrts_pop_frame(sf) + B.CreateCall(CILKRTS_FUNC(pop_frame, M), SF); + + // if (sf->flags != CILK_FRAME_VERSION) + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Value *Cond = B.CreateICmpNE(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_VERSION)); + B.CreateCondBr(Cond, B1, Exit); + } + + // B1 + { + IRBuilder<> B(B1); + + // __cilkrts_leave_frame(sf); + B.CreateCall(CILKRTS_FUNC(leave_frame, M), SF); + B.CreateBr(Exit); + } + + // Exit + { + IRBuilder<> B(Exit); + if (instrument) + // cilk_leave_end + B.CreateCall(CILK_CSI_FUNC(leave_end, M)); + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +static const StringRef stack_frame_name = "__cilkrts_sf"; +static const StringRef worker8_name = "__cilkrts_wc8"; + +// static llvm::Value *LookupStackFrame(Function &F) { +// return F.getValueSymbolTable()->lookup(stack_frame_name); +// } + +/// \brief Create the __cilkrts_stack_frame for the spawning function. 
+static AllocaInst *CreateStackFrame(Function &F) { + // assert(!LookupStackFrame(F) && "already created the stack frame"); + + LLVMContext &Ctx = F.getContext(); + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *SFTy = StackFrameBuilder::get(Ctx); + + Instruction *I = F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime(); + + AllocaInst *SF = new AllocaInst(SFTy, DL.getAllocaAddrSpace(), + /*size*/nullptr, 8, + /*name*/stack_frame_name, /*insert before*/I); + if (!I) + F.getEntryBlock().getInstList().push_back(SF); + + return SF; +} + +Value* GetOrInitCilkStackFrame(Function& F, + ValueToValueMapTy &DetachCtxToStackFrame, + bool Helper = true, bool instrument = false) { + // Value* V = LookupStackFrame(F); + Value *V = DetachCtxToStackFrame[&F]; + if (V) return V; + + AllocaInst* alloc = CreateStackFrame(F); + DetachCtxToStackFrame[&F] = alloc; + BasicBlock::iterator II = F.getEntryBlock().getFirstInsertionPt(); + AllocaInst* curinst; + do { + curinst = dyn_cast(II); + II++; + } while (curinst != alloc); + Value *StackSave; + IRBuilder<> IRB(&(F.getEntryBlock()), II); + + if (instrument) { + Type *Int8PtrTy = IRB.getInt8PtrTy(); + Value *ThisFn = ConstantExpr::getBitCast(&F, Int8PtrTy); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(), + Intrinsic::returnaddress), + IRB.getInt32(0)); + StackSave = + IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(), + Intrinsic::stacksave)); + if (Helper) { + Value *begin_args[3] = { alloc, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *F.getParent()), + begin_args); + } else { + Value *begin_args[4] = { IRB.getInt32(0), alloc, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *F.getParent()), begin_args); + } + } + Value *args[1] = { alloc }; + if (Helper) + IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *F.getParent()), args); + else + IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *F.getParent()), args); + /* inst->insertAfter(alloc); */ + + if (instrument) { + Value* end_args[2] = { alloc, StackSave }; + IRB.CreateCall(CILK_CSI_FUNC(enter_end, *F.getParent()), end_args); + } + + EscapeEnumerator EE(F, "cilkabi_epilogue", false); + while (IRBuilder<> *AtExit = EE.Next()) { + if (isa(AtExit->GetInsertPoint())) + AtExit->CreateCall(GetCilkParentEpilogue(*F.getParent(), instrument), + args, ""); + } + + // // The function exits are unified before lowering. + // ReturnInst *retInst = nullptr; + // for (BasicBlock &BB : F) { + // TerminatorInst* TI = BB.getTerminator(); + // if (!TI) continue; + // if (ReturnInst* RI = llvm::dyn_cast(TI)) { + // assert(!retInst && "Multiple returns found."); + // retInst = RI; + // } + // } + + // assert(retInst && "No returns found."); + // CallInst::Create(GetCilkParentEpilogue(*F.getParent(), instrument), args, "", + // retInst); + return alloc; +} + +static inline +bool makeFunctionDetachable(Function &extracted, + ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument = false) { + Module *M = extracted.getParent(); + // LLVMContext& Context = extracted.getContext(); + // const DataLayout& DL = M->getDataLayout(); + /* + __cilkrts_stack_frame sf; + __cilkrts_enter_frame_fast_1(&sf); + __cilkrts_detach(); + *x = f(y); + */ + + Value *SF = CreateStackFrame(extracted); + DetachCtxToStackFrame[&extracted] = SF; + assert(SF); + Value *args[1] = { SF }; + + // Scan function to see if it detaches. 
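GetOrInitCilkStackFrame above gives a spawning function the standard Cilk prologue and epilogue: allocate one __cilkrts_stack_frame in the entry block, enter it once, and leave it on every return found by EscapeEnumerator. At the source level the intended shape is roughly the following; this is a sketch of the effect, presupposing the Cilk runtime declarations and the __cilk_parent_epilogue helper defined above, not code this pass emits verbatim.

    void parent(void) {
      __cilkrts_stack_frame sf;          // CreateStackFrame: one alloca in the entry block
      __cilkrts_enter_frame_1(&sf);      // enter_frame_fast_1 instead when Helper is true
      /* ... original body, including spawns and syncs ... */
      __cilk_parent_epilogue(&sf);       // added in front of every return
    }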
+ bool SimpleHelper = true; + for (BasicBlock &BB : extracted) { + if (isa(BB.getTerminator())) { + SimpleHelper = false; + break; + } + } + if (!SimpleHelper) + DEBUG(dbgs() << "Detachable helper function itself detaches.\n"); + + BasicBlock::iterator II = extracted.getEntryBlock().getFirstInsertionPt(); + AllocaInst* curinst; + do { + curinst = dyn_cast(II); + II++; + } while (curinst != SF); + Value *StackSave; + IRBuilder<> IRB(&(extracted.getEntryBlock()), II); + + if (instrument) { + Type *Int8PtrTy = IRB.getInt8PtrTy(); + Value *ThisFn = ConstantExpr::getBitCast(&extracted, Int8PtrTy); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::returnaddress), + IRB.getInt32(0)); + StackSave = + IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave)); + if (SimpleHelper) { + Value *begin_args[3] = { SF, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *M), begin_args); + } else { + Value *begin_args[4] = { IRB.getInt32(0), SF, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *M), begin_args); + } + } + + if (SimpleHelper) + IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *M), args); + else + IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *M), args); + + if (instrument) { + Value *end_args[2] = { SF, StackSave }; + IRB.CreateCall(CILK_CSI_FUNC(enter_end, *M), end_args); + } + + // Call __cilkrts_detach + { + if (instrument) + IRB.CreateCall(CILK_CSI_FUNC(detach_begin, *M), args); + + IRB.CreateCall(CILKRTS_FUNC(detach, *M), args); + + if (instrument) + IRB.CreateCall(CILK_CSI_FUNC(detach_end, *M)); + } + + EscapeEnumerator EE(extracted, "cilkabi_epilogue", false); + while (IRBuilder<> *AtExit = EE.Next()) { + if (isa(AtExit->GetInsertPoint())) + AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, ""); + else if (ResumeInst *RI = dyn_cast(AtExit->GetInsertPoint())) { + /* + sf.flags = sf.flags | CILK_FRAME_EXCEPTING; + sf.except_data = Exn; + */ + IRBuilder<> B(RI); + Value *Exn = AtExit->CreateExtractValue(RI->getValue(), + ArrayRef(0)); + Value *Flags = LoadField(*AtExit, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = AtExit->CreateOr(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_EXCEPTING)); + StoreField(*AtExit, Exn, SF, StackFrameBuilder::except_data); + /* + __cilkrts_pop_frame(&sf); + if (sf->flags) + __cilkrts_leave_frame(&sf); + */ + AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, ""); + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI); + } + } + + // // Handle returns + // ReturnInst* Ret = nullptr; + // for (BasicBlock &BB : extracted) { + // TerminatorInst* TI = BB.getTerminator(); + // if (!TI) continue; + // if (ReturnInst* RI = dyn_cast(TI)) { + // assert(Ret == nullptr && "Multiple return"); + // Ret = RI; + // } + // } + // assert(Ret && "No return from extract function"); + + // /* + // __cilkrts_pop_frame(&sf); + // if (sf->flags) + // __cilkrts_leave_frame(&sf); + // */ + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", Ret); + + // // Handle resumes + // for (BasicBlock &BB : extracted) { + // if (!isa(BB.getTerminator())) + // continue; + // ResumeInst *RI = cast(BB.getTerminator()); + // /* + // sf.flags = sf.flags | CILK_FRAME_EXCEPTING; + // sf.except_data = Exn; + // */ + // IRBuilder<> B(RI); + // Value *Exn = B.CreateExtractValue(RI->getValue(), ArrayRef(0)); + // Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + // /*isVolatile=*/true); + // Flags = B.CreateOr(Flags, + // 
ConstantInt::get(Flags->getType(), + // CILK_FRAME_EXCEPTING)); + // StoreField(B, Exn, SF, StackFrameBuilder::except_data); + // /* + // __cilkrts_pop_frame(&sf); + // if (sf->flags) + // __cilkrts_leave_frame(&sf); + // */ + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI); + // } + + return true; +} + +//############################################################################## + +/// \brief Get/Create the worker count for the spawning function. +Value* llvm::cilk::GetOrCreateWorker8(Function &F) { + // Value* W8 = F.getValueSymbolTable()->lookup(worker8_name); + // if (W8) return W8; + IRBuilder<> B(F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime()); + Value *P0 = B.CreateCall(CILKRTS_FUNC(get_nworkers, *F.getParent())); + Value *P8 = B.CreateMul(P0, ConstantInt::get(P0->getType(), 8), worker8_name); + return P8; +} + +void llvm::cilk::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument) { + Function &Fn = *(SI.getParent()->getParent()); + Module &M = *(Fn.getParent()); + + Value *SF = GetOrInitCilkStackFrame(Fn, DetachCtxToStackFrame, + /*isFast*/false, instrument); + Value *args[] = { SF }; + assert( args[0] && "sync used in function without frame!" ); + CallInst *CI = CallInst::Create(GetCilkSyncFn(M, instrument), args, "", + /*insert before*/&SI); + CI->setDebugLoc(SI.getDebugLoc()); + BasicBlock *Succ = SI.getSuccessor(0); + SI.eraseFromParent(); + BranchInst::Create(Succ, CI->getParent()); +} + +bool llvm::cilk::verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + bool error) { + BasicBlock *Spawned = Detach.getDetached(); + BasicBlock *Continue = Detach.getContinue(); + BasicBlockEdge DetachEdge(Detach.getParent(), Spawned); + + SmallVector Todo; + SmallPtrSet functionPieces; + SmallVector WorkListEH; + Todo.push_back(Spawned); + + while (!Todo.empty()) { + BasicBlock *BB = Todo.pop_back_val(); + + if (!functionPieces.insert(BB).second) + continue; + + TerminatorInst* Term = BB->getTerminator(); + if (Term == nullptr) return false; + if (ReattachInst* Inst = dyn_cast(Term)) { + //only analyze reattaches going to the same continuation + if (Inst->getSuccessor(0) != Continue) continue; + continue; + } else if (DetachInst* Inst = dyn_cast(Term)) { + assert(Inst != &Detach && "Found recursive Detach!"); + Todo.push_back(Inst->getSuccessor(0)); + Todo.push_back(Inst->getSuccessor(1)); + continue; + } else if (SyncInst* Inst = dyn_cast(Term)) { + //only sync inner elements, consider as branch + Todo.push_back(Inst->getSuccessor(0)); + continue; + } else if (isa(Term) || isa(Term) || + isa(Term)) { + for (BasicBlock *Succ : successors(BB)) { + if (!DT.dominates(DetachEdge, Succ)) + // We assume that this block is an exception-handling block and save + // it for later processing. + WorkListEH.push_back(Succ); + else + Todo.push_back(Succ); + } + continue; + } else if (isa(Term) || isa(Term)) { + continue; + } else { + DEBUG(Term->dump()); + DEBUG(Term->getParent()->getParent()->dump()); + assert(!error && "Detached block did not absolutely terminate in reattach"); + return false; + } + } + { + SmallPtrSet Visited; + while (!WorkListEH.empty()) { + BasicBlock *BB = WorkListEH.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Make sure that the control flow through these exception-handling blocks + // cannot re-enter the blocks being outlined. 
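createSync and verifyDetachedCFG above rely on the Tapir invariants for a spawned region: it is entered only along the detach edge, every normal path through it ends in a reattach to the detach's continuation, and exceptional paths may leave the region but must not re-enter it or return. For a single spawn the corresponding source is the usual Cilk pattern; the snippet below assumes the Cilk headers and a Tapir-enabled compiler such as the one this patch targets, and f and g are arbitrary user functions.

    #include <cilk/cilk.h>

    int f(int);
    void g(void);
    int x;

    void example(int y) {
      x = cilk_spawn f(y);   // detach: the store to x lives in the detached block,
                             // which ends with a reattach to the continuation
      g();                   // continuation: may run in parallel with f(y)
      cilk_sync;             // sync: lowered by createSync to a __cilk_sync(&sf) call
    }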
+ assert(!functionPieces.count(BB) && + "EH blocks for a detached region reenter that region."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't perform an ordinary return. + assert(!isa(BB->getTerminator()) && + "EH block terminated by return."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't reattach to the detached CFG's continuation. + if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + assert(RI->getSuccessor(0) != Continue && + "Exit block reaches a reattach to the continuation."); + + for (BasicBlock *Succ : successors(BB)) + WorkListEH.push_back(Succ); + } + } + return true; +} + +bool llvm::cilk::populateDetachedCFG( + const DetachInst &Detach, DominatorTree &DT, + SmallPtrSetImpl &functionPieces, + SmallVectorImpl &reattachB, + SmallPtrSetImpl &ExitBlocks, + bool replace, bool error) { + SmallVector Todo; + SmallVector WorkListEH; + + BasicBlock *Spawned = Detach.getDetached(); + BasicBlock *Continue = Detach.getContinue(); + BasicBlockEdge DetachEdge(Detach.getParent(), Spawned); + Todo.push_back(Spawned); + + while (!Todo.empty()) { + BasicBlock *BB = Todo.pop_back_val(); + + if (!functionPieces.insert(BB).second) + continue; + + TerminatorInst *Term = BB->getTerminator(); + if (Term == nullptr) return false; + if (isa(Term)) { + // only analyze reattaches going to the same continuation + if (Term->getSuccessor(0) != Continue) continue; + if (replace) { + BranchInst* toReplace = BranchInst::Create(Continue); + ReplaceInstWithInst(Term, toReplace); + reattachB.push_back(BB); + } + continue; + } else if (isa(Term)) { + assert(Term != &Detach && "Found recursive detach!"); + Todo.push_back(Term->getSuccessor(0)); + Todo.push_back(Term->getSuccessor(1)); + continue; + } else if (isa(Term)) { + //only sync inner elements, consider as branch + Todo.push_back(Term->getSuccessor(0)); + continue; + } else if (isa(Term) || isa(Term) || + isa(Term)) { + for (BasicBlock *Succ : successors(BB)) { + if (!DT.dominates(DetachEdge, Succ)) { + // We assume that this block is an exception-handling block and save + // it for later processing. + ExitBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } else { + Todo.push_back(Succ); + } + } + // We don't bother cloning unreachable exits from the detached CFG at this + // point. We're cloning the entire detached CFG anyway when we outline + // the function. + continue; + } else if (isa(Term) || isa(Term)) { + continue; + } else { + DEBUG(Term->dump()); + DEBUG(Term->getParent()->getParent()->dump()); + assert(!error && "Detached block did not absolutely terminate in reattach"); + return false; + } + } + + // Find the exit-handling blocks. + { + SmallPtrSet Visited; + while (!WorkListEH.empty()) { + BasicBlock *BB = WorkListEH.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Make sure that the control flow through these exception-handling blocks + // cannot re-enter the blocks being outlined. + assert(!functionPieces.count(BB) && + "EH blocks for a detached region reenter that region."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't perform an ordinary return. + assert(!isa(BB->getTerminator()) && + "EH block terminated by return."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't reattach to the detached CFG's continuation. 
+ if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + assert(RI->getSuccessor(0) != Continue && + "Exit block reaches a reattach to the continuation."); + + // if (isa(BB-getTerminator())) + // ResumeBlocks.push_back(BB); + + for (BasicBlock *Succ : successors(BB)) { + ExitBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } + } + + // Visited now contains exception-handling blocks that we want to clone as + // part of outlining. + for (BasicBlock *EHBlock : Visited) + functionPieces.insert(EHBlock); + } + + return true; +} + +//Returns true if success +Function *llvm::cilk::extractDetachBodyToFunction(DetachInst &detach, + DominatorTree &DT, + AssumptionCache &AC, + CallInst **call) { + BasicBlock *Detacher = detach.getParent(); + Function &F = *(Detacher->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + SmallPtrSet functionPieces; + SmallVector reattachB; + SmallPtrSet ExitBlocks; + + // if (!Spawned->getUniquePredecessor()) + // dbgs() << *Spawned; + assert(Spawned->getUniquePredecessor() && + "Entry block of detached CFG has multiple predecessors."); + assert(Spawned->getUniquePredecessor() == Detacher && + "Broken CFG."); + + // if (getNumPred(Spawned) > 1) { + // dbgs() << "Found multiple predecessors to a detached-CFG entry block " + // << Spawned->getName() << ".\n"; + // BasicBlock* ts = BasicBlock::Create(Spawned->getContext(), Spawned->getName()+".fx", &F, Detacher); + // IRBuilder<> b(ts); + // b.CreateBr(Spawned); + // detach.setSuccessor(0,ts); + // llvm::BasicBlock::iterator i = Spawned->begin(); + // while (auto phi = llvm::dyn_cast(i)) { + // int idx = phi->getBasicBlockIndex(detach.getParent()); + // phi->setIncomingBlock(idx, ts); + // ++i; + // } + // Spawned = ts; + // } + + if (!populateDetachedCFG(detach, DT, functionPieces, reattachB, + ExitBlocks, true)) + return nullptr; + + // functionPieces.erase(Spawned); + // std::vector blocks(functionPieces.begin(), functionPieces.end()); + // blocks.insert(blocks.begin(), Spawned); + // functionPieces.insert(Spawned); + + // Check the spawned block's predecessors. + for (BasicBlock *BB : functionPieces) { + int detached_count = 0; + if (ExitBlocks.count(BB)) + continue; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *Pred = *PI; + if (detached_count == 0 && BB == Spawned && Pred == detach.getParent()) { + detached_count = 1; + continue; + } + assert(functionPieces.count(Pred) && + "Block inside of detached context branched into from outside branch context"); + } + } + + // Get the inputs and outputs for the detached CFG. + SetVector Inputs, Outputs; + findInputsOutputs(functionPieces, Inputs, Outputs, &ExitBlocks); + // extractor.findInputsOutputs(Inputs, Outputs); + assert(Outputs.empty() && + "All results from detached CFG should be passed by memory already."); + + // Clone the detached CFG into a helper function. + ValueToValueMapTy VMap; + Function *extracted; + { + SmallVector Returns; // Ignore returns cloned. + std::vector blocks(functionPieces.begin(), functionPieces.end()); + + extracted = CreateHelper(Inputs, Outputs, blocks, + Spawned, Detacher, Continue, + VMap, F.getParent(), + F.getSubprogram() != nullptr, Returns, ".cilk", + &ExitBlocks, nullptr, nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning detached CFG."); + + // Use a fast calling convention for the helper. 
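populateDetachedCFG and extractDetachBodyToFunction above clone the detached blocks into a separate helper whose parameters are exactly the values flowing into the region (findInputsOutputs); the assertion that Outputs is empty reflects that results already leave the region through memory. For the running example used in this file's comments, *x = f(y), the outlined helper is conceptually the following; the name example_body_cilk is hypothetical (CreateHelper derives the real name with the ".cilk" suffix).

    int f(int);

    static void example_body_cilk(int *x, int y) {  // inputs become parameters
      *x = f(y);                                    // cloned detached body
    }                                               // results escape only via *x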
+ extracted->setCallingConv(CallingConv::Fast); + // extracted->setCallingConv(F.getCallingConv()); + + extracted->addFnAttr(Attribute::NoInline); + } + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(&F, Inputs, VMap, &detach, &AC, &DT); + + // Add call to new helper function in original function. + CallInst *TopCall; + { + // Create call instruction. + IRBuilder<> Builder(&detach); + TopCall = Builder.CreateCall(extracted, Inputs.getArrayRef()); + // Use a fast calling convention for the helper. + TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(extracted->getCallingConv()); + TopCall->setDebugLoc(detach.getDebugLoc()); + } + if (call) + *call = TopCall; + + // Move allocas in the newly cloned detached CFG to the entry block of the + // helper. + { + // Collect reattach instructions. + SmallVector ReattachPoints; + for (pred_iterator PI = pred_begin(Continue), PE = pred_end(Continue); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (!isa(Pred->getTerminator())) continue; + if (functionPieces.count(Pred)) + ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); + } + + // Move allocas in cloned detached block to entry of helper function. + BasicBlock *ClonedDetachedBlock = cast(VMap[Spawned]); + MoveStaticAllocasInBlock(&extracted->getEntryBlock(), ClonedDetachedBlock, + ReattachPoints); + + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because calling and returning from the helper will + // automatically manage the stack. + } + + return extracted; +} + +Function *llvm::cilk::createDetach(DetachInst &detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC, + bool instrument) { + BasicBlock *detB = detach.getParent(); + Function &F = *(detB->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + Module *M = F.getParent(); + //replace with branch to succesor + //entry / cilk.spawn.savestate + Value *SF = GetOrInitCilkStackFrame(F, DetachCtxToStackFrame, + /*isFast=*/false, instrument); + // assert(SF && "null stack frame unexpected"); + + // dbgs() << *detB << *Spawned << *Continue; + + // if (!Spawned->getUniquePredecessor()) + // SplitEdge(detB, Spawned, &DT, nullptr); + + // dbgs() << *detB << *(detach.getDetached()); + + CallInst *cal = nullptr; + Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); + assert(extracted && "could not extract detach body to function"); + + // Unlink the detached CFG in the original function. The heavy lifting of + // removing the outlined detached-CFG is left to subsequent DCE. + BranchInst *ContinueBr; + { + // Replace the detach with a branch to the continuation. + ContinueBr = BranchInst::Create(Continue); + ReplaceInstWithInst(&detach, ContinueBr); + + // Rewrite phis in the detached block. 
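createDetach then replaces the detach in the parent with a setjmp-guarded call to that helper, so the parent can resume at the continuation if the child is stolen. Ignoring instrumentation, the rewritten call site has roughly the following shape; CILK_SETJMP, the stack frame sf, and example_body_cilk are the names used in this file's comments and in the sketch above, and the fragment presupposes their declarations rather than being compilable on its own.

    /* in the parent, where the detach used to be */
    if (!CILK_SETJMP(sf.ctx)) {        // EmitCilkSetJmp: saves FP state, frame and stack pointers
      example_body_cilk(&x, y);        // call the outlined helper (fast calling convention)
    }
    /* the continuation runs here whether or not the child was stolen;
       a later 'sync' becomes a call to __cilk_sync(&sf) */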
+ BasicBlock::iterator BI = Spawned->begin(); + while (PHINode *P = dyn_cast(BI)) { + // int j = P->getBasicBlockIndex(detB); + // assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->removeIncomingValue(detB); + ++BI; + } + } + + Value *SetJmpRes; + { + IRBuilder<> B(cal); + + if (instrument) + // cilk_spawn_prepare + B.CreateCall(CILK_CSI_FUNC(spawn_prepare, *M), SF); + + // Need to save state before spawning + SetJmpRes = EmitCilkSetJmp(B, SF, *M); + + if (instrument) + // cilk_spawn_or_continue + B.CreateCall(CILK_CSI_FUNC(spawn_or_continue, *M), SetJmpRes); + } + + // Conditionally call the new helper function based on the result of the + // setjmp. + { + BasicBlock *CallBlock = SplitBlock(detB, cal, &DT); + BasicBlock *CallCont = SplitBlock(CallBlock, + CallBlock->getTerminator(), &DT); + IRBuilder<> B(detB->getTerminator()); + SetJmpRes = B.CreateICmpEQ(SetJmpRes, + ConstantInt::get(SetJmpRes->getType(), 0)); + B.CreateCondBr(SetJmpRes, CallBlock, CallCont); + detB->getTerminator()->eraseFromParent(); + } + + makeFunctionDetachable(*extracted, DetachCtxToStackFrame, instrument); + + return extracted; +} diff --git a/llvm/lib/Transforms/Tapir/LLVMBuild.txt b/llvm/lib/Transforms/Tapir/LLVMBuild.txt new file mode 100644 index 00000000000000..9b7ec2935c92fc --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/Tapir/LLVMBuild.txt ---------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = TapirOpts +parent = Transforms +required_libraries = Analysis Core Scalar Support TransformUtils diff --git a/llvm/lib/Transforms/Tapir/LoopSpawning.cpp b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp new file mode 100644 index 00000000000000..a62e445eecf277 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp @@ -0,0 +1,2413 @@ +//===- LoopSpawning.cpp - Spawn loop iterations efficiently ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Modify Tapir loops to spawn their iterations efficiently. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LoopSpawning.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopDeletion.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include + +using std::make_pair; + +using namespace llvm; + +#define LS_NAME "loop-spawning" +#define DEBUG_TYPE LS_NAME + +STATISTIC(LoopsAnalyzed, "Number of Tapir loops analyzed"); +STATISTIC(LoopsConvertedToDAC, + "Number of Tapir loops converted to divide-and-conquer iteration spawning"); +STATISTIC(LoopsConvertedToCilkABI, + "Number of Tapir loops converted to use the Cilk ABI for loops"); + +namespace { +// Forward declarations. +class LoopSpawningHints; + +// /// \brief This modifies LoopAccessReport to initialize message with +// /// tapir-loop-specific part. +// class LoopSpawningReport : public LoopAccessReport { +// public: +// LoopSpawningReport(Instruction *I = nullptr) +// : LoopAccessReport("loop-spawning: ", I) {} + +// /// \brief This allows promotion of the loop-access analysis report into the +// /// loop-spawning report. It modifies the message to add the +// /// loop-spawning-specific part of the message. +// explicit LoopSpawningReport(const LoopAccessReport &R) +// : LoopAccessReport(Twine("loop-spawning: ") + R.str(), +// R.getInstr()) {} +// }; + + +/// Utility class for getting and setting loop spawning hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +class LoopSpawningHints { + enum HintKind { HK_STRATEGY }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. 
+ HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val) { + switch (Kind) { + case HK_STRATEGY: + return (Val < ST_END); + } + return false; + } + }; + + /// Spawning strategy + Hint Strategy; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "tapir.loop."; } + +public: + enum SpawningStrategy { + ST_SEQ, + ST_DAC, + ST_END, + }; + + static std::string printStrategy(enum SpawningStrategy Strat) { + switch(Strat) { + case LoopSpawningHints::ST_SEQ: + return "Spawn iterations sequentially"; + case LoopSpawningHints::ST_DAC: + return "Use divide-and-conquer"; + case LoopSpawningHints::ST_END: + default: + return "Unknown"; + } + } + + LoopSpawningHints(const Loop *L, OptimizationRemarkEmitter &ORE) + : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), + TheLoop(L), ORE(ORE) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + } + + // /// Dumps all the hint information. + // std::string emitRemark() const { + // LoopSpawningReport R; + // R << "Strategy = " << printStrategy(getStrategy()); + + // return R.str(); + // } + + enum SpawningStrategy getStrategy() const { + return (SpawningStrategy)Strategy.Value; + } + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } + } + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Strategy}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << LS_NAME << " ignoring invalid hint '" << + Name << "'\n"); + break; + } + } + } + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. 
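getHintsFromMetadata and createHintMetadata in this class read and write loop hints whose names carry the "tapir.loop." prefix, with the strategy stored as an i32 operand (ST_SEQ == 0, ST_DAC == 1). On a loop that requests divide-and-conquer spawning, the metadata attached to the loop's latch branch looks roughly like this textual IR; the node numbers are arbitrary and only the string and the i32 payload matter.

    br i1 %cond, label %header, label %exit, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}                         ; operand 0 refers to the loop id itself
    !1 = !{!"tapir.loop.spawn.strategy", i32 1}     ; 1 == ST_DAC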
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { + MDString *Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. + const Loop *TheLoop; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; +}; + +// static void emitAnalysisDiag(const Loop *TheLoop, +// OptimizationRemarkEmitter &ORE, +// const LoopAccessReport &Message) { +// const char *Name = LS_NAME; +// LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); +// } + +static void emitMissedWarning(Function *F, Loop *L, + const LoopSpawningHints &LH, + OptimizationRemarkEmitter *ORE) { + // ORE->emit(OptimizationRemarkMissed( + // LS_NAME, "LSHint", L->getStartLoc(), L->getHeader()) + // << "Strategy = " + // << LoopSpawningHints::printStrategy(LH.getStrategy())); + switch (LH.getStrategy()) { + case LoopSpawningHints::ST_DAC: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning"); + break; + case LoopSpawningHints::ST_SEQ: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "SpawningDisabled", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "loop-spawning transformation disabled"); + break; + case LoopSpawningHints::ST_END: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "unknown loop-spawning strategy"); + break; + } +} + +/// LoopOutline serves as a base class for different variants of LoopSpawning. +/// LoopOutline implements common parts of LoopSpawning transformations, namely, +/// lifting a Tapir loop into a separate helper function. +class LoopOutline { +public: + + LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), + ExitBlock(nullptr) + { + // Use the loop latch to determine the canonical exit block for this loop. 
+ TerminatorInst *TI = OrigLoop->getLoopLatch()->getTerminator(); + if (2 != TI->getNumSuccessors()) + return; + ExitBlock = TI->getSuccessor(0); + if (ExitBlock == OrigLoop->getHeader()) + ExitBlock = TI->getSuccessor(1); + } + + virtual bool processLoop() = 0; + + virtual ~LoopOutline() {} + +protected: + PHINode* canonicalizeIVs(Type *Ty); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + void unlinkLoop(); + + /// The original loop. + Loop *OrigLoop; + + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + // PredicatedScalarEvolution &PSE; + ScalarEvolution &SE; + /// Loop info. + LoopInfo *LI; + /// Dominator tree. + DominatorTree *DT; + /// Assumption cache. + AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; + + /// The exit block of this loop. We compute our own exit block, based on the + /// latch, and handle other exit blocks (i.e., for exception handling) in a + /// special manner. + BasicBlock *ExitBlock; + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// DACLoopSpawning implements the transformation to spawn the iterations of a +/// Tapir loop in a recursive divide-and-conquer fashion. +class DACLoopSpawning : public LoopOutline { +public: + // DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + // LoopInfo *LI, DominatorTree *DT, + // const TargetLibraryInfo *TLI, + // const TargetTransformInfo *TTI, + // OptimizationRemarkEmitter *ORE) + // : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), + // TLI(TLI), TTI(TTI), ORE(ORE) + // {} + + DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~DACLoopSpawning() {} + +protected: + Value* computeGrainsize(Value *Limit); + void implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW = false, + bool CanonicalIVFlagNSW = false); + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// CilkABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. 
+class CilkABILoopSpawning : public LoopOutline { +public: + CilkABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~CilkABILoopSpawning() {} + +protected: + // PHINode* canonicalizeIVs(Type *Ty); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +struct LoopSpawningImpl { + // LoopSpawningImpl(Function &F, LoopInfo &LI, ScalarEvolution &SE, + // DominatorTree &DT, + // const TargetTransformInfo &TTI, + // const TargetLibraryInfo *TLI, + // AliasAnalysis &AA, AssumptionCache &AC, + // OptimizationRemarkEmitter &ORE) + // : F(&F), LI(&LI), SE(&SE), DT(&DT), TTI(&TTI), TLI(TLI), + // AA(&AA), AC(&AC), ORE(&ORE) {} + // LoopSpawningImpl(Function &F, + // function_ref GetLI, + // function_ref GetSE, + // function_ref GetDT, + // OptimizationRemarkEmitter &ORE) + // : F(F), GetLI(GetLI), LI(nullptr), GetSE(GetSE), GetDT(GetDT), + // ORE(ORE) + // {} + LoopSpawningImpl(Function &F, + LoopInfo &LI, + ScalarEvolution &SE, + DominatorTree &DT, + AssumptionCache &AC, + OptimizationRemarkEmitter &ORE) + : F(F), LI(LI), SE(SE), DT(DT), AC(AC), ORE(ORE) {} + + bool run(); + +private: + void addTapirLoop(Loop *L, SmallVectorImpl &V); + bool isTapirLoop(const Loop *L); + bool processLoop(Loop *L); + + Function &F; + // function_ref GetLI; + LoopInfo &LI; + // function_ref GetSE; + // function_ref GetDT; + ScalarEvolution &SE; + DominatorTree &DT; + // const TargetTransformInfo *TTI; + // const TargetLibraryInfo *TLI; + // AliasAnalysis *AA; + AssumptionCache &AC; + OptimizationRemarkEmitter &ORE; +}; +} // end anonymous namespace + +/// Canonicalize the induction variables in the loop. Return the canonical +/// induction variable created or inserted by the scalar evolution expander. +PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { + Loop *L = OrigLoop; + + BasicBlock* Header = L->getHeader(); + Module* M = Header->getParent()->getParent(); + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + PHINode *CanonicalIV = Exp.getOrInsertCanonicalInductionVariable(L, Ty); + DEBUG(dbgs() << "LS Canonical induction variable " << *CanonicalIV << "\n"); + + SmallVector DeadInsts; + Exp.replaceCongruentIVs(L, DT, DeadInsts); + for (WeakTrackingVH V : DeadInsts) { + DEBUG(dbgs() << "LS erasing dead inst " << *V << "\n"); + Instruction *I = cast(V); + I->eraseFromParent(); + } + + return CanonicalIV; +} + +/// \brief Replace the latch of the loop to check that IV is always less than or +/// equal to the limit. +/// +/// This method assumes that the loop has a single loop latch. +Value* LoopOutline::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { + Loop *L = OrigLoop; + + Value *NewCondition; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "No single loop latch found for loop."); + + IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); + + // This process assumes that IV's increment is in Latch. + + // Create comparison between IV and Limit at top of Latch. 
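canonicalizeLoopLatch, together with canonicalizeIVs above, puts the loop into the shape the outliner expects: a canonical induction variable counting up from zero by one, and a latch that takes the back-edge exactly while that variable is unsigned-less-than the limit. In C terms this is roughly the same do/while form used by the divide-and-conquer doc comment later in this file; a small sketch, with body standing in for the loop body:

    void body(unsigned long i);

    void canonical_shape(unsigned long limit) {
      unsigned long iv = 0;
      do {
        body(iv);
      } while (iv++ < limit);   // back-edge taken while iv <u limit
    }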
+ NewCondition = Builder.CreateICmpULT(IV, Limit); + + // Replace the conditional branch at the end of Latch. + BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); + assert(LatchBr && LatchBr->isConditional() && + "Latch does not terminate with a conditional branch."); + Builder.SetInsertPoint(Latch->getTerminator()); + Builder.CreateCondBr(NewCondition, Header, ExitBlock); + + // Erase the old conditional branch. + Value *OldCond = LatchBr->getCondition(); + LatchBr->eraseFromParent(); + if (!OldCond->hasNUsesOrMore(1)) + if (Instruction *OldCondInst = dyn_cast(OldCond)) + OldCondInst->eraseFromParent(); + + return NewCondition; +} + +/// Unlink the specified loop, and update analysis accordingly. The heavy +/// lifting of deleting the loop is carried out by a run of LoopDeletion after +/// this pass. +void LoopOutline::unlinkLoop() { + Loop *L = OrigLoop; + + // Get components of the old loop. + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "Loop does not have a unique preheader."); + BasicBlock *Latch = L->getLoopLatch(); + + // Invalidate the analysis of the old loop. + SE.forgetLoop(L); + + // Redirect the preheader to branch directly to loop exit. + assert(1 == Preheader->getTerminator()->getNumSuccessors() && + "Preheader does not have a unique successor."); + Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), + ExitBlock); + + // Rewrite phis in the exit block to get their inputs from + // the preheader instead of the exiting block. + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast(BI)) { + int j = P->getBasicBlockIndex(Latch); + assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->setIncomingBlock(j, Preheader); + P->removeIncomingValue(Latch); + ++BI; + } + + // Rewrite phis in the header block to not receive an input from + // the preheader. + BI = L->getHeader()->begin(); + while (PHINode *P = dyn_cast(BI)) { + P->removeIncomingValue(Preheader); + ++BI; + } +} + +/// \brief Compute the grainsize of the loop, based on the limit. +/// +/// The grainsize is computed by the following equation: +/// +/// Grainsize = min(2048, ceil(Limit / (8 * workers))) +/// +/// This computation is inserted into the preheader of the loop. +/// +/// TODO: This method is the only method that depends on the CilkABI. +/// Generalize this method for other grainsize calculations and to query TLI. +Value* DACLoopSpawning::computeGrainsize(Value *Limit) { + Loop *L = OrigLoop; + + Value *Grainsize; + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "No Preheader found for loop."); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // Get 8 * workers + Value *Workers8 = Builder.CreateIntCast(cilk::GetOrCreateWorker8(*Preheader->getParent()), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), + ConstantInt::get(Limit->getType(), 1)), + Workers8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + return Grainsize; +} + +/// \brief Method to help convertLoopToDACIterSpawn convert the Tapir +/// loop cloned into function Helper to spawn its iterations in a +/// parallel divide-and-conquer fashion. 
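computeGrainsize above emits IR for the formula in its comment, min(2048, ceil(limit / (8 * workers))), using the add-then-divide trick for the ceiling and a select for the min; the 8 * workers term comes from GetOrCreateWorker8 and __cilkrts_get_nworkers. A minimal standalone check of the same arithmetic:

    #include <cassert>
    #include <cstdint>

    // Mirrors the computation built by computeGrainsize, for illustration.
    uint64_t grainsize(uint64_t limit, uint64_t workers) {
      uint64_t w8 = 8 * workers;
      uint64_t small = (limit + w8 - 1) / w8;   // ceil(limit / (8 * workers))
      uint64_t large = 2048;                    // hard cap used by the pass
      return large < small ? large : small;     // select implementing the min
    }

    int main() {
      assert(grainsize(100, 4) == 4);           // ceil(100 / 32) == 4
      assert(grainsize(1u << 20, 4) == 2048);   // large loops are capped at 2048
      return 0;
    }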
+/// +/// Example: Suppose that Helper contains the following Tapir loop: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// spawn { ... loop body ... }; +/// } while (i++ < end); +/// sync; +/// } +/// +/// Then this method transforms Helper into the following form: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// recur: +/// iter_t itercount = end - start; +/// if (itercount > grain) { +/// // Invariant: itercount >= 2 +/// count_t miditer = start + itercount / 2; +/// spawn Helper(start, miditer, grain, ...); +/// start = miditer + 1; +/// goto recur; +/// } +/// +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// ... Loop Body ... +/// } while (i++ < end); +/// sync; +/// } +/// +void DACLoopSpawning::implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW, + bool CanonicalIVFlagNSW) { + // Serialize the cloned copy of the loop. + assert(Preheader->getParent() == Helper && + "Preheader does not belong to helper function."); + assert(Header->getParent() == Helper && + "Header does not belong to helper function."); + assert(CanonicalIV->getParent() == Header && + "CanonicalIV does not belong to header"); + assert(isa(Header->getTerminator()) && + "Cloned header is not terminated by a detach."); + DetachInst *DI = dyn_cast(Header->getTerminator()); + SerializeDetachedCFG(DI, DT); + + // Convert the cloned loop into the strip-mined loop body. + + BasicBlock *DACHead = Preheader; + if (&(Helper->getEntryBlock()) == Preheader) + // Split the entry block. We'll want to create a backedge into + // the split block later. + DACHead = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI); + + BasicBlock *RecurHead, *RecurDet, *RecurCont; + Value *IterCount; + Value *CanonicalIVInput; + PHINode *CanonicalIVStart; + { + Instruction *PreheaderOrigFront = &(DACHead->front()); + IRBuilder<> Builder(PreheaderOrigFront); + // Create branch based on grainsize. + DEBUG(dbgs() << "LS CanonicalIV: " << *CanonicalIV << "\n"); + CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(DACHead); + CanonicalIVStart = Builder.CreatePHI(CanonicalIV->getType(), 2, + CanonicalIV->getName()+".dac"); + CanonicalIVInput->replaceAllUsesWith(CanonicalIVStart); + IterCount = Builder.CreateSub(Limit, CanonicalIVStart, + "itercount"); + Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); + TerminatorInst *RecurTerm = + SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr, + DT); + RecurHead = RecurTerm->getParent(); + // Create skeleton of divide-and-conquer recursion: + // DACHead -> RecurHead -> RecurDet -> RecurCont -> DACHead + RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator(), + DT, LI); + RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator(), + DT, LI); + RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), + DACHead); + } + + // Compute mid iteration in RecurHead. + Value *MidIter, *MidIterPlusOne; + { + IRBuilder<> Builder(&(RecurHead->front())); + MidIter = Builder.CreateAdd(CanonicalIVStart, + Builder.CreateLShr(IterCount, 1, + "halfcount"), + "miditer", + CanonicalIVFlagNUW, CanonicalIVFlagNSW); + } + + // Create recursive call in RecurDet. + { + // Create input array for recursive call. 
+ IRBuilder<> Builder(&(RecurDet->front())); + SetVector RecurInputs; + Function::arg_iterator AI = Helper->arg_begin(); + assert(cast(CanonicalIVInput) == &*AI && + "First argument does not match original input to canonical IV."); + RecurInputs.insert(CanonicalIVStart); + ++AI; + assert(Limit == &*AI && + "Second argument does not match original input to the loop limit."); + RecurInputs.insert(MidIter); + ++AI; + for (Function::arg_iterator AE = Helper->arg_end(); + AI != AE; ++AI) + RecurInputs.insert(&*AI); + // RecurInputs.insert(CanonicalIVStart); + // // for (PHINode *IV : IVs) + // // RecurInputs.insert(DACStart[IV]); + // RecurInputs.insert(Limit); + // RecurInputs.insert(Grainsize); + // for (Value *V : BodyInputs) + // RecurInputs.insert(VMap[V]); + DEBUG({ + dbgs() << "RecurInputs: "; + for (Value *Input : RecurInputs) + dbgs() << *Input << ", "; + dbgs() << "\n"; + }); + + // Create call instruction. + CallInst *RecurCall = Builder.CreateCall(Helper, RecurInputs.getArrayRef()); + RecurCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // Use a fast calling convention for the helper. + RecurCall->setCallingConv(CallingConv::Fast); + // RecurCall->setCallingConv(Helper->getCallingConv()); + // // Update CG graph with the recursive call we just added. + // CG[Helper]->addCalledFunction(RecurCall, CG[Helper]); + } + + // Set up continuation of detached recursive call. We effectively + // inline this tail call automatically. + { + IRBuilder<> Builder(&(RecurCont->front())); + MidIterPlusOne = Builder.CreateAdd(MidIter, + ConstantInt::get(Limit->getType(), 1), + "miditerplusone", + CanonicalIVFlagNUW, + CanonicalIVFlagNSW); + } + + // Finish setup of new phi node for canonical IV. + { + CanonicalIVStart->addIncoming(CanonicalIVInput, Preheader); + CanonicalIVStart->addIncoming(MidIterPlusOne, RecurCont); + } + + /// Make the recursive DAC parallel. + { + IRBuilder<> Builder(RecurHead->getTerminator()); + // Create the detach. + DetachInst *DI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion); + DI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurHead->getTerminator()->eraseFromParent(); + // Create the reattach. + Builder.SetInsertPoint(RecurDet->getTerminator()); + ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion); + RI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurDet->getTerminator()->eraseFromParent(); + } +} + +/// Helper routine to get all exit blocks of a loop that are unreachable. +static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, + SmallVectorImpl &EHExits) { + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + SmallVector WorkList; + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == DesignatedExitBlock) continue; + EHExits.push_back(Exit); + WorkList.push_back(Exit); + } + + // Traverse the CFG from these frontier blocks to find all blocks involved in + // exception-handling exit code. + SmallPtrSet Visited; + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Check that the exception handling blocks do not reenter the loop. + assert(!L->contains(BB) && + "Exception handling blocks re-enter loop."); + + for (BasicBlock *Succ : successors(BB)) { + EHExits.push_back(Succ); + WorkList.push_back(Succ); + } + } +} + +/// Top-level call to convert loop to spawn its iterations in a +/// divide-and-conquer fashion. 
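+///
+/// At a high level (illustrative summary, not a normative spec): the pass
+/// (1) canonicalizes the induction variables so a single zero-based,
+/// unit-step IV controls the loop, (2) expands the trip count and the
+/// grainsize into the preheader, (3) outlines the loop into a helper
+/// Helper(start, end, grainsize, ...), and (4) rewrites the helper to spawn
+/// its iteration range recursively (see implementDACIterSpawnOnHelper above).
+/// For example, with the grainsize formula min(2048, ceil(limit / (8 * workers))),
+/// a loop of 100000000 iterations run with 8 workers gets grainsize
+/// min(2048, ceil(100000000 / 64)) = 2048, and the original loop collapses
+/// to roughly:
+///
+///   grain = computeGrainsize(limit);        // emitted in the preheader
+///   Helper(/*start=*/0, limit, grain, ...); // captured values as extra args
+///   // the old loop body is unlinked and later removed by LoopDeletion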
+bool DACLoopSpawning::processLoop() { + Loop *L = OrigLoop; + + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + + DEBUG({ + LoopBlocksDFS DFS(L); + DFS.perform(LI); + dbgs() << "Blocks in loop (from DFS):\n"; + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + dbgs() << *BB; + }); + + using namespace ore; + + // Check that this loop has a valid exit block after the latch. + if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + L->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + // Get special exits from this loop. + SmallVector EHExits; + getEHExits(L, ExitBlock, EHExits); + + // Check the exit blocks of the loop. + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (Exit->isLandingPad()) { + DEBUG({ + const LandingPadInst *LPI = Exit->getLandingPadInst(); + dbgs() << "landing pad found: " << *LPI << "\n"; + for (const User *U : LPI->users()) + dbgs() << "\tuser " << *U << "\n"; + }); + } + } + SmallPtrSet HandledExits; + for (BasicBlock *BB : EHExits) + HandledExits.insert(BB); + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!HandledExits.count(Exit)) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + L->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + Function *F = Header->getParent(); + Module* M = F->getParent(); + + DEBUG(dbgs() << "LS loop header:" << *Header); + DEBUG(dbgs() << "LS loop latch:" << *Latch); + DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); + + /// Get loop limit. + const SCEV *Limit = SE.getExitCount(L, Latch); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + // PredicatedScalarEvolution PSE(SE, *L); + // const SCEV *PLimit = PSE.getExitCount(L, Latch); + // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return false; + } + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), + // Header) + // << "loop limit: " << NV("Limit", Limit)); + /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + // emitAnalysis(LoopSpawningReport() + // << "Could not get a canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return false; + } + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); + + // Remove all IV's other than CanonicalIV. + // First, check that we can do this. 
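+  // For example (sketch only; names are illustrative): given
+  //   %iv  = phi i64  [ 0, %preheader ],     [ %iv.next, %latch ]  ; canonical
+  //   %ptr = phi i64* [ %base, %preheader ], [ %ptr.next, %latch ]
+  // SCEV describes %ptr as {%base,+,8}, i.e. %base plus 8 bytes per trip, so
+  // %ptr can be re-expressed in terms of the canonical IV by the SCEVExpander
+  // below and then erased. The check here only verifies that every such phi
+  // has a computable SCEV; a phi whose evolution SCEV cannot compute blocks
+  // the transformation.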
+ bool CanRemoveIVs = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (CanonicalIV == PN) continue; + // dbgs() << "IV " << *PN; + const SCEV *S = SE.getSCEV(PN); + // dbgs() << " SCEV " << *S << "\n"; + if (SE.getCouldNotCompute() == S) { + // emitAnalysis(LoopSpawningReport(PN) + // << "Could not compute the scalar evolution.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) + << "could not compute scalar evolution of " + << NV("PHINode", PN)); + CanRemoveIVs = false; + } + } + + if (!CanRemoveIVs) { + DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n"); + return false; + } + + //////////////////////////////////////////////////////////////////////// + // We now have everything we need to extract the loop. It's time to + // do some surgery. + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + { + SmallVector IVsToRemove; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN == CanonicalIV) continue; + const SCEV *S = SE.getSCEV(PN); + DEBUG(dbgs() << "Removing the IV " << *PN << " (" << *S << ")\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "RemoveIV", PN) + << "removing the IV " + << NV("PHINode", PN)); + Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); + PN->replaceAllUsesWith(NewIV); + IVsToRemove.push_back(PN); + } + for (PHINode *PN : IVsToRemove) + PN->eraseFromParent(); + } + + // All remaining IV's should be canonical. Collect them. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + SmallVector IVs; + bool AllCanonical = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + DEBUG({ + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr"); + assert(PNSCEV->getStart()->isZero() && + "PHINode SCEV does not start at 0"); + dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is " + << *(PNSCEV->getStepRecurrence(SE)) << "\n"; + assert(PNSCEV->getStepRecurrence(SE)->isOne() && + "PHINode SCEV step is not 1"); + }); + if (ConstantInt *C = + dyn_cast(PN->getIncomingValueForBlock(Preheader))) { + if (C->isZero()) { + DEBUG({ + if (PN != CanonicalIV) { + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + dbgs() << "Saving the canonical IV " << *PN << " (" << *PNSCEV << ")\n"; + } + }); + if (PN != CanonicalIV) + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "SaveIV", PN) + << "saving the canonical the IV " + << NV("PHINode", PN)); + IVs.push_back(PN); + } + } else { + AllCanonical = false; + DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << + "\n"); + // emitAnalysis(LoopSpawningReport(PN) + // << "Found a remaining non-canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) + << "found a remaining noncanonical IV"); + } + } + if (!AllCanonical) + return false; + + // Insert the computation for the loop limit into the Preheader. + Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), + Preheader->getTerminator()); + DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); + + // Canonicalize the loop latch. 
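+  // After canonicalization (see canonicalizeLoopLatch above) the latch ends
+  // in a single unsigned comparison of the canonical IV against the limit,
+  // roughly:
+  //   br i1 (icmp ult %iv, %limit), label %header, label %exit
+  // which lets the outlined helper later run any half-open subrange
+  // [start, end) simply by substituting a different limit operand.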
+ assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, + CanonicalSCEV, Limit) && + "Loop backedge is not guarded by canonical comparison with limit."); + Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + + // Insert computation of grainsize into the Preheader. + // For debugging: + // Value *GrainVar = ConstantInt::get(Limit->getType(), 2); + Value *GrainVar = computeGrainsize(LimitVar); + DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "grainsize value " << *GrainVar << "\n"); + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UsingGrainsize", + // L->getStartLoc(), Header) + // << "grainsize: " << NV("Grainsize", GrainVar)); + + /// Clone the loop into a new function. + + // Get the inputs and outputs for the Loop blocks. + SetVector Inputs, Outputs; + SetVector BodyInputs, BodyOutputs; + ValueToValueMapTy VMap, InputMap; + std::vector LoopBlocks; + SmallPtrSet ExitsToSplit; + + // Get the sync region containing this Tapir loop. + const Instruction *InputSyncRegion; + { + const DetachInst *DI = cast(Header->getTerminator()); + InputSyncRegion = cast(DI->getSyncRegion()); + } + + // Add start iteration, end iteration, and grainsize to inputs. + { + LoopBlocks = L->getBlocks(); + // // Add exit blocks terminated by unreachable. There should not be any other + // // exit blocks in the loop. + // SmallSet UnreachableExits; + // for (BasicBlock *Exit : ExitBlocks) { + // if (Exit == ExitBlock) continue; + // assert(isa(Exit->getTerminator()) && + // "Found problematic exit block."); + // UnreachableExits.insert(Exit); + // } + + // Add unreachable and exception-handling exits to the set of loop blocks to + // clone. + DEBUG({ + dbgs() << "Handled exits of loop:"; + for (BasicBlock *HE : HandledExits) + dbgs() << *HE; + dbgs() << "\n"; + }); + for (BasicBlock *HE : HandledExits) + LoopBlocks.push_back(HE); + { + const DetachInst *DI = cast(Header->getTerminator()); + BasicBlockEdge DetachEdge(Header, DI->getDetached()); + for (BasicBlock *HE : HandledExits) + if (!DT || !DT->dominates(DetachEdge, HE)) + ExitsToSplit.insert(HE); + DEBUG({ + dbgs() << "Loop exits to split:"; + for (BasicBlock *ETS : ExitsToSplit) + dbgs() << *ETS; + dbgs() << "\n"; + }); + } + + // DEBUG({ + // dbgs() << "LoopBlocks: "; + // for (BasicBlock *LB : LoopBlocks) + // dbgs() << LB->getName() << "(" + // << *(LB->getTerminator()) << "), "; + // dbgs() << "\n"; + // }); + + // Get the inputs and outputs for the loop body. + { + // CodeExtractor Ext(LoopBlocks, DT); + // Ext.findInputsOutputs(BodyInputs, BodyOutputs); + SmallPtrSet Blocks; + for (BasicBlock *BB : LoopBlocks) + Blocks.insert(BB); + findInputsOutputs(Blocks, BodyInputs, BodyOutputs, &ExitsToSplit); + } + + // Add argument for start of CanonicalIV. + DEBUG({ + Value *CanonicalIVInput = + CanonicalIV->getIncomingValueForBlock(Preheader); + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + }); + Argument *StartArg = new Argument(CanonicalIV->getType(), + CanonicalIV->getName()+".start"); + Inputs.insert(StartArg); + InputMap[CanonicalIV] = StartArg; + + // Add argument for end. + // + // In the general case, the loop limit is the result of some computation + // that the pass added to the loop's preheader. In this case, the variable + // storing the loop limit is used exactly once, in the canonicalized loop + // latch. 
In this case, the pass wants to prevent outlining from passing + // the loop-limit variable as an arbitrary argument to the outlined + // function. Hence, this pass adds the loop-limit variable as an argument + // manually. + // + // There are two special cases to consider: the loop limit is a constant, or + // the loop limit is used elsewhere within the loop. To handle these two + // cases, this pass adds an explict argument for the end of the loop, to + // supports the subsequent transformation to using recursive + // divide-and-conquer. After the loop is outlined, this pass will rewrite + // the latch in the outlined loop to use this explicit argument. + // Furthermore, this pass does not prevent outliner from recognizing the + // loop limit as a potential argument to the function. + if (isa(LimitVar) || !LimitVar->hasOneUse()) { + Argument *EndArg = new Argument(LimitVar->getType(), "end"); + Inputs.insert(EndArg); + InputMap[LimitVar] = EndArg; + } else { + // If the limit var is not constant and has exactly one use, then the + // limit var is the result of some nontrivial computation, and that one + // use is the new condition inserted. + Inputs.insert(LimitVar); + InputMap[LimitVar] = LimitVar; + } + + // Add argument for grainsize. + if (isa(GrainVar)) { + Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize"); + Inputs.insert(GrainArg); + InputMap[GrainVar] = GrainArg; + } else { + Inputs.insert(GrainVar); + InputMap[GrainVar] = GrainVar; + } + + // Put all of the inputs together, and clear redundant inputs from + // the set for the loop body. + SmallVector BodyInputsToRemove; + for (Value *V : BodyInputs) + if (V == InputSyncRegion) + BodyInputsToRemove.push_back(V); + else if (!Inputs.count(V)) + Inputs.insert(V); + else + BodyInputsToRemove.push_back(V); + for (Value *V : BodyInputsToRemove) + BodyInputs.remove(V); + DEBUG({ + for (Value *V : BodyInputs) + dbgs() << "Remaining body input: " << *V << "\n"; + }); + for (Value *V : BodyOutputs) + dbgs() << "EL output: " << *V << "\n"; + assert(0 == BodyOutputs.size() && + "All results from parallel loop should be passed by memory already."); + } + DEBUG({ + for (Value *V : Inputs) + dbgs() << "EL input: " << *V << "\n"; + for (Value *V : Outputs) + dbgs() << "EL output: " << *V << "\n"; + }); + + // Clone the loop blocks into a new helper function. + Function *Helper; + { + SmallVector Returns; // Ignore returns cloned. + + // LowerDbgDeclare(*(Header->getParent())); + + Helper = CreateHelper(Inputs, Outputs, LoopBlocks, + Header, Preheader, ExitBlock, + VMap, M, + F->getSubprogram() != nullptr, Returns, ".ls", + &ExitsToSplit, InputSyncRegion, + nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning loop."); + + // Use a fast calling convention for the helper. + Helper->setCallingConv(CallingConv::Fast); + // Helper->setCallingConv(Header->getParent()->getCallingConv()); + } + + // Add a sync to the helper's return. + BasicBlock *HelperHeader = cast(VMap[Header]); + { + BasicBlock *HelperExit = cast(VMap[ExitBlock]); + assert(isa(HelperExit->getTerminator())); + BasicBlock *NewHelperExit = SplitBlock(HelperExit, + HelperExit->getTerminator(), + DT, LI); + IRBuilder<> Builder(&(HelperExit->front())); + SyncInst *NewSync = Builder.CreateSync( + NewHelperExit, + cast(VMap[InputSyncRegion])); + // Set debug info of new sync to match that of terminator of the header of + // the cloned loop. 
+ NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); + HelperExit->getTerminator()->eraseFromParent(); + } + + // // Add syncs to the helper's cloned resume blocks. + // for (BasicBlock *BB : Resumes) { + // BasicBlock *HelperResume = cast(VMap[BB]); + // assert(isa(HelperResume->getTerminator())); + // BasicBlock *NewHelperResume = SplitBlock(HelperResume, + // HelperResume->getTerminator(), + // DT, LI); + // IRBuilder<> Builder(&(HelperResume->front())); + // SyncInst *NewSync = Builder.CreateSync(NewHelperResume); + // // Set debug info of new sync to match that of terminator of the header of + // // the cloned loop. + // NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); + // HelperResume->getTerminator()->eraseFromParent(); + // } + + BasicBlock *NewPreheader = cast(VMap[Preheader]); + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + + // Rewrite the cloned IV's to start at the start iteration argument. + { + // Rewrite clone of canonical IV to start at the start iteration + // argument. + Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); + { + int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned canonical IV does not inherit a constant value from cloned preheader."); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + } + + // Rewrite other cloned IV's to start at their value at the start + // iteration. + const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { + if (CanonicalIV == IV) continue; + + // Get the value of the IV at the start iteration. + DEBUG(dbgs() << "IV " << *IV); + const SCEV *IVSCEV = SE.getSCEV(IV); + DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); + const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); + const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); + DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << + " to " << *IVAtIter << "\n"); + + // NOTE: Expanded code should not refer to other IV's. + Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), + NewPreheader->getTerminator()); + + + // Set the value that the cloned IV inherits from the cloned preheader. + PHINode *NewIV = cast(VMap[IV]); + int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned IV does not inherit a constant value from cloned preheader."); + NewIV->setIncomingValue(NewPreheaderIdx, IVStart); + } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + } + + // The loop has been outlined by this point. To handle the special cases + // where the loop limit was constant or used elsewhere within the loop, this + // pass rewrites the outlined loop-latch condition to use the explicit + // end-iteration argument. 
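+  // Concretely (illustrative): a cloned latch condition such as
+  //   %cond = icmp ult i64 %iv.next, 100      ; trip count was a constant
+  // is rewritten inside the helper to
+  //   %cond = icmp ult i64 %iv.next, %end     ; %end is the explicit argument
+  // so that each recursive invocation of the helper only covers its own
+  // [start, end) slice of the iteration space.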
+ if (isa(LimitVar) || !LimitVar->hasOneUse()) { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(((isa(LimitVar) && + HelperCond->getOperand(1) == LimitVar) || + (!LimitVar->hasOneUse() && + HelperCond->getOperand(1) == VMap[LimitVar])) && + "Unexpected condition in loop latch."); + IRBuilder<> Builder(HelperCond); + Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), + VMap[InputMap[LimitVar]]); + HelperCond->replaceAllUsesWith(NewHelperCond); + HelperCond->eraseFromParent(); + DEBUG(dbgs() << "Rewritten Latch: " << + *(cast(NewHelperCond)->getParent())); + } + + // DEBUGGING: Simply serialize the cloned loop. + // BasicBlock *NewHeader = cast(VMap[Header]); + // SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); + implementDACIterSpawnOnHelper(Helper, NewPreheader, + cast(VMap[Header]), + cast(VMap[CanonicalIV]), + cast(VMap[InputMap[LimitVar]]), + cast(VMap[InputMap[GrainVar]]), + cast(VMap[InputSyncRegion]), + /*DT=*/nullptr, /*LI=*/nullptr, + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW), + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNSW)); + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Update allocas in cloned loop body. + { + // Collect reattach instructions. + SmallVector ReattachPoints; + for (pred_iterator PI = pred_begin(Latch), PE = pred_end(Latch); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (!isa(Pred->getTerminator())) continue; + if (L->contains(Pred)) + ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); + } + // The cloned loop should be serialized by this point. + BasicBlock *ClonedLoopBodyEntry = + cast(VMap[Header])->getSingleSuccessor(); + assert(ClonedLoopBodyEntry && + "Head of cloned loop body has multiple successors."); + bool ContainsDynamicAllocas = + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedLoopBodyEntry, + ReattachPoints); + + // If the cloned loop contained dynamic alloca instructions, wrap the cloned + // loop with llvm.stacksave/llvm.stackrestore intrinsics. + if (ContainsDynamicAllocas) { + Module *M = Helper->getParent(); + // Get the two intrinsics we care about. + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M,Intrinsic::stackrestore); + + // Insert the llvm.stacksave. + CallInst *SavedPtr = IRBuilder<>(&*ClonedLoopBodyEntry, + ClonedLoopBodyEntry->begin()) + .CreateCall(StackSave, {}, "savedstack"); + + // Insert a call to llvm.stackrestore before the reattaches in the + // original Tapir loop. + for (Instruction *ExitPoint : ReattachPoints) + IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr); + } + } + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(F, Inputs, VMap, + Preheader->getTerminator(), AC, DT); + + // Add call to new helper function in original function. + { + // Setup arguments for call. + SmallVector TopCallArgs; + // Add start iteration 0. + assert(CanonicalSCEV->getStart()->isZero() && + "Canonical IV does not start at zero."); + TopCallArgs.push_back(ConstantInt::get(CanonicalIV->getType(), 0)); + // Add loop limit. + TopCallArgs.push_back(LimitVar); + // Add grainsize. + TopCallArgs.push_back(GrainVar); + // Add the rest of the arguments. 
+ for (Value *V : BodyInputs) + TopCallArgs.push_back(V); + DEBUG({ + for (Value *TCArg : TopCallArgs) + dbgs() << "Top call arg: " << *TCArg << "\n"; + }); + + // Create call instruction. + IRBuilder<> Builder(Preheader->getTerminator()); + CallInst *TopCall = Builder.CreateCall(Helper, + ArrayRef(TopCallArgs)); + + // Use a fast calling convention for the helper. + TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(Helper->getCallingConv()); + TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // // Update CG graph with the call we just added. + // CG[F]->addCalledFunction(TopCall, CG[Helper]); + } + + // Remove sync of loop in parent. + { + // Get the sync region for this loop's detached iterations. + DetachInst *HeadDetach = cast(Header->getTerminator()); + Value *SyncRegion = HeadDetach->getSyncRegion(); + // Check the Tapir instructions contained in this sync region. Look for a + // single sync instruction among those Tapir instructions. Meanwhile, + // verify that the only detach instruction in this sync region is the detach + // in theloop header. If these conditions are met, then we assume that the + // sync applies to this loop. Otherwise, something more complicated is + // going on, and we give up. + SyncInst *LoopSync = nullptr; + bool SingleSyncJustForLoop = true; + for (User *U : SyncRegion->users()) { + // Skip the detach in the loop header. + if (HeadDetach == U) continue; + // Remember the first sync instruction we find. If we find multiple sync + // instructions, then something nontrivial is going on. + if (SyncInst *SI = dyn_cast(U)) { + if (!LoopSync) + LoopSync = SI; + else + SingleSyncJustForLoop = false; + } + // If we find a detach instruction that is not the loop header's, then + // something nontrivial is going on. + if (isa(U)) + SingleSyncJustForLoop = false; + } + if (LoopSync && SingleSyncJustForLoop) + // Replace the sync with a branch. + ReplaceInstWithInst(LoopSync, + BranchInst::Create(LoopSync->getSuccessor(0))); + else if (!LoopSync) + DEBUG(dbgs() << "No sync found for this loop."); + else + DEBUG(dbgs() << "No single sync found that only affects this loop."); + } + + ++LoopsConvertedToDAC; + + unlinkLoop(); + + return Helper; +} + +/// \brief Replace the latch of the loop to check that IV is always less than or +/// equal to the limit. +/// +/// This method assumes that the loop has a single loop latch. +Value* CilkABILoopSpawning::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { + Loop *L = OrigLoop; + + Value *NewCondition; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "No single loop latch found for loop."); + + IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); + + // This process assumes that IV's increment is in Latch. + + // Create comparison between IV and Limit at top of Latch. + NewCondition = + Builder.CreateICmpULT(Builder.CreateAdd(IV, + ConstantInt::get(IV->getType(), 1)), + Limit); + + // Replace the conditional branch at the end of Latch. + BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); + assert(LatchBr && LatchBr->isConditional() && + "Latch does not terminate with a conditional branch."); + Builder.SetInsertPoint(Latch->getTerminator()); + Builder.CreateCondBr(NewCondition, Header, ExitBlock); + + // Erase the old conditional branch. + LatchBr->eraseFromParent(); + + return NewCondition; +} + +/// Top-level call to convert a Tapir loop to be processed using an appropriate +/// Cilk ABI call. 
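+///
+/// Unlike the divide-and-conquer variant, this path does not spawn
+/// recursively itself: it packs the loop body's remaining live-ins into a
+/// closure struct and hands the outlined body to the runtime. A sketch of
+/// the resulting call (signatures follow the Cilk Plus runtime ABI; the exact
+/// types are simplified here):
+///
+///   // generated helper: runs iterations [low, high) against the closure
+///   void helper(void *closure, uint32_t low, uint32_t high);
+///   ...
+///   __cilkrts_cilk_for_32(helper, &closure, limit, /*grain=*/0);
+///
+/// with __cilkrts_cilk_for_64 used instead when the trip count does not fit
+/// in 32 bits; a grain of 0 lets the runtime choose the grainsize.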
+bool CilkABILoopSpawning::processLoop() { + Loop *L = OrigLoop; + + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + + using namespace ore; + + // Check the exit blocks of the loop. + if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + L->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!isa(Exit->getTerminator())) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + L->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + Function *F = Header->getParent(); + Module* M = F->getParent(); + + DEBUG(dbgs() << "LS loop header:" << *Header); + DEBUG(dbgs() << "LS loop latch:" << *Latch); + + // DEBUG(dbgs() << "LS SE backedge taken count: " << *(SE.getBackedgeTakenCount(L)) << "\n"); + // DEBUG(dbgs() << "LS SE max backedge taken count: " << *(SE.getMaxBackedgeTakenCount(L)) << "\n"); + DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); + + /// Get loop limit. + const SCEV *BETC = SE.getExitCount(L, Latch); + const SCEV *Limit = SE.getAddExpr(BETC, SE.getOne(BETC->getType())); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + // PredicatedScalarEvolution PSE(SE, *L); + // const SCEV *PLimit = PSE.getExitCount(L, Latch); + // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return false; + } + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), + // Header) + // << "loop limit: " << NV("Limit", Limit)); + /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + // emitAnalysis(LoopSpawningReport() + // << "Could not get a canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return false; + } + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); + + // Remove all IV's other can CanonicalIV. + // First, check that we can do this. 
+ bool CanRemoveIVs = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (CanonicalIV == PN) continue; + // dbgs() << "IV " << *PN; + const SCEV *S = SE.getSCEV(PN); + // dbgs() << " SCEV " << *S << "\n"; + if (SE.getCouldNotCompute() == S) { + // emitAnalysis(LoopSpawningReport(PN) + // << "Could not compute the scalar evolution.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) + << "could not compute scalar evolution of " + << NV("PHINode", PN)); + CanRemoveIVs = false; + } + } + + if (!CanRemoveIVs) { + DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n"); + return false; + } + + //////////////////////////////////////////////////////////////////////// + // We now have everything we need to extract the loop. It's time to + // do some surgery. + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this process such that we don't require all + // IV's to be canonical. + { + SmallVector IVsToRemove; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN == CanonicalIV) continue; + const SCEV *S = SE.getSCEV(PN); + Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); + PN->replaceAllUsesWith(NewIV); + IVsToRemove.push_back(PN); + } + for (PHINode *PN : IVsToRemove) + PN->eraseFromParent(); + } + + // All remaining IV's should be canonical. Collect them. + // + // TODO?: We can probably adapt this process such that we don't require all + // IV's to be canonical. + SmallVector IVs; + bool AllCanonical = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + DEBUG({ + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr"); + assert(PNSCEV->getStart()->isZero() && + "PHINode SCEV does not start at 0"); + dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is " + << *(PNSCEV->getStepRecurrence(SE)) << "\n"; + assert(PNSCEV->getStepRecurrence(SE)->isOne() && + "PHINode SCEV step is not 1"); + }); + if (ConstantInt *C = + dyn_cast(PN->getIncomingValueForBlock(Preheader))) { + if (C->isZero()) + IVs.push_back(PN); + } else { + AllCanonical = false; + DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << "\n"); + // emitAnalysis(LoopSpawningReport(PN) + // << "Found a remaining non-canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) + << "found a remaining noncanonical IV"); + } + } + if (!AllCanonical) + return false; + + // Insert the computation for the loop limit into the Preheader. + Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), + Preheader->getTerminator()); + DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); + + // Canonicalize the loop latch. + Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + + /// Clone the loop into a new function. + + // Get the inputs and outputs for the Loop blocks. + SetVector Inputs, Outputs; + SetVector BodyInputs, BodyOutputs; + ValueToValueMapTy VMap, InputMap; + std::vector LoopBlocks; + AllocaInst* closure; + // Add start iteration, end iteration, and grainsize to inputs. + { + LoopBlocks = L->getBlocks(); + // // Add exit blocks terminated by unreachable. There should not be any other + // // exit blocks in the loop. 
+ // SmallSet UnreachableExits; + // for (BasicBlock *Exit : ExitBlocks) { + // if (Exit == ExitBlock) continue; + // assert(isa(Exit->getTerminator()) && + // "Found problematic exit block."); + // UnreachableExits.insert(Exit); + // } + + // // Add unreachable and exception-handling exits to the set of loop blocks to + // // clone. + // for (BasicBlock *BB : UnreachableExits) + // LoopBlocks.push_back(BB); + // for (BasicBlock *BB : EHExits) + // LoopBlocks.push_back(BB); + + // DEBUG({ + // dbgs() << "LoopBlocks: "; + // for (BasicBlock *LB : LoopBlocks) + // dbgs() << LB->getName() << "(" + // << *(LB->getTerminator()) << "), "; + // dbgs() << "\n"; + // }); + + // Get the inputs and outputs for the loop body. + { + // CodeExtractor Ext(LoopBlocks, DT); + // Ext.findInputsOutputs(BodyInputs, BodyOutputs); + SmallPtrSet Blocks; + for (BasicBlock *BB : LoopBlocks) + Blocks.insert(BB); + findInputsOutputs(Blocks, BodyInputs, BodyOutputs); + } + + // Add argument for start of CanonicalIV. + DEBUG({ + Value *CanonicalIVInput = + CanonicalIV->getIncomingValueForBlock(Preheader); + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + }); + Argument *StartArg = new Argument(CanonicalIV->getType(), + CanonicalIV->getName()+".start"); + Inputs.insert(StartArg); + InputMap[CanonicalIV] = StartArg; + + // Add argument for end. + Value* ea; + if (isa(LimitVar)) { + Argument *EndArg = new Argument(LimitVar->getType(), "end"); + Inputs.insert(EndArg); + ea = InputMap[LimitVar] = EndArg; + } else { + Inputs.insert(LimitVar); + ea = InputMap[LimitVar] = LimitVar; + } + + // Put all of the inputs together, and clear redundant inputs from + // the set for the loop body. + SmallVector BodyInputsToRemove; + SmallVector StructInputs; + SmallVector StructIT; + for (Value *V : BodyInputs) { + if (!Inputs.count(V)) { + StructInputs.push_back(V); + StructIT.push_back(V->getType()); + } + else + BodyInputsToRemove.push_back(V); + } + StructType* ST = StructType::create(StructIT); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B2(L->getHeader()->getFirstNonPHIOrDbgOrLifetime()); + closure = B.CreateAlloca(ST); + for(unsigned i=0; iuse_begin(), E = StructInputs[i]->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + if (Usr && !L->contains(Usr->getParent())) + continue; + U.set(l2); + } + } + Inputs.insert(closure); + //llvm::errs() << "\n"; + //for(auto& a : Inputs) a->dump(); + //llvm::errs() << "\n"; + //StartArg->dump(); + //ea->dump(); + Inputs.remove(StartArg); + Inputs.insert(StartArg); + Inputs.remove(ea); + Inputs.insert(ea); + //llvm::errs() << "\n"; + //for(auto& a : Inputs) a->dump(); + //llvm::errs() << "\n"; + for (Value *V : BodyInputsToRemove) + BodyInputs.remove(V); + assert(0 == BodyOutputs.size() && + "All results from parallel loop should be passed by memory already."); + } + DEBUG({ + for (Value *V : Inputs) + dbgs() << "EL input: " << *V << "\n"; + for (Value *V : Outputs) + dbgs() << "EL output: " << *V << "\n"; + }); + + + Function *Helper; + { + SmallVector Returns; // Ignore returns cloned. 
+ + // LowerDbgDeclare(*(Header->getParent())); + + Helper = CreateHelper(Inputs, Outputs, L->getBlocks(), + Header, Preheader, ExitBlock/*L->getExitBlock()*/, + VMap, M, + F->getSubprogram() != nullptr, Returns, ".ls", + nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning loop."); + + // Use a fast calling convention for the helper. + //Helper->setCallingConv(CallingConv::Fast); + // Helper->setCallingConv(Header->getParent()->getCallingConv()); + } + + BasicBlock *NewPreheader = cast(VMap[Preheader]); + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + + // Rewrite the cloned IV's to start at the start iteration argument. + { + // Rewrite clone of canonical IV to start at the start iteration + // argument. + Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); + { + int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned canonical IV does not inherit a constant value from cloned preheader."); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + } + + // Rewrite other cloned IV's to start at their value at the start + // iteration. + const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { + if (CanonicalIV == IV) continue; + + // Get the value of the IV at the start iteration. + DEBUG(dbgs() << "IV " << *IV); + const SCEV *IVSCEV = SE.getSCEV(IV); + DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); + const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); + const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); + DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << + " to " << *IVAtIter << "\n"); + + // NOTE: Expanded code should not refer to other IV's. + Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), + NewPreheader->getTerminator()); + + + // Set the value that the cloned IV inherits from the cloned preheader. + PHINode *NewIV = cast(VMap[IV]); + int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned IV does not inherit a constant value from cloned preheader."); + NewIV->setIncomingValue(NewPreheaderIdx, IVStart); + } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + } + + // If the loop limit is constant, then rewrite the loop latch + // condition to use the end-iteration argument. 
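+  // As in the divide-and-conquer path, a constant trip count would otherwise
+  // be baked into the cloned latch; swapping it for the "end" argument lets
+  // the runtime drive the helper over arbitrary [low, high) slices.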
+ if (isa(LimitVar)) { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(HelperCond->getOperand(1) == LimitVar); + IRBuilder<> Builder(HelperCond); + Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), + VMap[InputMap[LimitVar]]); + HelperCond->replaceAllUsesWith(NewHelperCond); + HelperCond->eraseFromParent(); + } + + // For debugging: + BasicBlock *NewHeader = cast(VMap[Header]); + SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); + { + Value* v = &*Helper->arg_begin(); + auto UI = v->use_begin(), E = v->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + Usr->moveBefore(Helper->getEntryBlock().getTerminator()); + + auto UI2 = Usr->use_begin(), E2 = Usr->use_end(); + for (; UI2 != E2;) { + Use &U2 = *UI2; + ++UI2; + auto *Usr2 = dyn_cast(U2.getUser()); + Usr2->moveBefore(Helper->getEntryBlock().getTerminator()); + } + } + } + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Add call to new helper function in original function. + { + // Setup arguments for call. + SetVector TopCallArgs; + // Add start iteration 0. + assert(CanonicalSCEV->getStart()->isZero() && + "Canonical IV does not start at zero."); + TopCallArgs.insert(ConstantInt::get(CanonicalIV->getType(), 0)); + // Add loop limit. + TopCallArgs.insert(LimitVar); + // Add grainsize. + //TopCallArgs.insert(GrainVar); + // Add the rest of the arguments. + for (Value *V : BodyInputs) + TopCallArgs.insert(V); + + // Create call instruction. + IRBuilder<> Builder(Preheader->getTerminator()); + + llvm::Function* F; + if( ((llvm::IntegerType*)LimitVar->getType())->getBitWidth() == 32 ) + F = CILKRTS_FUNC(cilk_for_32, *M); + else { + assert( ((llvm::IntegerType*)LimitVar->getType())->getBitWidth() == 64 ); + F = CILKRTS_FUNC(cilk_for_64, *M); + } + llvm::Value* args[] = { + Builder.CreatePointerCast(Helper, F->getFunctionType()->getParamType(0)), + Builder.CreatePointerCast(closure, F->getFunctionType()->getParamType(1)), + LimitVar, + ConstantInt::get(IntegerType::get(F->getContext(), sizeof(int)*8),0) + }; + + /*CallInst *TopCall = */Builder.CreateCall(F, args); + + // Use a fast calling convention for the helper. + //TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(Helper->getCallingConv()); + //TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // // Update CG graph with the call we just added. + // CG[F]->addCalledFunction(TopCall, CG[Helper]); + } + + ++LoopsConvertedToCilkABI; + + unlinkLoop(); + + return Helper; +} + +/// Checks if this loop is a Tapir loop. Right now we check that the loop is +/// in a canonical form: +/// 1) The header detaches the body. +/// 2) The loop contains a single latch. +/// 3) The body reattaches to the latch (which is necessary for a valid +/// detached CFG). +/// 4) The loop only branches to the exit block from the header or the latch. +bool LoopSpawningImpl::isTapirLoop(const Loop *L) { + const BasicBlock *Header = L->getHeader(); + const BasicBlock *Latch = L->getLoopLatch(); + // const BasicBlock *Exit = L->getExitBlock(); + + // DEBUG(dbgs() << "LS checking if Tapir loop: " << *L); + + // Header must be terminated by a detach. + if (!isa(Header->getTerminator())) { + DEBUG(dbgs() << "LS loop header is not terminated by a detach: " << *L << "\n"); + return false; + } + + // Loop must have a unique latch. 
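+  // The canonical shape being checked looks roughly like this (sketch, using
+  // Tapir's detach/reattach terminators):
+  //
+  //   header:  detach within %syncreg, label %body, label %latch
+  //   body:    ...                          ; detached loop body
+  //            reattach within %syncreg, label %latch
+  //   latch:   %iv.next = add %iv, 1
+  //            br i1 %cond, label %header, label %exit
+  //
+  // The latch is both the continuation of the header's detach and the sole
+  // source of the backedge, so it must exist and be unique: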
+ if (nullptr == Latch) { + DEBUG(dbgs() << "LS loop does not have a unique latch: " << *L << "\n"); + return false; + } + + // // Loop must have a unique exit block. + // if (nullptr == Exit) { + // DEBUG(dbgs() << "LS loop does not have a unique exit block: " << *L << "\n"); + // SmallVector ExitBlocks; + // L->getUniqueExitBlocks(ExitBlocks); + // for (BasicBlock *Exit : ExitBlocks) + // DEBUG(dbgs() << *Exit); + // return false; + // } + + // Continuation of header terminator must be the latch. + const DetachInst *HeaderDetach = cast(Header->getTerminator()); + const BasicBlock *Continuation = HeaderDetach->getContinue(); + if (Continuation != Latch) { + DEBUG(dbgs() << "LS continuation of detach in header is not the latch: " + << *L << "\n"); + return false; + } + + // All other predecessors of Latch are terminated by reattach instructions. + for (auto PI = pred_begin(Latch), PE = pred_end(Latch); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + if (Header == Pred) continue; + if (!isa(Pred->getTerminator())) { + DEBUG(dbgs() << "LS Latch has a predecessor that is not terminated " + << "by a reattach: " << *L << "\n"); + return false; + } + } + + // Get the exit block from Latch. + BasicBlock *Exit = Latch->getTerminator()->getSuccessor(0); + if (Header == Exit) + Exit = Latch->getTerminator()->getSuccessor(1); + + // The only predecessors of Exit inside the loop are Header and Latch. + for (auto PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + if (!L->contains(Pred)) + continue; + if (Header != Pred && Latch != Pred) { + DEBUG(dbgs() << "LS Loop branches to exit block from a block " + << "other than the header or latch" << *L << "\n"); + return false; + } + } + + return true; +} + +/// This routine recursively examines all descendants of the specified loop and +/// adds all Tapir loops in that tree to the vector. This routine performs a +/// pre-order traversal of the tree of loops and pushes each Tapir loop found +/// onto the end of the vector. +void LoopSpawningImpl::addTapirLoop(Loop *L, SmallVectorImpl &V) { + if (isTapirLoop(L)) { + V.push_back(L); + return; + } + + LoopSpawningHints Hints(L, ORE); + + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + + using namespace ore; + + if (LoopSpawningHints::ST_SEQ != Hints.getStrategy()) { + DEBUG(dbgs() << "LS: Marked loop is not a valid Tapir loop.\n" + << "\tLoop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NotTapir", + L->getStartLoc(), L->getHeader()) + << "marked loop is not a valid Tapir loop"); + } + + for (Loop *InnerL : *L) + addTapirLoop(InnerL, V); +} + +#ifndef NDEBUG +/// \return string containing a file name and a line # for the given loop. +static std::string getDebugLocString(const Loop *L) { + std::string Result; + if (L) { + raw_string_ostream OS(Result); + if (const DebugLoc LoopDbgLoc = L->getStartLoc()) + LoopDbgLoc.print(OS); + else + // Just print the module name. + OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); + OS.flush(); + } + return Result; +} +#endif + +bool LoopSpawningImpl::run() { + // Build up a worklist of inner-loops to vectorize. This is necessary as + // the act of vectorizing or partially unrolling a loop creates new loops + // and can invalidate iterators across the loops. 
+ SmallVector Worklist; + + // Examine all top-level loops in this function, and call addTapirLoop to push + // those loops onto the work list. + for (Loop *L : LI) + addTapirLoop(L, Worklist); + + LoopsAnalyzed += Worklist.size(); + + // Now walk the identified inner loops. + bool Changed = false; + while (!Worklist.empty()) + // Process the work list of loops backwards. For each tree of loops in this + // function, addTapirLoop pushed those loops onto the work list according to + // a pre-order tree traversal. Therefore, processing the work list + // backwards leads us to process innermost loops first. + Changed |= processLoop(Worklist.pop_back_val()); + + // Process each loop nest in the function. + return Changed; +} + +// Top-level routine to process a given loop. +bool LoopSpawningImpl::processLoop(Loop *L) { +#ifndef NDEBUG + const std::string DebugLocStr = getDebugLocString(L); +#endif /* NDEBUG */ + + // Function containing loop + Function *F = L->getHeader()->getParent(); + + DEBUG(dbgs() << "\nLS: Checking a Tapir loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << ": " << *L << "\n"); + + LoopSpawningHints Hints(L, ORE); + + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + + using namespace ore; + + // Get the loop preheader. LoopSimplify should guarantee that the loop + // preheader is not terminated by a sync. + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "LS: Loop lacks a preheader.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoPreheader", + L->getStartLoc(), L->getHeader()) + << "loop lacks a preheader"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } else if (!isa(Preheader->getTerminator())) { + DEBUG(dbgs() << "LS: Loop preheader is not terminated by a branch.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "ComplexPreheader", + L->getStartLoc(), L->getHeader()) + << "loop preheader not terminated by a branch"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + + switch(Hints.getStrategy()) { + case LoopSpawningHints::ST_SEQ: + DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); + break; + case LoopSpawningHints::ST_DAC: + DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); + { + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + DACLoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + } + break; + case LoopSpawningHints::ST_END: + dbgs() << "LS: Hints specify unknown spawning strategy.\n"; + break; + } + return false; +} + +// PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) { +// // Find functions that detach for processing. 
+// SmallVector WorkList; +// for (Function &F : M) +// for (BasicBlock &BB : F) +// if (isa(BB.getTerminator())) +// WorkList.push_back(&F); + +// if (WorkList.empty()) +// return PreservedAnalyses::all(); + +// bool Changed = false; +// while (!WorkList.empty()) { +// Function *F = WorkList.back(); +// auto &TLI = AM.getResult(M); +// auto &FAM = AM.getResult(M).getManager(); +// auto &LI = FAM.getResult(*F); +// auto &SE = FAM.getResult(*F); +// auto &DT = FAM.getResult(*F); +// auto &TTI = FAM.getResult(*F); +// auto &AA = FAM.getResult(*F); +// auto &AC = FAM.getResult(*F); +// auto &ORE = FAM.getResult(*F); +// LoopSpawningImpl Impl(*F, LI, SE, DT, TTI, &TLI, AA, AC, ORE); +// Changed |= Impl.run(); +// WorkList.pop_back(); +// } + +// if (Changed) +// return PreservedAnalyses::none(); +// return PreservedAnalyses::all(); +// } + +PreservedAnalyses LoopSpawningPass::run(Function &F, + FunctionAnalysisManager &AM) { + // Determine if function detaches. + bool DetachingFunction = false; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + DetachingFunction = true; + + if (!DetachingFunction) + return PreservedAnalyses::all(); + + auto &LI = AM.getResult(F); + auto &SE = AM.getResult(F); + auto &DT = AM.getResult(F); + // auto &TTI = AM.getResult(F); + // auto &TLI = AM.getResult(M); + // auto &AA = AM.getResult(F); + auto &AC = AM.getResult(F); + auto &ORE = + AM.getResult(F); + // OptimizationRemarkEmitter ORE(F); + + bool Changed = LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run(); + + AM.invalidate(F); + + if (Changed) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +struct LoopSpawning : public FunctionPass { + /// Pass identification, replacement for typeid + static char ID; + + explicit LoopSpawning() : FunctionPass(ID) { + initializeLoopSpawningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + bool DetachingFunction = false; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + DetachingFunction = true; + + if (!DetachingFunction) + return false; + + auto &LI = getAnalysis().getLoopInfo(); + auto &SE = getAnalysis().getSE(); + auto &DT = getAnalysis().getDomTree(); + // auto *TTI = &getAnalysis().getTTI(*F); + // auto *TLIP = getAnalysisIfAvailable(); + // auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + // auto *TLI = &getAnalysis().getTLI(); + // auto *AA = &getAnalysis(*F).getAAResults(); + auto &AC = getAnalysis().getAssumptionCache(F); + auto &ORE = + getAnalysis().getORE(); + // OptimizationRemarkEmitter ORE(F); + return LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run(); + } + + // bool runOnModule(Module &M) override { + // if (skipModule(M)) + // return false; + + // // Find functions that detach for processing. + // SmallVector WorkList; + // for (Function &F : M) + // for (BasicBlock &BB : F) + // if (isa(BB.getTerminator())) + // WorkList.push_back(&F); + + // if (WorkList.empty()) + // return false; + + // auto GetLI = [this](Function &F) -> LoopInfo & { + // return getAnalysis(F).getLoopInfo(); + // }; + // auto GetSE = [this](Function &F) -> ScalarEvolution & { + // return getAnalysis(F).getSE(); + // }; + // auto GetDT = [this](Function &F) -> DominatorTree & { + // return this->getAnalysis(F).getDomTree(); + // }; + + // bool Changed = false; + // while (!WorkList.empty()) { + // // Process the next function. 
+ // Function *F = WorkList.back(); + // // auto *LI = &getAnalysis(*F).getLoopInfo(); + // // auto *SE = &getAnalysis(*F).getSE(); + // // auto *DT = &getAnalysis(*F).getDomTree(); + // // auto *TTI = &getAnalysis().getTTI(*F); + // // auto *TLIP = getAnalysisIfAvailable(); + // // auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + // // auto *TLI = &getAnalysis().getTLI(); + // // auto *AA = &getAnalysis(*F).getAAResults(); + // // auto *AC = &getAnalysis().getAssumptionCache(*F); + // auto &ORE = + // getAnalysis(*F).getORE(); + // // OptimizationRemarkEmitter ORE(F); + // // LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, *TTI, TLI, *AA, *AC, ORE); + // LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, ORE); + // Changed |= Impl.run(); + + // WorkList.pop_back(); + // } + // return Changed; + // } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + // AU.addRequired(); + // getAAResultsAnalysisUsage(AU); + // AU.addRequired(); + AU.addRequired(); + } +}; +} + +char LoopSpawning::ID = 0; +// static RegisterPass X(LS_NAME, "Transform Tapir loops to spawn iterations efficiently", false, false); +static const char ls_name[] = "Loop Spawning"; +INITIALIZE_PASS_BEGIN(LoopSpawning, LS_NAME, ls_name, false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +// INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(LoopSpawning, LS_NAME, ls_name, false, false) + +namespace llvm { +Pass *createLoopSpawningPass() { + return new LoopSpawning(); +} +} diff --git a/llvm/lib/Transforms/Tapir/LowerToCilk.cpp b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp new file mode 100644 index 00000000000000..2d8b1ccb82572e --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp @@ -0,0 +1,219 @@ +//===- LowerToCilk.cpp - Convert Tapir into Cilk runtime calls ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts functions that include Tapir instructions to call out to +// the Cilk runtime system. 
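+//
+// At a high level (illustrative; the per-instruction lowering lives in
+// CilkABI.cpp, and names follow the Cilk Plus runtime), a detach is outlined
+// into a spawn-helper function and the detach site is replaced with code
+// along the lines of
+//
+//   __cilkrts_enter_frame_1(&sf);
+//   if (!CILK_SETJMP(sf.ctx))
+//     spawn_helper(...);          // hypothetical name; calls __cilkrts_detach
+//   // continuation runs here and may be stolen
+//
+// while a sync becomes a conditional call to __cilkrts_sync(&sf). The pass
+// below drives this rewriting function by function and then inlines the
+// generated __cilk* runtime shims.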
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Tapir.h" + +#define DEBUG_TYPE "tapir2cilk" + +using namespace llvm; + +static cl::opt ClInstrumentCilk("instrument-cilk", cl::init(false), + cl::Hidden, + cl::desc("Instrument Cilk events")); + +cl::opt fastCilk("fast-cilk", cl::init(false), cl::Hidden, + cl::desc("Attempt faster cilk call implementation")); + +namespace { + +struct LowerTapirToCilk : public ModulePass { + static char ID; // Pass identification, replacement for typeid + bool DisablePostOpts; + bool Instrument; + explicit LowerTapirToCilk(bool DisablePostOpts = false, bool Instrument = false) + : ModulePass(ID), DisablePostOpts(DisablePostOpts), + Instrument(Instrument) { + initializeLowerTapirToCilkPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Simple Lowering of Tapir to Cilk ABI"; + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } +private: + ValueToValueMapTy DetachCtxToStackFrame; + bool unifyReturns(Function &F); + SmallVectorImpl *processFunction(Function &F, DominatorTree &DT, + AssumptionCache &AC); +}; +} // End of anonymous namespace + +char LowerTapirToCilk::ID = 0; +INITIALIZE_PASS_BEGIN(LowerTapirToCilk, "tapir2cilk", + "Simple Lowering of Tapir to Cilk ABI", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(LowerTapirToCilk, "tapir2cilk", + "Simple Lowering of Tapir to Cilk ABI", false, false) + +// Helper function to inline calls to compiler-generated Cilk Plus runtime +// functions when possible. This inlining is necessary to properly implement +// some Cilk runtime "calls," such as __cilkrts_detach(). +static inline void inlineCilkFunctions(Function &F) { + bool inlining = true; + while (inlining) { + inlining = false; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (CallInst *cal = dyn_cast(&*I)) + if (Function *fn = cal->getCalledFunction()) + if (fn->getName().startswith("__cilk")) { + InlineFunctionInfo IFI; + if (InlineFunction(cal, IFI)) { + if (fn->getNumUses()==0) + fn->eraseFromParent(); + inlining = true; + break; + } + } + } + + if (verifyFunction(F, &errs())) { + DEBUG(F.dump()); + assert(0); + } +} + +bool LowerTapirToCilk::unifyReturns(Function &F) { + SmallVector ReturningBlocks; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + ReturningBlocks.push_back(&BB); + + // If this function already has a single return, then terminate early. + if (ReturningBlocks.size() == 1) + return false; + + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), + "UnifiedReturnBlock", &F); + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... + PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. 
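+  // For example (sketch): two returning blocks
+  //   bb1: ret i32 %a          bb2: ret i32 %b
+  // become
+  //   bb1: br label %UnifiedReturnBlock
+  //   bb2: br label %UnifiedReturnBlock
+  //   UnifiedReturnBlock:
+  //     %UnifiedRetVal = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+  //     ret i32 %UnifiedRetVal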
+ // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + return true; +} + +SmallVectorImpl +*LowerTapirToCilk::processFunction(Function &F, DominatorTree &DT, + AssumptionCache &AC) { + if (fastCilk && F.getName()=="main") { + IRBuilder<> start(F.getEntryBlock().getFirstNonPHIOrDbg()); + auto m = start.CreateCall(CILKRTS_FUNC(init, *F.getParent())); + m->moveBefore(F.getEntryBlock().getTerminator()); + } + + if (unifyReturns(F)) + DT.recalculate(F); + + // Lower Tapir instructions in this function. Collect the set of helper + // functions generated by this process. + SmallVector *NewHelpers = new SmallVector(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + if (DetachInst* DI = dyn_cast_or_null(I->getTerminator())) { + // Lower a detach instruction, and collect the helper function generated + // in this process for executing the detached task. + Function *Helper = cilk::createDetach(*DI, DetachCtxToStackFrame, DT, AC, + ClInstrumentCilk || Instrument); + NewHelpers->push_back(Helper); + } else if (SyncInst* SI = dyn_cast_or_null(I->getTerminator())) { + // Lower a sync instruction. + cilk::createSync(*SI, DetachCtxToStackFrame, + ClInstrumentCilk || Instrument); + } + } + + if (verifyFunction(F, &errs())) { + DEBUG(F.dump()); + assert(0); + } + + // Inline Cilk runtime calls in the function and generated helper functions. + inlineCilkFunctions(F); + for (Function *H : *NewHelpers) + inlineCilkFunctions(*H); + + return NewHelpers; +} + +bool LowerTapirToCilk::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + // Add functions that detach to the work list. + SmallVector WorkList; + for (Function &F : M) + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) { + WorkList.push_back(&F); + break; + } + + if (WorkList.empty()) + return false; + + bool Changed = false; + std::unique_ptr> NewHelpers; + while (!WorkList.empty()) { + // Process the next function. + Function *F = WorkList.back(); + WorkList.pop_back(); + DominatorTree &DT = getAnalysis(*F).getDomTree(); + AssumptionCacheTracker &ACT = getAnalysis(); + NewHelpers.reset(processFunction(*F, DT, ACT.getAssumptionCache(*F))); + Changed |= !NewHelpers->empty(); + // Check the generated helper functions to see if any need to be processed, + // that is, to see if any of them themselves detach a subtask. + for (Function *Helper : *NewHelpers) + for (BasicBlock &BB : *Helper) + if (isa(BB.getTerminator())) + WorkList.push_back(Helper); + } + return Changed; +} + +// createLowerTapirToCilkPass - Provide an entry point to create this pass. +// +namespace llvm { +ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts, bool Instrument) { + return new LowerTapirToCilk(DisablePostOpts, Instrument); +} +} diff --git a/llvm/lib/Transforms/Tapir/Outline.cpp b/llvm/lib/Transforms/Tapir/Outline.cpp new file mode 100644 index 00000000000000..ce347c4bf7fdf6 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Outline.cpp @@ -0,0 +1,379 @@ +//===- TapirOutline.cpp - Outlining for Tapir -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements helper functions for outlining portions of code +// containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "outlining" + +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. +static bool definedInRegion(const SmallPtrSetImpl &Blocks, + Value *V) { + if (Instruction *I = dyn_cast(V)) + if (Blocks.count(I->getParent())) + return true; + return false; +} + +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +static bool definedInCaller(const SmallPtrSetImpl &Blocks, + Value *V) { + if (isa(V)) return true; + if (Instruction *I = dyn_cast(V)) + if (!Blocks.count(I->getParent())) + return true; + return false; +} + +void llvm::findInputsOutputs(const SmallPtrSetImpl &Blocks, + ValueSet &Inputs, + ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks) { + for (BasicBlock *BB : Blocks) { + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (Instruction &II : *BB) { + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + // The PHI nodes in each exit block will be updated after the exit block + // is cloned. Hence, we don't want to count their uses of values + // defined outside the region. + if (ExitBlocks->count(BB)) + if (PHINode *PN = dyn_cast(&II)) + if (!Blocks.count(PN->getIncomingBlock(*OI))) + continue; + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + } + + for (User *U : II.users()) + if (!definedInRegion(Blocks, U)) { + Outputs.insert(&II); + break; + } + } + } +} + +// Clone Blocks into NewFunc, transforming the old arguments into references to +// VMap values. +// +/// TODO: Fix the std::vector part of the type of this function. +void llvm::CloneIntoFunction(Function *NewFunc, const Function *OldFunc, + std::vector Blocks, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks, + DISubprogram *SP, + ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + // Get the predecessors of the exit blocks + SmallPtrSet ExitBlockPreds, ClonedEBPreds; + for (BasicBlock *EB : *ExitBlocks) + for (BasicBlock *Pred : predecessors(EB)) + ExitBlockPreds.insert(Pred); + + // When we remap instructions, we want to avoid duplicating inlined + // DISubprograms, so record all subprograms we find as we duplicate + // instructions and then freeze them in the MD map. + DebugInfoFinder DIFinder; + + // Loop over all of the basic blocks in the function, cloning them as + // appropriate. + for (const BasicBlock *BB : Blocks) { + // Record all exit block predecessors that are cloned. + if (ExitBlockPreds.count(BB)) + ClonedEBPreds.insert(BB); + + // Create a new basic block and copy instructions into it! + BasicBlock *CBB = CloneBasicBlock(BB, VMap, NameSuffix, NewFunc, CodeInfo, + SP ? 
&DIFinder : nullptr); + + // Add basic block mapping. + VMap[BB] = CBB; + + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. (This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when + // cloning a function.) + if (BB->hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), + const_cast(BB)); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + } + + // Note return instructions for the caller. + if (ReturnInst *RI = dyn_cast(CBB->getTerminator())) + Returns.push_back(RI); + } + + // For each exit block, clean up its phi nodes to exclude predecessors that + // were not cloned. + if (ExitBlocks) { + for (BasicBlock *EB : *ExitBlocks) { + // Get the predecessors of this exit block that were not cloned. + SmallVector PredNotCloned; + for (BasicBlock *Pred : predecessors(EB)) + if (!ClonedEBPreds.count(Pred)) + PredNotCloned.push_back(Pred); + + // Iterate over the phi nodes in the cloned exit block and remove incoming + // values from predecessors that were not cloned. + BasicBlock *ClonedEB = cast(VMap[EB]); + BasicBlock::iterator BI = ClonedEB->begin(); + while (PHINode *PN = dyn_cast(BI)) { + for (BasicBlock *DeadPred : PredNotCloned) + if (PN->getBasicBlockIndex(DeadPred) > -1) + PN->removeIncomingValue(DeadPred); + ++BI; + } + } + } + + // for (DISubprogram *ISP : DIFinder.subprograms()) { + // if (ISP != SP) { + // VMap.MD()[ISP].reset(ISP); + // } + // } + + // Loop over all of the instructions in the function, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (const BasicBlock *BB : Blocks) { + BasicBlock *CBB = cast(VMap[BB]); + // Loop over all instructions, fixing each one as we find it... + for (Instruction &II : *CBB) + RemapInstruction(&II, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + } +} + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// TODO: Fix the std::vector part of the type of this function. +Function *llvm::CreateHelper(const ValueSet &Inputs, + const ValueSet &Outputs, + std::vector Blocks, + BasicBlock *Header, + const BasicBlock *OldEntry, + const BasicBlock *OldExit, + ValueToValueMapTy &VMap, + Module *DestM, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks, + const Instruction *InputSyncRegion, + ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + DEBUG(dbgs() << "inputs: " << Inputs.size() << "\n"); + DEBUG(dbgs() << "outputs: " << Outputs.size() << "\n"); + + Function *OldFunc = Header->getParent(); + Type *RetTy = Type::getVoidTy(Header->getContext()); + + std::vector paramTy; + + // Add the types of the input values to the function's argument list + for (Value *value : Inputs) { + DEBUG(dbgs() << "value used in func: " << *value << "\n"); + paramTy.push_back(value->getType()); + } + + // Add the types of the output values to the function's argument list. 
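+  // Outputs are passed by address: an i32 produced in the region becomes an
+  // i32* parameter that the helper writes through, giving the
+  // f(in0, ..., inN, out0, ..., outN) signature described above.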
+ for (Value *output : Outputs) { + DEBUG(dbgs() << "instr used in func: " << *output << "\n"); + paramTy.push_back(PointerType::getUnqual(output->getType())); + } + + DEBUG({ + dbgs() << "Function type: " << *RetTy << " f("; + for (Type *i : paramTy) + dbgs() << *i << ", "; + dbgs() << ")\n"; + }); + + FunctionType *FTy = FunctionType::get(RetTy, paramTy, false); + + // Create the new function + Function *NewFunc = Function::Create(FTy, + GlobalValue::InternalLinkage, + OldFunc->getName() + "_" + + Header->getName() + NameSuffix, DestM); + + // Set names for input and output arguments. + Function::arg_iterator DestI = NewFunc->arg_begin(); + for (Value *I : Inputs) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + for (Value *I : Outputs) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + + // Copy all attributes other than those stored in the AttributeSet. We need + // to remap the parameter indices of the AttributeSet. + AttributeList NewAttrs = NewFunc->getAttributes(); + NewFunc->copyAttributesFrom(OldFunc); + NewFunc->setAttributes(NewAttrs); + + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + + SmallVector NewArgAttrs(NewFunc->arg_size()); + AttributeList OldAttrs = OldFunc->getAttributes(); + + // Clone any argument attributes + for (Argument &OldArg : OldFunc->args()) { + // Check if we're passing this argument to the helper. We check Inputs here + // instead of the VMap to avoid potentially populating the VMap with a null + // entry for the old argument. + if (Inputs.count(&OldArg) || Outputs.count(&OldArg)) { + Argument *NewArg = dyn_cast(VMap[&OldArg]); + NewArgAttrs[NewArg->getArgNo()] = + OldAttrs.getParamAttributes(OldArg.getArgNo()); + } + } + + // Ignore the return attributes of the old function. + NewFunc->setAttributes( + AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), + AttributeSet(), NewArgAttrs)); + + // Clone the metadata from the old function into the new. + bool MustCloneSP = + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + DISubprogram *SP = OldFunc->getSubprogram(); + if (SP) { + assert(!MustCloneSP || ModuleLevelChanges); + // Add mappings for some DebugInfo nodes that we don't want duplicated + // even if they're distinct. + auto &MD = VMap.MD(); + MD[SP->getUnit()].reset(SP->getUnit()); + MD[SP->getType()].reset(SP->getType()); + MD[SP->getFile()].reset(SP->getFile()); + // If we're not cloning into the same module, no need to clone the + // subprogram + if (!MustCloneSP) + MD[SP].reset(SP); + } + + SmallVector, 1> MDs; + OldFunc->getAllMetadata(MDs); + for (auto MD : MDs) { + NewFunc->addMetadata( + MD.first, + *MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + } + + // We assume that the Helper reads and writes its arguments. If the parent + // function had stronger attributes on memory access -- specifically, if the + // parent is marked as only reading memory -- we must replace this attribute + // with an appropriate weaker form. 
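+  // For example, outlining from a readonly parent must not produce a readonly
+  // helper, since the helper stores its results through the output pointers;
+  // the code below therefore drops readnone/readonly and keeps only the
+  // argument-memory-only property.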
+ if (OldFunc->onlyReadsMemory()) { + NewFunc->removeFnAttr(Attribute::ReadNone); + NewFunc->removeFnAttr(Attribute::ReadOnly); + NewFunc->setOnlyAccessesArgMemory(); + } + + // Inherit the calling convention from the parent. + NewFunc->setCallingConv(OldFunc->getCallingConv()); + + // The new function needs a root node because other nodes can branch to the + // head of the region, but the entry node of a function cannot have preds. + BasicBlock *NewEntry = BasicBlock::Create(Header->getContext(), + OldEntry->getName()+NameSuffix, + NewFunc); + // The new function also needs an exit node. + BasicBlock *NewExit = BasicBlock::Create(Header->getContext(), + OldExit->getName()+NameSuffix, + NewFunc); + + // Add mappings to the NewEntry and NewExit. + VMap[OldEntry] = NewEntry; + VMap[OldExit] = NewExit; + + // Create new sync region to replace the old one containing any cloned Tapir + // instructions, and add the appropriate mappings. + if (InputSyncRegion) { + Instruction *NewSR = InputSyncRegion->clone(); + if (InputSyncRegion->hasName()) + NewSR->setName(InputSyncRegion->getName()+NameSuffix); + NewEntry->getInstList().push_back(NewSR); + VMap[InputSyncRegion] = NewSR; + } + + // Clone Blocks into the new function. + CloneIntoFunction(NewFunc, OldFunc, Blocks, VMap, ModuleLevelChanges, + Returns, NameSuffix, ExitBlocks, SP, CodeInfo, + TypeMapper, Materializer); + + // Add a branch in the new function to the cloned Header. + BranchInst::Create(cast(VMap[Header]), NewEntry); + // Add a return in the new function. + ReturnInst::Create(Header->getContext(), NewExit); + + return NewFunc; +} + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. +void llvm::AddAlignmentAssumptions(const Function *Caller, + const ValueSet &Inputs, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, + DominatorTree *DT) { + auto &DL = Caller->getParent()->getDataLayout(); + for (Value *ArgVal : Inputs) { + // Ignore arguments to non-pointer types + if (!ArgVal->getType()->isPointerTy()) continue; + Argument *Arg = cast(VMap[ArgVal]); + // Ignore arguments to non-pointer types + if (!Arg->getType()->isPointerTy()) continue; + // If the argument already has an alignment attribute, skip it. + if (Arg->getParamAlignment()) continue; + // Get any known alignment information for this argument's value. + unsigned Align = getKnownAlignment(ArgVal, DL, CallSite, AC, DT); + // If we have alignment data, add it as an attribute to the outlined + // function's parameter. 
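+  // For example, if getKnownAlignment proves the caller's value is 16-byte
+  // aligned, the matching helper parameter gets an align 16 attribute, which
+  // later passes can rely on inside the outlined body.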
+ if (Align) + Arg->addAttr(Attribute::getWithAlignment(Arg->getContext(), Align)); + } +} diff --git a/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp new file mode 100644 index 00000000000000..8b9242b1424e4a --- /dev/null +++ b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp @@ -0,0 +1,87 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct RedundantSpawn : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantSpawn() : FunctionPass(ID) { + //initializeRedundantSpawnPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("RedundantSpawn_"+F.getName()); + + bool effective = false; + do { + effective = false; + TerminatorInst* prior = nullptr; + BasicBlock* start = nullptr; + bool lookForDetach = false; + int rank = 0; + for (BasicBlock &BB: F) { + if (isa(BB.getTerminator()) && BB.size() == 1) { + lookForDetach = true; + start = &BB; + effective = true; + break; + } + if (prior != nullptr && isa(prior)) + rank +=1; + if (prior != nullptr && isa(prior)) + rank -=1; + prior = BB.getTerminator(); + } + if (lookForDetach) { + BasicBlock* current = start; + int currentRank = rank; + while (true) { + for (BasicBlock *Pred : predecessors(current)) { + current = Pred; + break; + } + if (isa(current->getTerminator()) && currentRank == rank) { + BranchInst* replaceReattach = BranchInst::Create(start->getSingleSuccessor()); + BranchInst* replaceDetach = BranchInst::Create(current->getTerminator()->getSuccessor(0)); + ReplaceInstWithInst(start->getTerminator(), replaceReattach); + ReplaceInstWithInst(current->getTerminator(), replaceDetach); + break; + } + if (isa(current->getTerminator())) + currentRank -= 1; + if (isa(current->getTerminator())) + currentRank += 1; + } + } + } while (effective); + + return true; + } +}; +} + +char RedundantSpawn::ID = 0; +static RegisterPass X("redundantspawn", "Do RedundantSpawn pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createRedundantSpawnPass() { + return new RedundantSpawn(); +} diff --git a/llvm/lib/Transforms/Tapir/SmallBlock.cpp b/llvm/lib/Transforms/Tapir/SmallBlock.cpp new file mode 100644 index 00000000000000..c46e90baeb620a --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SmallBlock.cpp @@ -0,0 +1,68 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { +struct SmallBlock : public FunctionPass { + static const int threshold = 10; + static char ID; // Pass identification, replacement for typeid + SmallBlock() : FunctionPass(ID) { + //initializeSmallBlockPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + 
if (skipFunction(F)) + return false; + + F.setName("SmallBlock_"+F.getName()); + + BasicBlock* b = nullptr; + BasicBlock* prior = nullptr; + bool effective; + int count = 0; + do { + effective = false; + for (BasicBlock &BB: F) { + count += BB.size(); + if (isa(BB.getTerminator())) { + b = &BB; + count = 0; + } + if (isa(BB.getTerminator()) && count < threshold && prior != b) { + // b ensured to be the corresponding reattach + effective = true; + prior = b; + BranchInst* replaceReattach = BranchInst::Create(BB.getSingleSuccessor()); + BranchInst* replaceDetach = BranchInst::Create(b->getTerminator()->getSuccessor(0)); + ReplaceInstWithInst(BB.getTerminator(), replaceReattach); + ReplaceInstWithInst(b->getTerminator(), replaceDetach); + } + } + } while (effective); + + return true; + } +}; +} + +char SmallBlock::ID = 0; +static RegisterPass X("smallblock", "Do SmallBlock pass", false, false); + +// Public interface to the SmallBlock pass +FunctionPass *llvm::createSmallBlockPass() { + return new SmallBlock(); +} diff --git a/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp new file mode 100644 index 00000000000000..2b0b15ca1900a6 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp @@ -0,0 +1,48 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct SpawnRestructure : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SpawnRestructure() : FunctionPass(ID) { + //initializeSpawnRestructurePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("SpawnRestructure_"+F.getName()); + + for (BasicBlock &BB: F) { + + } + + return true; + } +}; +} + +char SpawnRestructure::ID = 0; +static RegisterPass X("spawnrestructure", "Do SpawnRestructure pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createSpawnRestructurePass() { + return new SpawnRestructure(); +} diff --git a/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp new file mode 100644 index 00000000000000..9206c90b987393 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp @@ -0,0 +1,96 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct SpawnUnswitch : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SpawnUnswitch() : FunctionPass(ID) { + //initializeSpawnUnswitchPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("SpawnUnswitch_"+F.getName()); + + + bool effective; + do { + effective = false; + BasicBlock* 
body = nullptr; + BasicBlock* end = nullptr; + + for (BasicBlock &BB: F) { + if (BB.size() == 1 && isa(BB.getTerminator())) { + end = BB.getSingleSuccessor(); + int count = 0; + for (BasicBlock *Pred : predecessors(&BB)) { + for (BasicBlock *PredPred : predecessors(Pred)) { + if (!isa(PredPred->getTerminator())) { + body = Pred; + } + } + count++; + } + if (count == 2) { // only predecessors are det.achd and if.then + for (BasicBlock *Pred : predecessors(&BB)) { + if (Pred->size() == 2 && isa(Pred->getTerminator())) { // if clause only compares register contents + Instruction* cmp = nullptr; + for (Instruction &I : *Pred) { + cmp = &I; + break; + } + for (BasicBlock *PredPred : predecessors(Pred)) { + if (DetachInst *DI = dyn_cast(PredPred->getTerminator())) { // outer spawn + Value *SyncRegion = DI->getSyncRegion(); + effective = true; + // move cmp instruction to outside spawn + Instruction *pi = PredPred->getTerminator(); + cmp->moveBefore(pi); + + // branch now to detach or end + TerminatorInst* temp = Pred->getTerminator(); + BranchInst* replaceDetach = BranchInst::Create(Pred, end, ((BranchInst*)temp)->getCondition()); + ReplaceInstWithInst(PredPred->getTerminator(), replaceDetach); + + // detach now goes straight to body + DetachInst* newDetach = DetachInst::Create(body, end, SyncRegion); + ReplaceInstWithInst(Pred->getTerminator(), newDetach); + } + } + } + } + } + } + } + } while (effective); + + return true; + } +}; +} + +char SpawnUnswitch::ID = 0; +static RegisterPass X("spawnunswitch", "Do SpawnUnswitch pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createSpawnUnswitchPass() { + return new SpawnUnswitch(); +} diff --git a/llvm/lib/Transforms/Tapir/SyncElimination.cpp b/llvm/lib/Transforms/Tapir/SyncElimination.cpp new file mode 100644 index 00000000000000..62301069348471 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SyncElimination.cpp @@ -0,0 +1,273 @@ +//===- SyncElimination.cpp - Eliminate unnecessary sync calls ----------------===// + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/CFG.h" +#include "llvm/ADT/SmallSet.h" + +#include +#include + +using namespace llvm; + +namespace { + +typedef SmallSet BasicBlockSet; +typedef std::deque BasicBlockDeque; + +struct SyncElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + + SyncElimination() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + errs() << "SyncElimination: Found function: " << F.getName() << "\n"; + + bool ChangedAny = false; + + while (true) { + bool Changed = false; + + for (BasicBlock &block: F) { + if (isa(block.getTerminator())) { + if (processSyncInstBlock(block)) { + Changed = true; + ChangedAny = true; + break; + } + } + } + + if (!Changed) { + break; + } + } + + return ChangedAny; + } + +private: + + // We will explain what Rosetta and Vegas are later. Or rename them. + // We promise. 
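+  // Informally: the Rosetta set is computed by walking backwards from the
+  // sync block while tracking detach nesting, and collects the blocks of
+  // detached work that this sync actually waits for. The Vegas set is
+  // computed by walking forwards from the sync block up to (and including)
+  // the next sync. The sync can be removed only if no instruction in the
+  // Rosetta set may conflict with one in the Vegas set (see
+  // isSyncEliminationLegal below).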
+ + // Rosetta-finding code + + void findRosetta(const BasicBlock &BB, BasicBlockSet &OutputSet) { + assert(isa(BB.getTerminator())); + + BasicBlockSet Visited; + BasicBlockDeque Frontier; + std::map DetachLevel; + + DetachLevel[&BB] = 0; + Frontier.push_back(&BB); + OutputSet.insert(&BB); + + while (!Frontier.empty()) { + const BasicBlock *Current = Frontier.front(); + Frontier.pop_front(); + + for (const BasicBlock *Pred: predecessors(Current)) { + // TODO@jiahao: Investigate potential issues with continue edges here. + + if (Visited.count(Pred) > 0) { + continue; + } + + if (isa(Pred->getTerminator())) { + continue; + } + + Visited.insert(Pred); + + DetachLevel[Pred] = DetachLevel[Current]; + + if (isa(Pred->getTerminator())) { + DetachLevel[Pred] ++; + } else if (isa(Pred->getTerminator())) { + DetachLevel[Pred] --; + } + + if (DetachLevel[Pred] > 0) { + OutputSet.insert(Pred); + } + + if (DetachLevel[Pred] >= 0) { + Frontier.push_back(Pred); + } + } + } + } + + // Vegas-finding code + // + // We run BFS starting from the sync block, following all foward edges, and stop a branch whenever + // we hit another sync block. + + void findVegas(const BasicBlock &BB, BasicBlockSet &OutputSet) { + assert(isa(BB.getTerminator())); + + BasicBlockSet Visited; + BasicBlockDeque Frontier; + + Frontier.push_back(&BB); + + while (!Frontier.empty()) { + const BasicBlock *Current = Frontier.front(); + Frontier.pop_front(); + + for (const BasicBlock *Succ: successors(Current)) { + if (Visited.count(Succ) > 0) { + continue; + } + + Visited.insert(Succ); + OutputSet.insert(Succ); + + // We need to include blocks whose terminator is another sync. + // Therefore we still insert the block into OutputSet in this case. + // However we do not search any further past the sync block. + if (!isa(Succ->getTerminator())) { + Frontier.push_back(Succ); + } + } + } + } + + bool willMod(const ModRefInfo &Info) { + return (Info == MRI_Mod || Info == MRI_ModRef); + } + + bool instTouchesMemory(const Instruction &Inst) { + return Inst.getOpcode() == Instruction::Load || + Inst.getOpcode() == Instruction::Store || + Inst.getOpcode() == Instruction::VAArg || + Inst.getOpcode() == Instruction::AtomicCmpXchg || + Inst.getOpcode() == Instruction::AtomicRMW; + } + + // FIXME: we can do better + void checkBlowUp(const Instruction &Inst) { + if (isa(Inst)) { + errs() << Inst << "\n"; + llvm_unreachable("BOOOOOOOOOOOOOOOOOOOOOOOOM! 
not supported (yet)"); + } + } + + bool isSyncEliminationLegal(const BasicBlockSet &RosettaSet, const BasicBlockSet &VegasSet) { + AliasAnalysis *AA = &getAnalysis().getAAResults(); + + for (const BasicBlock *RBB : RosettaSet) { + for (const Instruction &RI : *RBB) { + checkBlowUp(RI); + + if (RI.getOpcode() == Instruction::Sync) { + continue; + } + + for (const BasicBlock *VBB : VegasSet) { + for (const Instruction &VI : *VBB) { + checkBlowUp(VI); + + if (VI.getOpcode() == Instruction::Sync) { + continue; + } + + ImmutableCallSite RC(&RI), VC(&VI); + + if (!!RC) { + // If RI is a call/invoke + if (instTouchesMemory(VI) && + AA->getModRefInfo(const_cast(&VI), RC) != MRI_NoModRef) { + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } else if (!!VC) { + // If VI is a call/invoke + if (instTouchesMemory(RI) && + AA->getModRefInfo(const_cast(&RI), VC) != MRI_NoModRef) { + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } else { + if (!instTouchesMemory(VI) || !instTouchesMemory(RI)) { + continue; + } + + // If neither instruction is a call/invoke + MemoryLocation VML = MemoryLocation::get(&VI); + MemoryLocation RML = MemoryLocation::get(&RI); + + if (AA->alias(RML, VML) && (willMod(AA->getModRefInfo(&RI, RML)) || willMod(AA->getModRefInfo(&VI, VML)))) { + // If the two memory location can potentially be aliasing each other, and + // at least one instruction modifies its memory location. + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } + } + } + } + } + + return true; + } + + bool processSyncInstBlock(BasicBlock &BB) { + errs() << "SyncElimination: Found sync block: " << BB.getName() << "\n"; + + BasicBlockSet RosettaSet, VegasSet; + + findRosetta(BB, RosettaSet); + findVegas(BB, VegasSet); + + errs() << "SyncElimination: Blocks found in the Rosetta set: " << "\n"; + for (const BasicBlock *BB: RosettaSet) { + errs() << "SyncElimination: " + BB->getName() << "\n"; + } + + errs() << "SyncElimination: Blocks found in the Vegas set: " << "\n"; + for (const BasicBlock *BB: VegasSet) { + errs() << "SyncElimination: " + BB->getName() << "\n"; + } + + if (isSyncEliminationLegal(RosettaSet, VegasSet)) { + SyncInst *Sync = dyn_cast(BB.getTerminator()); + assert(Sync != NULL); + BasicBlock* suc = Sync->getSuccessor(0); + IRBuilder<> Builder(Sync); + Builder.CreateBr(suc); + Sync->eraseFromParent(); + errs() << "SyncElimination: A sync is removed. " << "\n"; + return true; + } + + return false; + } +}; + +} + +char SyncElimination::ID = 0; +static RegisterPass X("sync-elimination", "Do sync-elimination's pass", false, false); + +// Public interface to the SyncElimination pass +FunctionPass *llvm::createSyncEliminationPass() { + return new SyncElimination(); +} diff --git a/llvm/lib/Transforms/Tapir/Tapir.cpp b/llvm/lib/Transforms/Tapir/Tapir.cpp new file mode 100644 index 00000000000000..50813076c64b10 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Tapir.cpp @@ -0,0 +1,43 @@ +//===-- Tapir.cpp ---------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMTapirOpts.a, which +// implements several transformations over the Tapir/LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir.h" +#include "llvm-c/Initialization.h" +#include "llvm-c/Transforms/Tapir.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/IR/LegacyPassManager.h" + +using namespace llvm; + +/// initializeTapirOpts - Initialize all passes linked into the +/// TapirOpts library. +void llvm::initializeTapirOpts(PassRegistry &Registry) { + initializeLoopSpawningPass(Registry); + initializeLowerTapirToCilkPass(Registry); +} + +void LLVMInitializeTapirOpts(LLVMPassRegistryRef R) { + initializeTapirOpts(*unwrap(R)); +} + +void LLVMAddLoopSpawningPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopSpawningPass()); +} + +void LLVMAddLowerTapirToCilkPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerTapirToCilkPass()); +} diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 7da768252fc198..2402e4b99779c2 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -149,6 +149,18 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // Don't break unwinding instructions. if (PredBB->getTerminator()->isExceptionalTerminator()) return false; + // For now, don't break syncs. + // TODO: Don't break syncs unless they don't sync anything. + if (isa(PredBB->getTerminator())) return false; + // Don't break entry blocks of detached CFG's. + for (pred_iterator PI = pred_begin(PredBB), PE = pred_end(PredBB); + PI != PE; ++PI) { + BasicBlock *PredPredBB = *PI; + if (const DetachInst *DI = + dyn_cast(PredPredBB->getTerminator())) + if (DI->getDetached() == PredBB) + return false; + } // Can't merge if there are multiple distinct successors. if (PredBB->getUniqueSuccessor() != BB) @@ -301,7 +313,18 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && "Should have a single succ!"); - return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU); + // return SplitBlock(BB, BB->getTerminator(), DT, LI); + BasicBlock *NewBB = SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU); + if (SyncInst *OldSI = dyn_cast(NewBB->getTerminator())) { + // Make sure the original BB is terminated by the sync. 
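+    // SplitBlock left the sync at the end of NewBB; rewrite this so that BB
+    // ends in a sync targeting NewBB and NewBB ends in an unconditional
+    // branch to Succ, keeping the sync on the original side of the split
+    // edge.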
+ SyncInst *SI = SyncInst::Create(NewBB, OldSI->getSyncRegion(), + BB->getTerminator()); + BranchInst::Create(Succ, OldSI); + SI->setDebugLoc(OldSI->getDebugLoc()); + BB->getTerminator()->eraseFromParent(); + OldSI->eraseFromParent(); + } + return NewBB; } unsigned diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index fafc9aaba5c9cc..befb2ed13587e9 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -137,10 +137,27 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, assert(!isa(TI) && "Cannot split critical edge from IndirectBrInst"); + assert(!isa(TI) && + "Cannot split critical edge from ReattachInst"); + + bool SplittingDetachContinue = isa(TI) && (1 == SuccNum); + if (SplittingDetachContinue) + assert((Options.SplitDetachContinue && Options.DT) && + "Cannot split critical continuation edge from a detach"); BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); + // If we're splitting a detach-continue edge, get the associated reattaches. + SmallVector Reattaches; + if (SplittingDetachContinue) { + BasicBlockEdge DetachEdge(TIBB, TI->getSuccessor(0)); + for (BasicBlock *Pred : predecessors(DestBB)) + if (isa(Pred->getTerminator())) + if (Options.DT->dominates(DetachEdge, Pred)) + Reattaches.push_back(Pred); + } + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. if (DestBB->isEHPad()) return nullptr; @@ -155,6 +172,12 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, // Branch to the new block, breaking the edge. TI->setSuccessor(SuccNum, NewBB); + // If we're splitting a detach-continue edge, redirect all appropriate + // reattach edges to branch to the new block + if (SplittingDetachContinue) + for (BasicBlock *RBB : Reattaches) + RBB->getTerminator()->setSuccessor(0, NewBB); + // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); Function::iterator FBBI = TIBB->getIterator(); @@ -179,6 +202,28 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, BBIdx = PN->getBasicBlockIndex(TIBB); PN->setIncomingBlock(BBIdx, NewBB); } + + // Update the PHI node entries for the reattach predecessors as well. + if (SplittingDetachContinue) { + for (BasicBlock *RBB : Reattaches) { + unsigned BBIdx = 0; + for (BasicBlock::iterator I = DestBB->begin(); isa(I); ++I) { + // We no longer enter through RBB, now we come in through NewBB. + // Revector exactly one entry in the PHI node that used to come from + // TIBB to come from NewBB. + PHINode *PN = cast(I); + + // Reuse the previous value of BBIdx if it lines up. In cases where we + // have multiple phi nodes with *lots* of predecessors, this is a speed + // win because we don't have to scan the PHI looking for TIBB. This + // happens because the BB list of PHI nodes are usually in the same + // order. 
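+        // Unlike the TIBB entry, which was revectored to NewBB above, each
+        // reattach predecessor's entry is dropped outright: those edges now
+        // reach DestBB through NewBB and use its PHI entry.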
+ if (PN->getIncomingBlock(BBIdx) != RBB) + BBIdx = PN->getBasicBlockIndex(RBB); + PN->removeIncomingValue(BBIdx); + } + } + } } // If there are any other edges from TIBB to DestBB, update those to go diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index cb3dc17c03ad8d..e89b1d3c221cc2 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_library(LLVMTransformUtils SplitModule.cpp StripNonLineTableDebugInfo.cpp SymbolRewriter.cpp + TapirUtils.cpp UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 623fe91a5a6094..42ad327ab195e9 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -62,6 +62,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -1623,6 +1625,18 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, !isa(CallSiteUnwindDestToken); } + // Get the entry block of the detached context into which we're inlining. If + // we move allocas from the inlined code, we must move them to this block. + BasicBlock *DetachedCtxEntryBlock; + { + BasicBlock *CallingBlock = TheCall->getParent(); + DetachedCtxEntryBlock = GetDetachedCtx(CallingBlock); + assert(((&(CallingBlock->getParent()->getEntryBlock()) == + DetachedCtxEntryBlock) || + DetachedCtxEntryBlock->getSinglePredecessor()) && + "Entry block of detached context has multiple predecessors."); + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. Function::iterator LastBlock = --Caller->end(); @@ -1781,7 +1795,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // calculate which instruction they should be inserted before. We insert the // instructions at the end of the current alloca list. { - BasicBlock::iterator InsertPoint = Caller->begin()->begin(); + // BasicBlock::iterator InsertPoint = Caller->begin()->begin(); + BasicBlock::iterator InsertPoint = DetachedCtxEntryBlock->begin(); for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast(I++); @@ -1811,7 +1826,9 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - Caller->getEntryBlock().getInstList().splice( + // Caller->getEntryBlock().getInstList().splice( + // InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); + DetachedCtxEntryBlock->getInstList().splice( InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. @@ -1819,6 +1836,23 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, for (auto &AI : IFI.StaticAllocas) replaceDbgDeclareForAlloca(AI, AI, DIB, DIExpression::NoDeref, 0, DIExpression::NoDeref); + + // Move any syncregion_start's into the entry basic block. 
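+    // Like the static allocas above, @llvm.syncregion.start calls inlined
+    // from the callee are hoisted into the entry block of the enclosing
+    // detached context, so every cloned detach and sync keeps using a
+    // sync-region token that dominates it.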
+ for (BasicBlock::iterator I = FirstNewBlock->begin(), + E = FirstNewBlock->end(); I != E; ) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + DetachedCtxEntryBlock->getInstList().splice( + InsertPoint, FirstNewBlock->getInstList(), II->getIterator(), I); + } } SmallVector VarArgsToForward; @@ -2224,6 +2258,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; BranchInst *CreatedBranchToNormalDest = nullptr; + if (InvokeInst *II = dyn_cast(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 380f4fca54d9ed..a9ac90d6e391da 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -522,6 +522,12 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl &Worklist, if (Preheader) Changed = true; } + // Ensure that the preheader is not terminated by a sync. + if (Preheader && isa(Preheader->getTerminator())) { + DEBUG(dbgs() << "LoopSimplify: Splitting sync-terminated preheader.\n"); + SplitEdge(Preheader, L->getHeader(), DT, LI); + Preheader = L->getLoopPreheader(); + } // Next, check to make sure that all exit nodes of the loop only have // predecessors that are inside of the loop. This check guarantees that the diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index da7ed2bd165268..f3feb40ac97e08 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -154,6 +154,15 @@ BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, return OnlyPred; } +//! Identify if a loop could be a cilk for loop and thus diasble unrolling +bool isCilkFor(Loop* L) { + //TODO use a more precise detection of cilk for loops + for (BasicBlock* BB : L->blocks()) + if (dyn_cast(BB->getTerminator())) + return true; + return false; +} + /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. @@ -411,6 +420,7 @@ LoopUnrollResult llvm::UnrollLoop( // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + if (isCilkFor(L) && !CompletelyUnroll) return false; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); std::vector OriginalLoopBlocks = L->getBlocks(); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 112e80d27e345d..240e92b81d1873 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -613,6 +613,67 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, } } +/// Returns true if the instruction in a loop is guaranteed to execute at least +/// once. +bool llvm::isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo) { + // We have to check to make sure that the instruction dominates all + // of the exit blocks. If it doesn't, then there is a path out of the loop + // which does not execute this instruction, so we can't hoist it. 
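+  // In Tapir CFGs there is one extra case, handled below: an instruction in a
+  // detached sub-CFG is still guaranteed to execute if it dominates a
+  // reattach whose continuation dominates every exit block of the loop.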
+ + // If the instruction is in the header block for the loop (which is very + // common), it is always guaranteed to dominate the exit blocks. Since this + // is a common case, and can save some work, check it now. + if (Inst.getParent() == CurLoop->getHeader()) + // If there's a throw in the header block, we can't guarantee we'll reach + // Inst. + return !SafetyInfo->HeaderMayThrow; + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (SafetyInfo->MayThrow) + return false; + + // Get the exit blocks for the current loop. + SmallVector ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + + // Verify that the block dominates each of the exit blocks of the loop. + for (unsigned i=0,e=ExitBlocks.size(); idominates(Inst.getParent(), ExitBlocks[i])) { + bool valid = false; + for( BasicBlock* b : CurLoop->getBlocks() ) { + if( auto RE = dyn_cast(b->getTerminator()) ) { + if( b == Inst.getParent() || DT->dominates(Inst.getParent(), b) ) { + bool tv = true; + for(unsigned i2=0; i2!=e; ++i2){ + if( !DT->dominates( RE->getSuccessor(0), ExitBlocks[i2] ) ) { + tv = false; break; + } + } + if( tv ) { + valid = true; + break; + } + } + } + } + if (valid) continue; + return false; + } + + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + + // FIXME: In general, we have to prove that the loop isn't an infinite loop. + // See http::llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is + // just a special case of this.) + return true; +} + Optional llvm::getLoopEstimatedTripCount(Loop *L) { // Only support loops with a unique exiting block, and a latch. if (!L->getExitingBlock()) diff --git a/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 23145e5847512a..269d9a18d12efa 100644 --- a/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -35,18 +35,33 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, AssumptionCache &AC) { std::vector Allocas; - BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. + SmallVector EntryBlocks; + bool FunctionContainsDetach = false; + EntryBlocks.push_back(&F.getEntryBlock()); + for (BasicBlock &BB : F) + if (BasicBlock *Pred = BB.getUniquePredecessor()) + if (DetachInst *DI = dyn_cast(Pred->getTerminator())) { + FunctionContainsDetach = true; + if (DI->getDetached() == &BB) + EntryBlocks.push_back(&BB); + } + while (true) { Allocas.clear(); // Find allocas that are safe to promote, by looking at all instructions in // the entry node - for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) - Allocas.push_back(AI); + for (BasicBlock *BB : EntryBlocks) + for (BasicBlock::iterator I = BB->begin(), E = --BB->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? 
+ if (isAllocaPromotable(AI) && + (!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT))) + Allocas.push_back(AI); if (Allocas.empty()) break; diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index ae5e72ea4d30f3..87aafa83ecfcab 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -127,6 +127,24 @@ void llvm::appendToCompilerUsed(Module &M, ArrayRef Values) { appendToUsedList(M, "llvm.compiler.used", Values); } +Function *llvm::checkCsiInterfaceFunction(Constant *FuncOrBitcast) { + if (Function *F = dyn_cast(FuncOrBitcast)) { + return F; + } + if (ConstantExpr *CE = dyn_cast(FuncOrBitcast)) { + if (CE->isCast() && CE->getOpcode() == Instruction::BitCast) { + if (Function *F = dyn_cast(CE->getOperand(0))) { + return F; + } + } + } + FuncOrBitcast->print(errs()); + std::string Err; + raw_string_ostream Stream(Err); + Stream << "ComprehensiveStaticInstrumentation interface function redefined: " << *FuncOrBitcast; + report_fatal_error(Err); +} + Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { if (isa(FuncOrBitcast)) return cast(FuncOrBitcast); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 91e4f4254b3e76..7e87fce8edf218 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -61,6 +61,7 @@ STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block"); STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); +STATISTIC(NumAllocaWithDetachedUses, "Number of alloca's with detached uses"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { // FIXME: If the memory unit is of pointer or integer type, we can permit @@ -143,13 +144,12 @@ struct AllocaInfo { DefiningBlocks.push_back(SI->getParent()); AllocaPointerVal = SI->getOperand(0); OnlyStore = SI; - } else { - LoadInst *LI = cast(User); + } else if (LoadInst *LI = dyn_cast(User)) { // Otherwise it must be a load instruction, keep track of variable // reads. UsingBlocks.push_back(LI->getParent()); AllocaPointerVal = LI; - } + } else continue; if (OnlyUsedInOneBlock) { if (!OnlyBlock) @@ -556,10 +556,18 @@ void PromoteMem2Reg::run() { LargeBlockInfo LBI; ForwardIDFCalculator IDF(DT); + bool FunctionContainsDetach = false; + { + for (BasicBlock &BB : F) + FunctionContainsDetach |= isa(BB.getTerminator()); + } + for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!"); + assert((!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT)) && + "Cannot promote non-promotable alloca in function with detach!"); assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); @@ -607,17 +615,8 @@ void PromoteMem2Reg::run() { BBNumbers[&BB] = ID++; } - // Remember the dbg.declare intrinsic describing this alloca, if any. - if (!Info.DbgDeclares.empty()) - AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares; - - // Keep the reverse mapping of the 'Allocas' array for the rename pass. 
- AllocaLookup[Allocas[AllocaNum]] = AllocaNum; - - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need PHI - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. // Unique the set of defining blocks for efficient lookup. SmallPtrSet DefBlocks; @@ -628,14 +627,44 @@ void PromoteMem2Reg::run() { SmallPtrSet LiveInBlocks; ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need phi - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. IDF.setLiveInBlocks(LiveInBlocks); IDF.setDefiningBlocks(DefBlocks); SmallVector PHIBlocks; IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached + // predecessor. Because register state is not preserved across a reattach, + // these alloca's cannot be promoted. + bool DetachedPred = false; + for (unsigned i = 0, e = PHIBlocks.size(); i != e && !DetachedPred; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E && !DetachedPred; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) { + DEBUG(dbgs() << "Alloca " << *AI << " has use reattached from " << + P->getName() << "\n"); + DetachedPred = true; + } + } + } + if (DetachedPred) { + RemoveFromAllocasList(AllocaNum); + ++NumAllocaWithDetachedUses; + continue; + } + + // Remember the dbg.declare intrinsic describing this alloca, if any. + if (!Info.DbgDeclares.empty()) + AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares; + + // Keep the reverse mapping of the 'Allocas' array for the rename pass. + AllocaLookup[Allocas[AllocaNum]] = AllocaNum; + + // At this point, we're committed to promoting the alloca using IDF's, and + // the standard SSA construction algorithm. if (PHIBlocks.size() > 1) llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) { return BBNumbers.lookup(A) < BBNumbers.lookup(B); @@ -791,7 +820,7 @@ void PromoteMem2Reg::run() { /// These are blocks which lead to uses. Knowing this allows us to avoid /// inserting PHI nodes into blocks which don't lead to uses (thus, the /// inserted phi nodes would be dead). -void PromoteMem2Reg::ComputeLiveInBlocks( +static void ExternComputeLiveInBlocks( AllocaInst *AI, AllocaInfo &Info, const SmallPtrSetImpl &DefBlocks, SmallPtrSetImpl &LiveInBlocks) { @@ -860,6 +889,62 @@ void PromoteMem2Reg::ComputeLiveInBlocks( } } +void PromoteMem2Reg::ComputeLiveInBlocks( + AllocaInst *AI, AllocaInfo &Info, + const SmallPtrSetImpl &DefBlocks, + SmallPtrSetImpl &LiveInBlocks) { + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); +} + +// \brief Augmentation is isAllocaPromotable to handle detach and reattach. +// +// TODO: Replace the implementation of this method to use an analysis of +// parallel regions. +bool llvm::isAllocaParallelPromotable(const AllocaInst *AIP, + DominatorTree &DT) { + AllocaInst* AI = const_cast(AIP); + AllocaInfo Info; + LargeBlockInfo LBI; + ForwardIDFCalculator IDF(DT); + + // Calculate the set of read and write-locations for each alloca. 
This is + // analogous to finding the 'uses' and 'definitions' of each variable. + Info.AnalyzeAlloca(AI); + + if (Info.OnlyUsedInOneBlock) return true; + + // Unique the set of defining blocks for efficient lookup. + SmallPtrSet DefBlocks; + DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); + + // Determine which blocks the value is live in. These are blocks which lead + // to uses. + SmallPtrSet LiveInBlocks; + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. + IDF.setLiveInBlocks(LiveInBlocks); + IDF.setDefiningBlocks(DefBlocks); + SmallVector PHIBlocks; + IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached predecessor. + // Because register state is not preserved across a reattach, these alloca's + // cannot be promoted. + for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) + return false; + } + } + + return true; +} + /// Queue a phi-node to be added to a basic-block for a specific Alloca. /// /// Returns true if there wasn't already a phi-node for that variable diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 9e5fb0e7172d4d..c9dced38c694f2 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -44,11 +44,18 @@ static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast(AV); } +typedef DenseMap ValIsDetachedTy; +static ValIsDetachedTy &getValIsDetached(void *VID) { + return *static_cast(VID); +} + SSAUpdater::SSAUpdater(SmallVectorImpl *NewPHI) : InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast(AV); + if (VID) + delete static_cast(VID); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { @@ -56,6 +63,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); + if (!VID) + VID = new ValIsDetachedTy(); + else + getValIsDetached(VID).clear(); ProtoType = Ty; ProtoName = Name; } @@ -107,6 +118,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // predecessor. SmallVector, 8> PredValues; Value *SingularValue = nullptr; + BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow. If we already have PHI nodes in this block, walk one @@ -115,6 +127,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = SomePhi->getIncomingBlock(i); Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. 
@@ -128,6 +146,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *PredBB = *PI; Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -138,6 +162,18 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = nullptr; } } + // Record any values we discover whose definitions occur in detached blocks. + if (ReattachPred) { + assert(DetachPred && + "Reattached predecessor of a block with no detached predecessor."); + Value *DetachVal = GetValueAtEndOfBlock(DetachPred); + PredValues.push_back(std::make_pair(ReattachPred, DetachVal)); + Value *ReattachVal = GetValueAtEndOfBlock(ReattachPred); + if (ReattachVal != DetachVal) { + SingularValue = nullptr; + getValIsDetached(VID)[BB] = true; + } + } // If there are no predecessors, just return undef. if (PredValues.empty()) @@ -187,6 +223,10 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return InsertedPHI; } +bool SSAUpdater::GetValueIsDetachedInBlock(BasicBlock *BB) { + return getValIsDetached(VID)[BB]; +} + void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast(U.getUser()); @@ -274,6 +314,18 @@ class SSAUpdaterTraits { return UndefValue::get(Updater->ProtoType); } + /// BlockReattaches - Return true if this block is terminated with a + /// reattach, false otherwise. + static bool BlockReattaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + + /// BlockDetaches - Return true if this block is terminated with a + /// detach, false otherwise. + static bool BlockDetaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + /// CreateEmptyPHI - Create a new PHI instruction in the specified block. /// Reserve space for the operands but do not fill them in yet. static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds, @@ -326,7 +378,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { if (Value *V = AvailableVals[BB]) return V; - SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs); + SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs, + &getValIsDetached(VID)); return Impl.GetValue(BB); } @@ -448,7 +501,14 @@ run(const SmallVectorImpl &Insts) const { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. for (LoadInst *ALoad : LiveInLoads) { - Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); + BasicBlock *BB = ALoad->getParent(); + Value *NewVal = SSA.GetValueInMiddleOfBlock(BB); + + // Skip loads whose definitions are detached. + if (Instruction *Def = dyn_cast(NewVal)) + if (SSA.GetValueIsDetachedInBlock(Def->getParent())) + continue; + replaceLoadWithValue(ALoad, NewVal); // Avoid assertions in unreachable code. @@ -463,6 +523,8 @@ run(const SmallVectorImpl &Insts) const { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (isa(User) && !User->use_empty()) continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later).
In this case, we need to recursively diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 03b73954321d86..7480b94e34ab61 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -66,6 +66,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -5751,6 +5753,14 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, return false; } +static bool BlockIsEntryOfDetachedCtx(const BasicBlock *BB) { + if (const BasicBlock *PredBB = BB->getSinglePredecessor()) + if (const DetachInst *DI = dyn_cast(PredBB->getTerminator())) + if (DI->getDetached() == BB) + return true; + return false; +} + bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -5769,6 +5779,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + !BlockIsEntryOfDetachedCtx(BB) && !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -5993,6 +6004,139 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { return false; } +/// If BB immediately syncs and BB's predecessor detaches, serialize +/// the sync and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Return +/// false if BB does not begin with a sync. +static bool serializeDetachToImmediateSync(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHIOrDbgOrLifetime(); + if (isa(I)) { + // This block is empty + bool Changed = false; + // Collect the detach and reattach predecessors. + SmallSet DetachPreds; + SmallVector ReattachPreds; + for (BasicBlock *PredBB : predecessors(BB)) { + if (DetachInst *DI = dyn_cast(PredBB->getTerminator())) + DetachPreds.insert(DI); + + if (ReattachInst *RI = dyn_cast(PredBB->getTerminator())) + ReattachPreds.push_back(RI); + } + Value *SyncRegion = cast(I)->getSyncRegion(); + for (DetachInst *DI : DetachPreds) { + BasicBlock *Detached = DI->getDetached(); + + // Replace the detach with a branch to the detached block. + BB->removePredecessor(DI->getParent()); + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + + // Move static alloca instructions in the detached block to the + // appropriate entry block. + MoveStaticAllocasInBlock(cast(SyncRegion)->getParent(), + Detached, ReattachPreds); + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because we're not introducing new allocas into a loop. + Changed = true; + } + for (Instruction *RI : ReattachPreds) { + // Replace the reattach with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(BB)); + Changed = true; + } + return Changed; + } + return false; +} + +/// If BB immediately reattaches and BB's predecessor detaches, +/// serialize the reattach and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Return +/// false if BB does not immediately reattach, or if some predecessor of +/// BB does not terminate with a detach.
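+/// For example, a detached block whose only non-PHI instruction is a reattach +/// back to the continuation performs no work, so replacing its detach and +/// reattach with unconditional branches leaves equivalent serial control flow.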
+static bool serializeTrivialDetachedBlock(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHI(); + if (ReattachInst *RI = dyn_cast(I)) { + // This detached block is empty + // Scan predecessors to verify that all of them detach BB. + for (BasicBlock *PredBB : predecessors(BB)) { + if (!isa(PredBB->getTerminator())) + return false; + } + // All predecessors detach BB, so we can serialize + for (BasicBlock *PredBB : predecessors(BB)) { + DetachInst *DI = dyn_cast(PredBB->getTerminator()); + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continue = DI->getContinue(); + assert(RI->getSuccessor(0) == Continue && + "Reattach destination does not match continue block of associated detach."); + // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(PredBB); + // Serialize the detach: replace it with an unconditional branch. + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + } + // Serialize the reattach: replace it with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(RI->getSuccessor(0))); + return true; + } + return false; +} + +/// If BB detaches an CFG that cannot reach the continuation, serialize the +/// detach. Assuming the CFG is valid, this scenario arises when the detached +/// CFG is terminated by unreachable instructions. +static bool serializeDetachOfUnreachable(BasicBlock *BB) { + // This method assumes that the detached CFG is valid. + Instruction *I = BB->getTerminator(); + if (DetachInst *DI = dyn_cast(I)) { + // Check if continuation of the detach is not reached by reattach + // instructions. If the detached CFG is valid, then the detached CFG must + // be terminated by unreachable instructions. + BasicBlock *Continue = DI->getContinue(); + for (BasicBlock *PredBB : predecessors(Continue)) + if (isa(PredBB->getTerminator())) + return false; + // TODO: Add stronger checks to make sure the detached CFG is valid. + // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(BB); + // Replace the detach with a branch to the detached block. + ReplaceInstWithInst(DI, BranchInst::Create(DI->getDetached())); + return true; + } + return false; +} + +// Remove any syncs whose sync region is empty, meaning that the region contains +// no detach instructions. These sync instructions don't synchronize anything, +// so they can be removed. +static bool removeEmptySyncs(BasicBlock *BB) { + if (SyncInst *SI = dyn_cast(BB->getTerminator())) { + // Get the sync region containing this sync + Value *SyncRegion = SI->getSyncRegion(); + bool SyncRegionIsEmpty = true; + SmallVector Syncs; + // Scan the Tapir instructions in this sync region. + for (User *U : SyncRegion->users()) { + // If the sync region contains a detach or a reattach, then it's not + // empty. + if (isa(U) || isa(U)) + SyncRegionIsEmpty = false; + // Collect the syncs in this region. + else if (isa(U)) + Syncs.push_back(cast(U)); + } + // If the sync region is empty, then remove all sync instructions in it. + if (SyncRegionIsEmpty) { + for (SyncInst *Sync : Syncs) + ReplaceInstWithInst(Sync, BranchInst::Create(Sync->getSuccessor(0))); + return true; + } + } + return false; +} + bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { bool Changed = false; @@ -6018,6 +6162,14 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { // Check for and remove branches that will always cause undefined behavior. Changed |= removeUndefIntroducingPredecessor(BB); + // Check for and remove trivial detached blocks. 
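+ // Each of these helpers rewrites detach, reattach, or sync terminators into + // plain branches when the parallelism they express is vacuous, and each + // returns true only if it changed the CFG.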
+ Changed |= serializeTrivialDetachedBlock(BB); + Changed |= serializeDetachToImmediateSync(BB); + Changed |= serializeDetachOfUnreachable(BB); + + // Check for and remove sync instructions in empty sync regions. + Changed |= removeEmptySyncs(BB); + // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. diff --git a/llvm/lib/Transforms/Utils/TapirUtils.cpp b/llvm/lib/Transforms/Utils/TapirUtils.cpp new file mode 100644 index 00000000000000..cba2f39411076d --- /dev/null +++ b/llvm/lib/Transforms/Utils/TapirUtils.cpp @@ -0,0 +1,318 @@ +//===-- TapirUtils.cpp - Utility methods for Tapir -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file utility methods for handling code containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapirutils" + +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +/// (Borrowed from Transforms/Utils/InlineFunction.cpp) +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) { + return isa(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + +// Check whether this Value is used by a lifetime intrinsic. +static bool isUsedByLifetimeMarker(Value *V) { + for (User *U : V->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + } + return false; +} + +// Check whether the given alloca already has +// lifetime.start or lifetime.end intrinsics. +static bool hasLifetimeMarkers(AllocaInst *AI) { + Type *Ty = AI->getType(); + Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), + Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) + return isUsedByLifetimeMarker(AI); + + // Do a scan to find all the casts to i8*. + for (User *U : AI->users()) { + if (U->getType() != Int8PtrTy) continue; + if (U->stripPointerCasts() != AI) continue; + if (isUsedByLifetimeMarker(U)) + return true; + } + return false; +} + +// Move static allocas in a cloned block into the entry block of helper. Leave +// lifetime markers behind for those static allocas. Returns true if the cloned +// block still contains dynamic allocas, which cannot be moved. +bool llvm::MoveStaticAllocasInBlock( + BasicBlock *Entry, + BasicBlock *Block, + SmallVectorImpl &ExitPoints) { + Function *F = Entry->getParent(); + SmallVector StaticAllocas; + bool ContainsDynamicAllocas = false; + BasicBlock::iterator InsertPoint = Entry->begin(); + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + AllocaInst *AI = dyn_cast(I++); + if (!AI) continue; + + if (!allocaWouldBeStaticInEntry(AI)) { + ContainsDynamicAllocas = true; + continue; + } + + StaticAllocas.push_back(AI); + + // Scan for the block of allocas that we can move over, and move them + // all at once. 
+ while (isa(I) && + allocaWouldBeStaticInEntry(cast(I))) { + StaticAllocas.push_back(cast(I)); + ++I; + } + + // Transfer all of the allocas over in a block. Using splice means + // that the instructions aren't removed from the symbol table, then + // reinserted. + Entry->getInstList().splice( + InsertPoint, Block->getInstList(), AI->getIterator(), I); + } + // Move any dbg.declares describing the allocas into the entry basic block. + DIBuilder DIB(*F->getParent()); + for (auto &AI : StaticAllocas) + replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); + + // Move any syncregion_start's into the entry basic block. + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + Entry->getInstList().splice( + InsertPoint, Block->getInstList(), II->getIterator(), I); + } + + // Leave lifetime markers for the static alloca's, scoping them to the + // from cloned block to cloned exit. + if (!StaticAllocas.empty()) { + IRBuilder<> Builder(&Block->front()); + for (unsigned ai = 0, ae = StaticAllocas.size(); ai != ae; ++ai) { + AllocaInst *AI = StaticAllocas[ai]; + // Don't mark swifterror allocas. They can't have bitcast uses. + if (AI->isSwiftError()) + continue; + + // If the alloca is already scoped to something smaller than the whole + // function then there's no need to add redundant, less accurate markers. + if (hasLifetimeMarkers(AI)) + continue; + + // Try to determine the size of the allocation. + ConstantInt *AllocaSize = nullptr; + if (ConstantInt *AIArraySize = + dyn_cast(AI->getArraySize())) { + auto &DL = F->getParent()->getDataLayout(); + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + + // Don't add markers for zero-sized allocas. + if (AllocaArraySize == 0) + continue; + + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size. + if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); + } + } + + Builder.CreateLifetimeStart(AI, AllocaSize); + for (Instruction *ExitPoint : ExitPoints) { + IRBuilder<>(ExitPoint).CreateLifetimeEnd(AI, AllocaSize); + } + } + } + + return ContainsDynamicAllocas; +} + + +/// SerializeDetachedCFG - Serialize the sub-CFG detached by the +/// specified detach instruction. Removes the detach instruction and +/// returns a pointer to the branch instruction that replaces it. +/// +BranchInst *llvm::SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT) { + // Get the parent of the detach instruction. + BasicBlock *Detacher = DI->getParent(); + // Get the detached block and continuation of this detach. + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continuation = DI->getContinue(); + + assert(Detached->getSinglePredecessor() && + "Detached block has multiple predecessors."); + + // Get the detach edge from DI. + BasicBlockEdge DetachEdge(Detacher, Detached); + + // Collect the reattaches into the continuation. If DT is + // available, verify that all reattaches are dominated by the detach + // edge from DI. 
+ SmallVector Reattaches; + // If we only find a single reattach into the continuation, capture + // it so we can later update the dominator tree. + BasicBlock *SingleReattacher = nullptr; + int ReattachesFound = 0; + for (auto PI = pred_begin(Continuation), PE = pred_end(Continuation); + PI != PE; PI++) { + BasicBlock *Pred = *PI; + // Skip the detacher. + if (Detacher == Pred) continue; + // Record the reattaches found. + if (isa(Pred->getTerminator())) { + ReattachesFound++; + if (!SingleReattacher) + SingleReattacher = Pred; + if (DT) { + assert(DT->dominates(DetachEdge, Pred) && + "Detach edge does not dominate a reattach into its continuation."); + } + Reattaches.push_back(cast(Pred->getTerminator())); + } + } + // TODO: It's possible to detach a CFG that does not terminate with a + // reattach. For example, optimizations can create detached CFG's that are + // terminated by unreachable terminators only. Some of these special cases + // lead to problems with other passes, however, and this check will identify + // those special cases early while we sort out those issues. + assert(!Reattaches.empty() && "No reattach found for detach."); + + // Replace each reattach with branches to the continuation. + for (ReattachInst *RI : Reattaches) { + BranchInst *ReplacementBr = BranchInst::Create(Continuation, RI); + ReplacementBr->setDebugLoc(RI->getDebugLoc()); + RI->eraseFromParent(); + } + + // Replace the new detach with a branch to the detached CFG. + BranchInst *ReplacementBr = BranchInst::Create(Detached, DI); + ReplacementBr->setDebugLoc(DI->getDebugLoc()); + DI->eraseFromParent(); + + // Update the dominator tree. + if (DT) + if (DT->dominates(Detacher, Continuation) && 1 == ReattachesFound) + DT->changeImmediateDominator(Continuation, SingleReattacher); + + return ReplacementBr; +} + +/// GetDetachedCtx - Get the entry basic block to the detached context +/// that contains the specified block. +/// +BasicBlock *llvm::GetDetachedCtx(BasicBlock *BB) { + return const_cast( + GetDetachedCtx(const_cast(BB))); +} + +const BasicBlock *llvm::GetDetachedCtx(const BasicBlock *BB) { + // Traverse the CFG backwards until we either reach the entry block + // of the function or we find a detach instruction that detaches the + // current block. + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(BB); + while (!WorkList.empty()) { + const BasicBlock *CurrBB = WorkList.pop_back_val(); + if (!Visited.insert(CurrBB).second) + continue; + + for (auto PI = pred_begin(CurrBB), PE = pred_end(CurrBB); + PI != PE; ++PI) { + const BasicBlock *PredBB = *PI; + + // Skip predecessors via reattach instructions. The detacher + // block corresponding to this reattach is also a predecessor of + // the current basic block. + if (isa(PredBB->getTerminator())) + continue; + + // If the predecessor is terminated by a detach, check to see if + // that detach detached the current basic block. + if (isa(PredBB->getTerminator())) { + const DetachInst *DI = cast(PredBB->getTerminator()); + if (DI->getDetached() == CurrBB) + // Return the current block, which is the entry of this detached + // sub-CFG. + return CurrBB; + } + + // Otherwise, add the predecessor block to the work list to + // search. + WorkList.push_back(PredBB); + } + } + + // Our search didn't find anything, so return the entry of the + // function containing the given block. + return &(BB->getParent()->getEntryBlock()); +} + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. 
Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool llvm::isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum) { + assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); + if (TI->getNumSuccessors() == 1) return false; + + // Edge must come from a detach. + if (!isa(TI)) return false; + // Edge must go to the continuation. + if (SuccNum != 1) return false; + + const BasicBlock *Dest = TI->getSuccessor(SuccNum); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); + + // If there is more than one predecessor, this is a critical edge... + assert(I != E && "No preds, but we have an edge to the block?"); + const BasicBlock *DetachPred = TI->getParent(); + for (; I != E; ++I) { + if (DetachPred == *I) continue; + if (isa((*I)->getTerminator())) continue; + return true; + } + return false; +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c45dee590b8452..3d7800dd9b82b0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2741,6 +2741,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); + BasicBlock *sync_split = nullptr; + if (isa(VectorPH->getTerminator())) { + sync_split = VectorPH->splitBasicBlockWithTerminator("vector.sync_split"); + DT->splitBlock(sync_split); + //DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + DT->verifyDomTree(); + VectorPH = sync_split; + } + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. 
In the code below we also support a case where we @@ -2773,6 +2782,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + if (sync_split) ParentLoop->addBasicBlockToLoop(sync_split, *LI); } else { LI->addTopLevelLoop(Lp); } diff --git a/llvm/microbenchmarks/everything/everything.c b/llvm/microbenchmarks/everything/everything.c new file mode 100644 index 00000000000000..d2dd0aa96e5f2c --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.c @@ -0,0 +1,32 @@ +#include +#include + +int foo() { + return 10; +} + +int bar(); + +int main() { + double c = foo(); + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + cilk_spawn { + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + cilk_spawn { + if (c) { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/everything.ll b/llvm/microbenchmarks/everything/everything.ll new file mode 100644 index 00000000000000..249549a7131cd5 --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.ll @@ -0,0 +1,118 @@ +; ModuleID = 'everything.c' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @foo() #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca double, align 8 + store i32 0, i32* %retval, align 4 + %call = call i32 @foo() + %conv = sitofp i32 %call to double + store double %conv, double* %c, align 8 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %1 = load double, double* %c, align 8 + %call1 = call double @sin(double %1) #2 + %2 = load double, double* %c, align 8 + %add = fadd double %2, %call1 + store double %add, double* %c, align 8 + %3 = load double, double* %c, align 8 + %call2 = call double @sin(double %3) #2 + %4 = load double, double* %c, align 8 + %add3 = fadd double %4, %call2 + store double %add3, double* %c, align 8 + %5 = load double, double* %c, align 8 + %call4 = call double @sin(double %5) #2 + %6 = load double, double* %c, align 8 + %add5 = fadd double %6, %call4 + store double %add5, double* %c, align 8 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd6, label %det.cont15 + +det.achd6: ; preds = %det.cont + %7 = bitcast i32 undef to i32 + detach label %det.achd7, label %det.cont14 + +det.achd7: ; preds = %det.achd6 + %8 = bitcast i32 undef to i32 + %9 = load double, double* %c, align 8 + %call8 = call double @sin(double %9) #2 + %10 = load double, double* %c, align 8 + %add9 = fadd double %10, %call8 + store double %add9, double* %c, align 8 + %11 = load double, double* %c, align 8 + %call10 = call double @sin(double %11) #2 + %12 = load double, double* %c, align 8 + %add11 = fadd double %12, %call10 + store double %add11, double* %c, align 8 + %13 = load double, double* %c, align 8 + %call12 = call double @sin(double %13) #2 + %14 = load double, double* %c, align 8 + %add13 = fadd double %14, %call12 + store double %add13, double* %c, align 8 + reattach label %det.cont14 + +det.cont14: ; preds = %det.achd7, %det.achd6 + reattach label %det.cont15 + +det.cont15: ; preds = %det.cont14, %det.cont + detach label %det.achd16, label 
%det.cont23 + +det.achd16: ; preds = %det.cont15 + %15 = bitcast i32 undef to i32 + %16 = load double, double* %c, align 8 + %tobool = fcmp une double %16, 0.000000e+00 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %det.achd16 + %17 = load double, double* %c, align 8 + %call17 = call double @sin(double %17) #2 + %18 = load double, double* %c, align 8 + %add18 = fadd double %18, %call17 + store double %add18, double* %c, align 8 + %19 = load double, double* %c, align 8 + %call19 = call double @sin(double %19) #2 + %20 = load double, double* %c, align 8 + %add20 = fadd double %20, %call19 + store double %add20, double* %c, align 8 + %21 = load double, double* %c, align 8 + %call21 = call double @sin(double %21) #2 + %22 = load double, double* %c, align 8 + %add22 = fadd double %22, %call21 + store double %add22, double* %c, align 8 + br label %if.end + +if.end: ; preds = %if.then, %det.achd16 + reattach label %det.cont23 + +det.cont23: ; preds = %if.end, %det.cont15 + %23 = load double, double* %c, align 8 + %conv24 = fptosi double %23 to i32 + ret i32 %conv24 +} + +; Function Attrs: nounwind readnone +declare double @sin(double) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/simple.c b/llvm/microbenchmarks/everything/simple.c new file mode 100644 index 00000000000000..aa4252c4bc3890 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = 0; + for (int i=0; i < 1000; i++) { + cilk_spawn { + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/simple.ll b/llvm/microbenchmarks/everything/simple.ll new file mode 100644 index 00000000000000..268be428dbd3c6 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.ll @@ -0,0 +1,53 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %c, align 4 + store 
i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + detach label %det.achd, label %det.cont + +det.achd: ; preds = %for.body + %1 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + reattach label %det.cont + +det.cont: ; preds = %det.achd, %for.body + br label %for.inc + +for.inc: ; preds = %det.cont + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %3 = load i32, i32* %c, align 4 + ret i32 %3 +} + +declare i32 @foo(...) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/temp.ll b/llvm/microbenchmarks/everything/temp.ll new file mode 100644 index 00000000000000..5d49d66271d392 --- /dev/null +++ b/llvm/microbenchmarks/everything/temp.ll @@ -0,0 +1,24 @@ +; ModuleID = '' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_foo() local_unnamed_addr #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_main() local_unnamed_addr #0 { +entry: + ret i32 9 +} + +attributes #0 = { noinline norecurse nounwind readnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 
8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/redundantspawn/complex.c b/llvm/microbenchmarks/redundantspawn/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_nested.c b/llvm/microbenchmarks/redundantspawn/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_redundant.c b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c new file mode 100644 index 00000000000000..aa52f045e0be6f --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/serial.c b/llvm/microbenchmarks/redundantspawn/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/simple_spawn.c b/llvm/microbenchmarks/redundantspawn/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/single_redundant.c b/llvm/microbenchmarks/redundantspawn/single_redundant.c new file mode 100644 index 00000000000000..33de19ce0f1872 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/single_redundant.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.c b/llvm/microbenchmarks/smallblock/conditional.c new file mode 100644 index 00000000000000..058b70da06735f --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.c @@ -0,0 +1,27 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c*2 > 1) { + cilk_spawn { + if (c > 1) { + bar(); + } else { + foo(); + } + } + } else if (c*3 < 1) { + cilk_spawn { + bar(); + } + } else { + cilk_spawn { + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.ll b/llvm/microbenchmarks/smallblock/conditional.ll new file mode 100644 index 00000000000000..6e796bb19273e1 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.ll @@ -0,0 +1,66 @@ +; ModuleID = 'conditional.c' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target 
triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %mul = shl nsw i32 %call, 1 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + detach label %det.achd, label %if.end17 + +det.achd: ; preds = %if.then + %cmp1 = icmp sgt i32 %call, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = tail call i32 (...) @bar() #2 + br label %if.end + +if.else: ; preds = %det.achd + %call4 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + reattach label %if.end17 + +if.else5: ; preds = %entry + %cmp7 = icmp slt i32 %call, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + detach label %det.achd9, label %if.end17 + +det.achd9: ; preds = %if.then8 + %call10 = tail call i32 (...) @bar() #2 + reattach label %if.end17 + +if.else12: ; preds = %if.else5 + detach label %det.achd13, label %if.end17 + +det.achd13: ; preds = %if.else12 + %call14 = tail call i32 (...) @foo() #2 + reattach label %if.end17 + +if.end17: ; preds = %det.achd9, %if.then8, %det.achd13, %if.else12, %if.then, %if.end + ret i32 %call +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/conditional_opt.ll b/llvm/microbenchmarks/smallblock/conditional_opt.ll new file mode 100644 index 00000000000000..226b5972c852b4 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional_opt.ll @@ -0,0 +1,89 @@ +; ModuleID = '' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @SmallBlock_main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call = call i32 (...) 
@foo() + store i32 %call, i32* %c, align 4 + %0 = load i32, i32* %c, align 4 + %mul = mul nsw i32 %0, 2 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + br label %det.achd + +det.achd: ; preds = %if.then + %1 = bitcast i32 undef to i32 + %2 = load i32, i32* %c, align 4 + %cmp1 = icmp sgt i32 %2, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = call i32 (...) @bar() + br label %if.end + +if.else: ; preds = %det.achd + %call4 = call i32 (...) @foo() + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + br label %det.cont + +det.cont: ; preds = %if.end + br label %if.end17 + +if.else5: ; preds = %entry + %3 = load i32, i32* %c, align 4 + %mul6 = mul nsw i32 %3, 3 + %cmp7 = icmp slt i32 %mul6, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + br label %det.achd9 + +det.achd9: ; preds = %if.then8 + %4 = bitcast i32 undef to i32 + %call10 = call i32 (...) @bar() + br label %det.cont11 + +det.cont11: ; preds = %det.achd9 + br label %if.end16 + +if.else12: ; preds = %if.else5 + br label %det.achd13 + +det.achd13: ; preds = %if.else12 + %5 = bitcast i32 undef to i32 + %call14 = call i32 (...) @foo() + br label %det.cont15 + +det.cont15: ; preds = %det.achd13 + br label %if.end16 + +if.end16: ; preds = %det.cont15, %det.cont11 + br label %if.end17 + +if.end17: ; preds = %if.end16, %det.cont + %6 = load i32, i32* %c, align 4 + ret i32 %6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/multiple_nested.c b/llvm/microbenchmarks/smallblock/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/multiple_spawn.c b/llvm/microbenchmarks/smallblock/multiple_spawn.c new file mode 100644 index 00000000000000..b551796f050ed0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_spawn.c @@ -0,0 +1,19 @@ 
+#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/serial.c b/llvm/microbenchmarks/smallblock/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/simple_spawn.c b/llvm/microbenchmarks/smallblock/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.c b/llvm/microbenchmarks/spawnrestructure/base_negative.c new file mode 100644 index 00000000000000..3718ca3466844c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + bar(); + return 0; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.ll b/llvm/microbenchmarks/spawnrestructure/base_negative.ll new file mode 100644 index 00000000000000..b92b96b872d09c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_negative.c' +source_filename = "base_negative.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 0 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.c b/llvm/microbenchmarks/spawnrestructure/base_positive.c new file mode 100644 index 00000000000000..7e3d0546efd52b --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.c @@ -0,0 +1,19 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return bar(); +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.ll b/llvm/microbenchmarks/spawnrestructure/base_positive.ll new file mode 100644 index 00000000000000..8055cdfe786d67 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_positive.c' +source_filename = "base_positive.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 %call6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/complex.c b/llvm/microbenchmarks/spawnrestructure/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/multiple_nested.c b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/serial.c b/llvm/microbenchmarks/spawnrestructure/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/simple_spawn.c b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.c b/llvm/microbenchmarks/spawnunswitch/simple.c new file mode 100644 index 00000000000000..d817a44c676419 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + int d = bar(); + cilk_spawn { + if (c) { + foo(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.ll 
b/llvm/microbenchmarks/spawnunswitch/simple.ll new file mode 100644 index 00000000000000..05d3ac9fbbd8ec --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.ll @@ -0,0 +1,41 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %call1 = tail call i32 (...) @bar() #2 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %det.achd, %if.then + reattach label %det.cont + +det.cont: ; preds = %if.end, %entry + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.c b/llvm/microbenchmarks/spawnunswitch/simple2.c new file mode 100644 index 00000000000000..7e376f1522451d --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.c @@ -0,0 +1,14 @@ +#include + +int foo(); + +int bar(); + +int main() { + if (foo()) { + cilk_spawn { + bar(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.ll b/llvm/microbenchmarks/spawnunswitch/simple2.ll new file mode 100644 index 00000000000000..a6dfc993f89703 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.ll @@ -0,0 +1,37 @@ +; ModuleID = 'simple2.c' +source_filename = "simple2.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + detach label %det.achd, label %if.end + +det.achd: ; preds = %if.then + %call1 = tail call i32 (...) 
@bar() #2 + reattach label %if.end + +if.end: ; preds = %entry, %if.then, %det.achd + %call2 = tail call i32 (...) @foo() #2 + ret i32 %call2 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/temp.ll b/llvm/microbenchmarks/spawnunswitch/temp.ll new file mode 100644 index 00000000000000..1484049381dfc4 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/temp.ll @@ -0,0 +1,38 @@ +; ModuleID = '' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @SpawnUnswitch_main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %call1 = tail call i32 (...) @bar() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + detach label %if.end, label %det.cont + +if.end: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry, %if.end + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) 
local_unnamed_addr #1
+
+attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"}
diff --git a/llvm/microbenchmarks/spawnunswitch/test.c b/llvm/microbenchmarks/spawnunswitch/test.c
new file mode 100644
index 00000000000000..7228775811b839
--- /dev/null
+++ b/llvm/microbenchmarks/spawnunswitch/test.c
@@ -0,0 +1,12 @@
+#include <cilk/cilk.h>
+
+int foo();
+
+int bar();
+
+int main() {
+  cilk_for (int i=0; i < 1000; i++) {
+    foo();
+  }
+  return foo();
+}
diff --git a/llvm/microbenchmarks/spawnunswitch/test2.c b/llvm/microbenchmarks/spawnunswitch/test2.c
new file mode 100644
index 00000000000000..56dd3cb7977f61
--- /dev/null
+++ b/llvm/microbenchmarks/spawnunswitch/test2.c
@@ -0,0 +1,12 @@
+#include <cilk/cilk.h>
+
+int foo();
+
+int bar();
+
+int main() {
+  cilk_spawn {
+    bar();
+  }
+  return foo();
+}
diff --git a/llvm/microbenchmarks/timing/average.py b/llvm/microbenchmarks/timing/average.py
new file mode 100644
index 00000000000000..17dc85395caa7a
--- /dev/null
+++ b/llvm/microbenchmarks/timing/average.py
@@ -0,0 +1,10 @@
+import sys
+f = open("spawn.txt", 'r')
+g = open("simple.txt", 'r')
+total1 = 0
+for line in f.readlines():
+    total1 += int(line[:len(line)-1])
+total2 = 0
+for line in g.readlines():
+    total2 += int(line[:len(line)-1])
+print("Spawn to serial ratio: " + str((total1*1.0)/total2))
diff --git a/llvm/microbenchmarks/timing/ratio.sh b/llvm/microbenchmarks/timing/ratio.sh
new file mode 100644
index 00000000000000..ac4c6a3e239305
--- /dev/null
+++ b/llvm/microbenchmarks/timing/ratio.sh
@@ -0,0 +1 @@
+for i in {1..100};do ./simple >> simple.txt;./spawn >> spawn.txt;done;python average.py;rm *.txt
diff --git a/llvm/microbenchmarks/timing/simple b/llvm/microbenchmarks/timing/simple
new file mode 100755
index 00000000000000..68c3cd94e6a26f
Binary files /dev/null and b/llvm/microbenchmarks/timing/simple differ
diff --git a/llvm/microbenchmarks/timing/simple.c b/llvm/microbenchmarks/timing/simple.c
new file mode 100644
index 00000000000000..c7a90879912060
--- /dev/null
+++ b/llvm/microbenchmarks/timing/simple.c
@@ -0,0 +1,16 @@
+#include <stdio.h>
+#include <time.h>
+
+int main() {
+  int c = 0;
+  int its = 100;
+  clock_t start = clock(), diff;
+  #pragma unroll
+  for (int i = 0; i < its; i++) {
+    c += i;
+  }
+  diff = clock() - start;
+  int usec = (diff * 1000000) / CLOCKS_PER_SEC;
+  printf("%d\n", usec);
+  return c;
+}
diff --git a/llvm/microbenchmarks/timing/spawn b/llvm/microbenchmarks/timing/spawn
new file mode 100755
index 00000000000000..3dc36bce56b1ea
Binary files /dev/null and b/llvm/microbenchmarks/timing/spawn differ
diff --git a/llvm/microbenchmarks/timing/spawn.c b/llvm/microbenchmarks/timing/spawn.c
new file mode 100644
index 00000000000000..1588cfec2f113d
--- /dev/null
+++ b/llvm/microbenchmarks/timing/spawn.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <time.h>
+#include <cilk/cilk.h>
+
+int main() {
+  int c = 0;
+  int its = 100;
+  clock_t start = clock(), diff;
+  cilk_spawn {
+    for (int i = 0; i < its; i++) {
+      c += i;
+    }
+  }
+  cilk_sync;
+  diff = clock() - start;
+  int usec = (diff * 1000000) / CLOCKS_PER_SEC;
+  printf("%d\n", usec);
+  return c;
+}
diff --git a/llvm/test/Transforms/LoopFuse/fuse.ll b/llvm/test/Transforms/LoopFuse/fuse.ll
new file mode 100644
index 00000000000000..f283778f432028
--- /dev/null
+++ b/llvm/test/Transforms/LoopFuse/fuse.ll
@@ -0,0 +1,87 @@
+; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s
+
+; 'C' equivalent: Partially generated and hand modified.
+; void fuse(int *a, int *b, int *c) {
+;   for (i = 0; i < 1000; ++i) // L1
+;     c[i] = a[i] + c[i + 1];
+;   for (i = 0; i < 1000; ++i) // L2
+;     c[i] = a[i] + b[i];
+; }
+; There is no backward dependence from L1 to L2. So it is safe to fuse.
+
+; Test that there are two versions - original loops and fused loop.
+; CHECK: br i1 %memcheck.conflict, label %entry.split, label %entry.split.L1clone
+
+; Test for fusion along fused path.
+; CHECK: for.body.L1clone: ; preds = %for.body.1.L2clone, %entry.split.L1clone
+; CHECK: for.body.1.L2clone: ; preds = %for.body.L1clone
+; CHECK: br i1 %exitcond.L1clone, label %for.end.loopexit.1, label %for.body.L1clone, !llvm.loop !1
+
+; Test for merged defs and its uses outside the loops.
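+; Values defined in the loops (%add4 in L1, %add11 in L2) are live past the loops,
+; so the original and cloned definitions must be merged by phis in the shared exit
+; block before %outsideUse can consume them.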
+; CHECK: for.end.loopexit.1: ; preds = %for.body.1.L2clone, %for.body.1 +; CHECK: %add11.lfuse = phi i32 [ %add11, %for.body.1 ], [ %add11.L2clone, %for.body.1.L2clone ] +; CHECK: %add4.lfuse = phi i32 [ %add4, %for.body.1 ], [ %add4.L1clone, %for.body.1.L2clone ] +; CHECK: %outsideUse = add nsw i32 %add11.lfuse, %add4.lfuse + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + %outsideUse = add nsw i32 %add11, %add4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/LoopFuse/no-fuse.ll b/llvm/test/Transforms/LoopFuse/no-fuse.ll new file mode 100644 index 00000000000000..7abb67fd622998 --- /dev/null +++ b/llvm/test/Transforms/LoopFuse/no-fuse.ll @@ -0,0 +1,78 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void noFuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i - 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is a backward dependence from L1 to L2. So it is unsafe to fuse. 
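+; L1 reads c[i - 1] written by its own previous iteration; if the bodies were fused,
+; L2's store to c[i] would clobber that value before the next L1 iteration reads it,
+; so both loops must be left in place.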
+ +; CHECK: entry: +; CHECK-NEXT: br label %for.body +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK: for.body.1: + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %indvars.iv.next.back = add i64 %indvars.iv, -1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next.back + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp new file mode 100644 index 00000000000000..0461b69c99b3a1 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp @@ -0,0 +1,6 @@ +#include + +void func() { + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll new file mode 100644 index 00000000000000..5615d4c1310d2c --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic1.cpp' +source_filename = "basic1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: +; CHECK: @_Z4funcv + %syncreg = call token @llvm.syncregion.start() +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label 
%sync.continue + +sync.continue: ; preds = %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue1 + +; CHECK: sync.continue +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp new file mode 100644 index 00000000000000..6de0ad05f14611 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_spawn { + } + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll new file mode 100644 index 00000000000000..5658771430bc25 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic2.cpp' +source_filename = "basic2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +; CHECK: @_Z4funcv +entry: + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue + +; CHECK: sync.continue +sync.continue: ; preds = %det.cont + sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp new file mode 100644 index 00000000000000..03c7cb7efdd77d --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp @@ -0,0 +1,9 @@ +#include + +void func() { + int a; + cilk_spawn { + a = 1; + } + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll new file mode 100644 index 00000000000000..0638fc2d81c5b9 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll @@ -0,0 +1,37 @@ +; 
RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail1.cpp' +source_filename = "fail1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %a = alloca i32, align 4 + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + store i32 1, i32* %a, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + store i32 2, i32* %a, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp new file mode 100644 index 00000000000000..779d13b2483954 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp @@ -0,0 +1,10 @@ +#include + +void func(int *a, int *b) { + cilk_spawn { + *a = 1; + } + cilk_sync; + *b = 2; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll new file mode 100644 index 00000000000000..c4d2d395658f34 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail2.cpp' +source_filename = "fail2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcPiS_(i32* %a, i32* %b) #0 { +entry: + %a.addr = alloca i32*, align 8 + %b.addr = alloca i32*, align 8 + %syncreg = call token @llvm.syncregion.start() + store i32* %a, i32** %a.addr, align 8 + store i32* %b, i32** %b.addr, align 8 + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = load i32*, i32** %a.addr, align 8 + store i32 1, i32* %0, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + %1 = load i32*, i32** %b.addr, align 8 + store i32 2, i32* %1, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp new file mode 100644 index 00000000000000..bcf9db1d5e83f3 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_for (int i = 0; i < 10; i++) { + } + cilk_for (int i = 0; i < 10; i++) { + } +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.ll b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll new file mode 100644 index 00000000000000..394e04b2bc0731 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll @@ -0,0 +1,112 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'for1.cpp' +source_filename = "for1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + %__init = alloca i32, align 4 + %__begin = alloca i32, align 4 + %__end = alloca i32, align 4 + %syncreg1 = call token @llvm.syncregion.start() + %__init2 = alloca i32, align 4 + %__begin3 = alloca i32, align 4 + %__end4 = alloca i32, align 4 + store i32 0, i32* %__init, align 4 + store i32 0, i32* %__begin, align 4 + store i32 10, i32* %__end, align 4 + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc, %entry + %0 = load i32, i32* %__begin, align 4 + %1 = load i32, i32* %__end, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %pfor.detach, label %pfor.end + +pfor.detach: ; preds = %pfor.cond + %2 = load i32, i32* %__init, align 4 + %3 = load i32, i32* %__begin, align 4 + %mul = mul nsw i32 %3, 1 + %add = add nsw i32 %2, %mul + detach within %syncreg, label %pfor.body.entry, label %pfor.inc + +pfor.body.entry: ; preds = %pfor.detach + %i = alloca i32, align 4 + store i32 %add, i32* %i, align 4 + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach + %4 = load i32, i32* %__begin, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %__begin, align 4 + br label %pfor.cond, !llvm.loop !1 + +pfor.end: ; preds = %pfor.cond + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.end + store i32 0, i32* %__init2, align 4 + store i32 0, i32* %__begin3, align 4 + store i32 10, i32* %__end4, align 4 + br label %pfor.cond3 + +; CHECK: pfor.end +; CHECK-NOT: sync +; CHECK: pfor.cond + +pfor.cond3: ; preds = %pfor.inc8, %pfor.end.continue + %5 = load i32, i32* %__begin3, align 4 + %6 = load i32, i32* %__end4, align 4 + %cmp6 = icmp slt i32 %5, %6 + br i1 %cmp6, label %pfor.detach5, label %pfor.end10 + +pfor.detach5: ; preds = %pfor.cond3 + %7 = load i32, i32* %__init2, align 4 + %8 = load i32, i32* %__begin3, align 4 + %mul8 = mul nsw i32 %8, 1 + %add9 = add nsw i32 %7, %mul8 + detach within %syncreg1, label %pfor.body.entry6, label %pfor.inc8 + +pfor.body.entry6: 
; preds = %pfor.detach5 + %i11 = alloca i32, align 4 + store i32 %add9, i32* %i11, align 4 + br label %pfor.body6 + +pfor.body6: ; preds = %pfor.body.entry5 + br label %pfor.preattach7 + +pfor.preattach7: ; preds = %pfor.body6 + reattach within %syncreg1, label %pfor.inc8 + +pfor.inc8: ; preds = %pfor.preattach7, %pfor.detach5 + %9 = load i32, i32* %__begin3, align 4 + %inc15 = add nsw i32 %9, 1 + store i32 %inc15, i32* %__begin3, align 4 + br label %pfor.cond3, !llvm.loop !3 + +pfor.end10: ; preds = %pfor.cond3 + sync within %syncreg1, label %pfor.end.continue11 + +pfor.end.continue11: ; preds = %pfor.end10 + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!1 = distinct !{!1, !2} +!2 = !{!"tapir.loop.spawn.strategy", i32 1} +!3 = distinct !{!3, !2} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp new file mode 100644 index 00000000000000..5627249702cef6 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_for (int i = 0; i < 100; i++) { + cilk_for (int j = 0; j < 3; j++) { + } + } +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for2.ll b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll new file mode 100644 index 00000000000000..91b70b4db95e94 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s +; XFAIL: * + +; ModuleID = 'for2.cpp' +source_filename = "for2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc15, %entry + %__begin.0 = phi i32 [ 0, %entry ], [ %inc16, %pfor.inc15 ] + %cmp = icmp slt i32 %__begin.0, 100 + br i1 %cmp, label %pfor.detach, label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond +;; The sync before a return is not safe to remove. 
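+;; It is the only sync guarding iterations detached within %syncreg, so removing it
+;; would allow the function to return while spawned children are still running.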
+; CHECK: sync within %syncreg, label %pfor.end.continue + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.cond + detach within %syncreg, label %pfor.body.entry, label %pfor.inc15 + +pfor.body.entry: ; preds = %pfor.detach + %syncreg1 = call token @llvm.syncregion.start() + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.cond5 + +pfor.cond5: ; preds = %pfor.inc, %pfor.body + %__begin3.0 = phi i32 [ 0, %pfor.body ], [ %inc, %pfor.inc ] + %cmp6 = icmp slt i32 %__begin3.0, 3 + br i1 %cmp6, label %pfor.detach9, label %pfor.cond.cleanup7 + +; CHECK: pfor.cond5 +pfor.cond.cleanup7: ; preds = %pfor.cond5 +; CHECK-NOT: sync within %syncreg1, label %pfor.end.continue + sync within %syncreg1, label %pfor.end.continue8 +; CHECK: pfor.inc15 + +pfor.end.continue8: ; preds = %pfor.cond.cleanup7 + reattach within %syncreg, label %pfor.inc15 + +pfor.detach9: ; preds = %pfor.cond5 + detach within %syncreg1, label %pfor.body.entry12, label %pfor.inc + +pfor.body.entry12: ; preds = %pfor.detach9 + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body.entry12 + reattach within %syncreg1, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach9 + %inc = add nsw i32 %__begin3.0, 1 + br label %pfor.cond5, !llvm.loop !2 + +pfor.inc15: ; preds = %pfor.end.continue8, %pfor.detach + %inc16 = add nsw i32 %__begin.0, 1 + br label %pfor.cond, !llvm.loop !4 +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!2 = distinct !{!2, !3} +!3 = !{!"tapir.loop.spawn.strategy", i32 1} +!4 = distinct !{!4, !3} diff --git a/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll new file mode 100644 index 00000000000000..a31d07f206846d --- /dev/null +++ b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll @@ -0,0 +1,98 @@ +; Test that Tapir's loop spawning pass transforms this simple loop +; into recursive divide-and-conquer. + +; RUN: opt < %s -loop-spawning -S | FileCheck %s + +; Function Attrs: nounwind uwtable +define void @foo(i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +entry: + %syncreg = call token @llvm.syncregion.start() + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %pfor.detach.preheader, label %pfor.cond.cleanup + +pfor.detach.preheader: ; preds = %entry +; CHECK: pfor.detach.preheader: +; CHECK: [[LIMIT:%[0-9]+]] = add [[TYPE:i[0-9]+]] %n, -1 +; CHECK: call fastcc void @[[OUTLINED:[a-zA-Z0-9._]+]]( +; CHECK: [[TYPE]] 0 +; CHECK: [[TYPE]] [[LIMIT]] +; CHECK: [[TYPE]] {{[%]?[a-zA-Z0-9._]+}} +; CHECK-NEXT: br label %pfor.cond.cleanup.loopexit + br label %pfor.detach + +pfor.cond.cleanup.loopexit: ; preds = %pfor.inc + br label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond.cleanup.loopexit, %entry +; CHECK: pfor.cond.cleanup +; CHECK-NOT: sync within %syncreg, label %0 + sync within %syncreg, label %0 + +;