diff --git a/llvm/.circleci/config.yml b/llvm/.circleci/config.yml new file mode 100644 index 00000000000000..ad6dfc714d34e1 --- /dev/null +++ b/llvm/.circleci/config.yml @@ -0,0 +1,30 @@ +version: 2 +jobs: + build: + resource_class: xlarge + docker: + - image: wsmoses/tapir:latest + + steps: + - checkout + - run: + name: submodules + command: | + git submodule sync + git submodule update --init --recursive + - run: + name: cmake + command: | + mkdir build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD=host -DLLVM_BUILD_TESTS=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LTO=OFF -DLLVM_PARALLEL_COMPILE_JOBS=2 -DLLVM_PARALLEL_LINK_JOBS=1 + - run: + name: make + command: | + cd build + make -j2 + - run: + name: test + command: | + cd build + make check-all diff --git a/llvm/.gitignore b/llvm/.gitignore index be58944c9b3a1a..ba3de277d5e2f7 100644 --- a/llvm/.gitignore +++ b/llvm/.gitignore @@ -80,3 +80,8 @@ docs/_build #==============================================================================# bindings/go/llvm/llvm_config.go bindings/go/llvm/workdir + +build/* +build +build-debug/* +build-debug diff --git a/llvm/898/sync_elimination_pfor_mb/main.c b/llvm/898/sync_elimination_pfor_mb/main.c new file mode 100644 index 00000000000000..b5e0ebc4281ad5 --- /dev/null +++ b/llvm/898/sync_elimination_pfor_mb/main.c @@ -0,0 +1,35 @@ +#include + +#include + +#define N 100000000 + +__attribute__((always_inline)) +int f(int x) { + return x * x; +} + +__attribute__((always_inline)) +int g(int x) { + return x + 3; +} + +int r1[N]; +int r2[N]; + +int main(void) +{ + int sum = 0; + + cilk_for (int i=0; i class detachaccess_def_iterator_base; +using detachaccess_def_iterator = detachaccess_def_iterator_base; +using const_detachaccess_def_iterator = + detachaccess_def_iterator_base; + +// \brief The base for all detach accesses, i.e., detaches (defs) and syncs +// (uses). +class DetachAccess + : public DerivedUser, + public ilist_node>, + public ilist_node> { +public: + using AllAccessType = + ilist_node>; + using DefsOnlyType = + ilist_node>; + + // Methods for support type inquiry through isa, cast, and + // dyn_cast + static inline bool classof(const Value *V) { + unsigned ID = V->getValueID(); + return ID == DetachUseVal || ID == DetachPhiVal || ID == DetachDefVal; + } + + DetachAccess(const DetachAccess &) = delete; + DetachAccess &operator=(const DetachAccess &) = delete; + + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + BasicBlock *getBlock() const { return Block; } + + void print(raw_ostream &OS) const; + void dump() const; + + /// \brief The user iterators for a detach access + typedef user_iterator iterator; + typedef const_user_iterator const_iterator; + + /// \brief This iterator walks over all of the defs in a given + /// DetachAccess. For DetachPhi nodes, this walks arguments. For + /// DetachUse/DetachDef, this walks the defining access. + detachaccess_def_iterator defs_begin(); + const_detachaccess_def_iterator defs_begin() const; + detachaccess_def_iterator defs_end(); + const_detachaccess_def_iterator defs_end() const; + + /// \brief Get the iterators for the all access list and the defs only list + /// We default to the all access list. 
+ AllAccessType::self_iterator getIterator() { + return this->AllAccessType::getIterator(); + } + AllAccessType::const_self_iterator getIterator() const { + return this->AllAccessType::getIterator(); + } + AllAccessType::reverse_self_iterator getReverseIterator() { + return this->AllAccessType::getReverseIterator(); + } + AllAccessType::const_reverse_self_iterator getReverseIterator() const { + return this->AllAccessType::getReverseIterator(); + } + DefsOnlyType::self_iterator getDefsIterator() { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::const_self_iterator getDefsIterator() const { + return this->DefsOnlyType::getIterator(); + } + DefsOnlyType::reverse_self_iterator getReverseDefsIterator() { + return this->DefsOnlyType::getReverseIterator(); + } + DefsOnlyType::const_reverse_self_iterator getReverseDefsIterator() const { + return this->DefsOnlyType::getReverseIterator(); + } + +protected: + friend class DetachSSA; + friend class DetachUseOrDef; + friend class DetachUse; + friend class DetachDef; + friend class DetachPhi; + + /// \brief Used by DetachSSA to change the block of a DetachAccess when it is + /// moved. + void setBlock(BasicBlock *BB) { Block = BB; } + + /// \brief Used for debugging and tracking things about DetachAccesses. + /// Guaranteed unique among DetachAccesses, no guarantees otherwise. + inline unsigned getID() const; + + DetachAccess(LLVMContext &C, unsigned Vty, DeleteValueTy DeleteValue, + BasicBlock *BB, unsigned NumOperands) + : DerivedUser(Type::getVoidTy(C), Vty, nullptr, NumOperands, DeleteValue), + Block(BB) {} + +private: + BasicBlock *Block; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, const DetachAccess &DA) { + DA.print(OS); + return OS; +} + +/// \brief Class that has the common methods + fields of detach uses/defs. It's +/// a little awkward to have, but there are many cases where we want either a +/// use or def, and there are many cases where uses are needed (defs aren't +/// acceptable), and vice-versa. +/// +/// This class should never be instantiated directly; make a DetachUse or +/// DetachDef instead. +class DetachUseOrDef : public DetachAccess { +public: + void *operator new(size_t, unsigned) = delete; + void *operator new(size_t) = delete; + + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + /// \brief Get the instruction that this DetachAccess represents. + Instruction *getDAInst() const { return DAInst; } + + /// \brief Get the access that produces the detach state used by this Use. + DetachAccess *getDefiningAccess() const { return getOperand(0); } + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal || DA->getValueID() == DetachDefVal; + } + + // Sadly, these have to be public because they are needed in some of the + // iterators. + inline bool isOptimized() const; + inline DetachAccess *getOptimized() const; + inline void setOptimized(DetachAccess *); + + /// \brief Reset the ID of what this DetachUse was optimized to, causing it to + /// be rewalked by the walker if necessary. + /// This really should only be called by tests. 
+ inline void resetOptimized(); + +protected: + friend class DetachSSA; + DetachUseOrDef(LLVMContext &C, DetachAccess *DDA, unsigned Vty, + DeleteValueTy DeleteValue, Instruction *TI, BasicBlock *BB) + : DetachAccess(C, Vty, DeleteValue, BB, 1), DAInst(TI) { + setDefiningAccess(DDA); + } + void setDefiningAccess(DetachAccess *DDA, bool Optimized = false) { + if (!Optimized) { + setOperand(0, DDA); + return; + } + setOptimized(DDA); + } + +private: + Instruction *DAInst; +}; + +template <> +struct OperandTraits + : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUseOrDef, DetachAccess) + +/// \brief Represents a detach use, i.e., a sync instruction. +class DetachUse final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachUse(LLVMContext &C, DetachAccess *DDA, Instruction *SI, BasicBlock *BB) + : DetachUseOrDef(C, DDA, DetachUseVal, deleteMe, SI, BB), + OptimizedID(0) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachUseVal; + } + + void print(raw_ostream &OS) const; + + void setOptimized(DetachAccess *DDA) { + OptimizedID = DDA->getID(); + setOperand(0, DDA); + } + + bool isOptimized() const { + return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID(); + } + + DetachAccess *getOptimized() const { + return getDefiningAccess(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + +protected: + friend class DetachSSA; + +private: + static void deleteMe(DerivedUser *Self); + + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachUse, DetachAccess) + +/// \brief Represents a detach definition, i.e., a detach. +class DetachDef final : public DetachUseOrDef { +public: + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachDef(LLVMContext &C, DetachAccess *DDA, Instruction *DI, BasicBlock *BB, + unsigned Ver) + : DetachUseOrDef(C, DDA, DetachDefVal, deleteMe, DI, BB), + ID(Ver), Optimized(nullptr), OptimizedID(INVALID_DETACHACCESS_ID) {} + + // allocate space for exactly one operand + void *operator new(size_t s) { return User::operator new(s, 1); } + void *operator new(size_t, unsigned) = delete; + + static inline bool classof(const Value *DA) { + return DA->getValueID() == DetachDefVal; + } + + void setOptimized(DetachAccess *DA) { + Optimized = DA; + OptimizedID = getDefiningAccess()->getID(); + } + DetachAccess *getOptimized() const { return Optimized; } + bool isOptimized() const { + return getOptimized() && getDefiningAccess() && + OptimizedID == getDefiningAccess()->getID(); + } + void resetOptimized() { + OptimizedID = INVALID_DETACHACCESS_ID; + } + + void print(raw_ostream &OS) const; + + friend class DetachSSA; + + unsigned getID() const { return ID; } + +private: + static void deleteMe(DerivedUser *Self); + + const unsigned ID; + DetachAccess *Optimized; + unsigned int OptimizedID; +}; + +template <> +struct OperandTraits : public FixedNumOperandTraits {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachDef, DetachAccess) + +/// \brief Represents phi nodes for detach accesses. +/// +/// These have the same semantics as regular phi nodes, with the exception that +/// only one phi will ever exist in a given basic block. 
+/// Guaranteeing one phi per block means guaranteeing there is only ever one +/// valid reaching DetachDef/DetachPHI along each path to the phi node. +/// This is ensured by not allowing disambiguation of the RHS of a DetachDef or +/// a DetachPhi's operands. +class DetachPhi final : public DetachAccess { + // allocate space for exactly zero operands + void *operator new(size_t s) { return User::operator new(s); } + +public: + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(DetachAccess); + + DetachPhi(LLVMContext &C, BasicBlock *BB, unsigned Ver, unsigned NumPreds = 0) + : DetachAccess(C, DetachPhiVal, deleteMe, BB, 0), ID(Ver), + ReservedSpace(NumPreds) { + allocHungoffUses(ReservedSpace); + } + + void *operator new(size_t, unsigned) = delete; + + // Block iterator interface. This provides access to the list of incoming + // basic blocks, which parallels the list of incoming values. + typedef BasicBlock **block_iterator; + typedef BasicBlock *const *const_block_iterator; + + block_iterator block_begin() { + auto *Ref = reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + const_block_iterator block_begin() const { + const auto *Ref = + reinterpret_cast(op_begin() + ReservedSpace); + return reinterpret_cast(Ref + 1); + } + + block_iterator block_end() { return block_begin() + getNumOperands(); } + + const_block_iterator block_end() const { + return block_begin() + getNumOperands(); + } + + iterator_range blocks() { + return make_range(block_begin(), block_end()); + } + + iterator_range blocks() const { + return make_range(block_begin(), block_end()); + } + + op_range incoming_values() { return operands(); } + + const_op_range incoming_values() const { return operands(); } + + /// \brief Return the number of incoming edges + unsigned getNumIncomingValues() const { return getNumOperands(); } + + /// \brief Return incoming value number x + DetachAccess *getIncomingValue(unsigned I) const { return getOperand(I); } + void setIncomingValue(unsigned I, DetachAccess *V) { + assert(V && "PHI node got a null value!"); + setOperand(I, V); + } + static unsigned getOperandNumForIncomingValue(unsigned I) { return I; } + static unsigned getIncomingValueNumForOperand(unsigned I) { return I; } + + /// \brief Return incoming basic block number @p i. + BasicBlock *getIncomingBlock(unsigned I) const { return block_begin()[I]; } + + /// \brief Return incoming basic block corresponding + /// to an operand of the PHI. + BasicBlock *getIncomingBlock(const Use &U) const { + assert(this == U.getUser() && "Iterator doesn't point to PHI's Uses?"); + return getIncomingBlock(unsigned(&U - op_begin())); + } + + /// \brief Return incoming basic block corresponding + /// to value use iterator. + BasicBlock *getIncomingBlock(DetachAccess::const_user_iterator I) const { + return getIncomingBlock(I.getUse()); + } + + void setIncomingBlock(unsigned I, BasicBlock *BB) { + assert(BB && "PHI node got a null basic block!"); + block_begin()[I] = BB; + } + + /// \brief Add an incoming value to the end of the PHI list + void addIncoming(DetachAccess *V, BasicBlock *BB) { + if (getNumOperands() == ReservedSpace) + growOperands(); // Get more space! + // Initialize some new operands. + setNumHungOffUseOperands(getNumOperands() + 1); + setIncomingValue(getNumOperands() - 1, V); + setIncomingBlock(getNumOperands() - 1, BB); + } + + /// \brief Return the first index of the specified basic + /// block in the value list for this PHI. Returns -1 if no instance. 
+ int getBasicBlockIndex(const BasicBlock *BB) const { + for (unsigned I = 0, E = getNumOperands(); I != E; ++I) + if (block_begin()[I] == BB) + return I; + return -1; + } + + Value *getIncomingValueForBlock(const BasicBlock *BB) const { + int Idx = getBasicBlockIndex(BB); + assert(Idx >= 0 && "Invalid basic block argument!"); + return getIncomingValue(Idx); + } + + static inline bool classof(const Value *V) { + return V->getValueID() == DetachPhiVal; + } + + void print(raw_ostream &OS) const; + + unsigned getID() const { return ID; } + +protected: + friend class DetachSSA; + + /// \brief this is more complicated than the generic + /// User::allocHungoffUses, because we have to allocate Uses for the incoming + /// values and pointers to the incoming blocks, all in one allocation. + void allocHungoffUses(unsigned N) { + User::allocHungoffUses(N, /* IsPhi */ true); + } + +private: + // For debugging only + const unsigned ID; + unsigned ReservedSpace; + + /// \brief This grows the operand list in response to a push_back style of + /// operation. This grows the number of ops by 1.5 times. + void growOperands() { + unsigned E = getNumOperands(); + // 2 op PHI nodes are VERY common, so reserve at least enough for that. + ReservedSpace = std::max(E + E / 2, 2u); + growHungoffUses(ReservedSpace, /* IsPhi */ true); + } + + static void deleteMe(DerivedUser *Self); +}; + +inline unsigned DetachAccess::getID() const { + assert((isa(this) || isa(this)) && + "only detach defs and phis have ids"); + if (const auto *DD = dyn_cast(this)) + return DD->getID(); + return cast(this)->getID(); +} + +inline bool DetachUseOrDef::isOptimized() const { + if (const auto *DD = dyn_cast(this)) + return DD->isOptimized(); + return cast(this)->isOptimized(); +} + +inline DetachAccess *DetachUseOrDef::getOptimized() const { + if (const auto *DD = dyn_cast(this)) + return DD->getOptimized(); + return cast(this)->getOptimized(); +} + +inline void DetachUseOrDef::setOptimized(DetachAccess *DA) { + if (auto *DD = dyn_cast(this)) + DD->setOptimized(DA); + else + cast(this)->setOptimized(DA); +} + +inline void DetachUseOrDef::resetOptimized() { + if (auto *DD = dyn_cast(this)) + DD->resetOptimized(); + else + cast(this)->resetOptimized(); +} + + +template <> struct OperandTraits : public HungoffOperandTraits<2> {}; +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachPhi, DetachAccess) + + +/// \brief Encapsulates DetachSSA, including all data associated with detach +/// accesses. +class DetachSSA { +public: + DetachSSA(Function &, DominatorTree *); + ~DetachSSA(); + + /// \brief Given a detach Mod/Ref'ing instruction, get the DetachSSA + /// access associated with it. If passed a basic block gets the detach phi + /// node that exists for that block, if there is one. Otherwise, this will get + /// a DetachUseOrDef. + DetachUseOrDef *getDetachAccess(const Instruction *) const; + DetachPhi *getDetachAccess(const BasicBlock *BB) const; + + void dump() const; + void print(raw_ostream &) const; + + /// \brief Return true if \p MA represents the live on entry value + inline bool isLiveOnEntryDef(const DetachAccess *DA) const { + return DA == LiveOnEntryDef.get(); + } + + inline DetachAccess *getLiveOnEntryDef() const { + return LiveOnEntryDef.get(); + } + + // Sadly, iplists, by default, owns and deletes pointers added to the + // list. It's not currently possible to have two iplists for the same type, + // where one owns the pointers, and one does not. This is because the traits + // are per-type, not per-tag. 
If this ever changes, we should make the + // DefList an iplist. + using AccessList = iplist>; + using DefsList = + simple_ilist>; + + /// \brief Return the list of MemoryAccess's for a given basic block. + /// + /// This list is not modifiable by the user. + const AccessList *getBlockAccesses(const BasicBlock *BB) const { + return getWritableBlockAccesses(BB); + } + + /// \brief Return the list of MemoryDef's and MemoryPhi's for a given basic + /// block. + /// + /// This list is not modifiable by the user. + const DefsList *getBlockDefs(const BasicBlock *BB) const { + return getWritableBlockDefs(BB); + } + + /// \brief Given two detach accesses in the same basic block, determine + /// whether DetachAccess \p A dominates DetachAccess \p B. + bool locallyDominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given two detach accesses in potentially different blocks, + /// determine whether DetachAccess \p A dominates DetachAccess \p B. + bool dominates(const DetachAccess *A, const DetachAccess *B) const; + + /// \brief Given a DetachAccess and a Use, determine whether DetachAccess \p A + /// dominates Use \p B. + bool dominates(const DetachAccess *A, const Use &B) const; + + /// \brief Verify that DetachSSA is self consistent (IE definitions dominate + /// all uses, uses appear in the right places). This is used by unit tests. + void verifyDetachSSA() const; + + /// Used in various insertion functions to specify whether we are talking + /// about the beginning or end of a block. + enum InsertionPlace { Beginning, End }; + +protected: + // Used by Detach SSA annotater, dumpers, and wrapper pass + friend class DetachSSAAnnotatedWriter; + friend class DetachSSAPrinterLegacyPass; + + void verifyDefUses(Function &F) const; + void verifyDomination(Function &F) const; + void verifyOrdering(Function &F) const; + + AccessList *getWritableBlockAccesses(const BasicBlock *BB) const { + auto It = PerBlockAccesses.find(BB); + return It == PerBlockAccesses.end() ? nullptr : It->second.get(); + } + + DefsList *getWritableBlockDefs(const BasicBlock *BB) const { + auto It = PerBlockDefs.find(BB); + return It == PerBlockDefs.end() ? nullptr : It->second.get(); + } + + void moveTo(DetachUseOrDef *What, BasicBlock *BB, AccessList::iterator Where); + void moveTo(DetachUseOrDef *What, BasicBlock *BB, InsertionPlace Point); + // Rename the dominator tree branch rooted at BB. 
+ void renamePass(BasicBlock *BB, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited) { + renamePass(DT->getNode(BB), IncomingVal, Visited, true, true); + } + void removeFromLookups(DetachAccess *); + void removeFromLists(DetachAccess *, bool ShouldDelete = true); + void insertIntoListsForBlock(DetachAccess *, const BasicBlock *, + InsertionPlace); + void insertIntoListsBefore(DetachAccess *, const BasicBlock *, + AccessList::iterator); + // DetachUseOrDef *createDefinedAccess(Instruction *, DetachAccess *); + +private: + // class CachingWalker; + + // CachingWalker *getWalkerImpl(); + void buildDetachSSA(); + + void verifyUseInDefs(DetachAccess *, DetachAccess *) const; + using AccessMap = DenseMap>; + using DefsMap = DenseMap>; + + void + determineInsertionPoint(const SmallPtrSetImpl &DefiningBlocks); + void markUnreachableAsLiveOnEntry(BasicBlock *BB); + bool dominatesUse(const DetachAccess *, const DetachAccess *) const; + DetachPhi *createDetachPhi(BasicBlock *BB); + // DetachUseOrDef *createNewAccess(Instruction *); + DetachAccess *findDominatingDef(BasicBlock *, enum InsertionPlace); + void placePHINodes(const SmallPtrSetImpl &, + const DenseMap &); + DetachAccess *renameBlock(BasicBlock *, DetachAccess *, bool); + void renameSuccessorPhis(BasicBlock *, DetachAccess *, bool); + void renamePass(DomTreeNode *, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited = false, bool RenameAllUses = false); + AccessList *getOrCreateAccessList(const BasicBlock *); + DefsList *getOrCreateDefsList(const BasicBlock *); + void renumberBlock(const BasicBlock *) const; + DominatorTree *DT; + Function &F; + + // Detach SSA mappings + DenseMap ValueToDetachAccess; + // These two mappings contain the main block to access/def mappings for + // DetachSSA. The list contained in PerBlockAccesses really owns all the + // DetachAccesses. + // Both maps maintain the invariant that if a block is found in them, the + // corresponding list is not empty, and if a block is not found in them, the + // corresponding list is empty. + AccessMap PerBlockAccesses; + DefsMap PerBlockDefs; + std::unique_ptr LiveOnEntryDef; + + // Domination mappings + // Note that the numbering is local to a block, even though the map is + // global. + mutable SmallPtrSet BlockNumberingValid; + mutable DenseMap BlockNumbering; + + // Memory SSA building info + // std::unique_ptr Walker; + unsigned NextID; +}; + +// This pass does eager building and then printing of DetachSSA. It is used by +// the tests to be able to build, dump, and verify Detach SSA. +class DetachSSAPrinterLegacyPass : public FunctionPass { +public: + DetachSSAPrinterLegacyPass(); + + bool runOnFunction(Function &) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + + static char ID; +}; + +/// An analysis that produces \c DetachSSA for a function. +/// +class DetachSSAAnalysis : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + + static AnalysisKey Key; + +public: + // Wrap DetachSSA result to ensure address stability of internal DetachSSA + // pointers after construction. Use a wrapper class instead of plain + // unique_ptr to avoid build breakage on MSVC. + struct Result { + Result(std::unique_ptr &&DSSA) : DSSA(std::move(DSSA)) {} + DetachSSA &getDSSA() { return *DSSA.get(); } + + std::unique_ptr DSSA; + }; + + Result run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Printer pass for \c DetachSSA. 
+class DetachSSAPrinterPass : public PassInfoMixin { + raw_ostream &OS; + +public: + explicit DetachSSAPrinterPass(raw_ostream &OS) : OS(OS) {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Verifier pass for \c DetachSSA. +struct DetachSSAVerifierPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +/// \brief Legacy analysis pass which computes \c DetachSSA. +class DetachSSAWrapperPass : public FunctionPass { +public: + DetachSSAWrapperPass(); + + static char ID; + + bool runOnFunction(Function &) override; + void releaseMemory() override; + DetachSSA &getDSSA() { return *DSSA; } + const DetachSSA &getDSSA() const { return *DSSA; } + + void getAnalysisUsage(AnalysisUsage &AU) const override; + + void verifyAnalysis() const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + +private: + std::unique_ptr DSSA; +}; + +/// \brief Iterator base class used to implement const and non-const iterators +/// over the defining accesses of a DetachAccess. +template +class detachaccess_def_iterator_base + : public iterator_facade_base, + std::forward_iterator_tag, T, ptrdiff_t, T *, + T *> { + using BaseT = typename detachaccess_def_iterator_base::iterator_facade_base; + +public: + detachaccess_def_iterator_base(T *Start) : Access(Start) {} + detachaccess_def_iterator_base() = default; + + bool operator==(const detachaccess_def_iterator_base &Other) const { + return Access == Other.Access && (!Access || ArgNo == Other.ArgNo); + } + + // This is a bit ugly, but for DetachPHI's, unlike PHINodes, you can't get the + // block from the operand in constant time (In a PHINode, the uselist has + // both, so it's just subtraction). We provide it as part of the + // iterator to avoid callers having to linear walk to get the block. + // If the operation becomes constant time on DetachPHI's, this bit of + // abstraction breaking should be removed. + BasicBlock *getPhiArgBlock() const { + DetachPhi *DP = dyn_cast(Access); + assert(DP && "Tried to get phi arg block when not iterating over a PHI"); + return DP->getIncomingBlock(ArgNo); + } + typename BaseT::iterator::pointer operator*() const { + assert(Access && "Tried to access past the end of our iterator"); + // Go to the first argument for phis, and the defining access for everything + // else. + if (DetachPhi *DP = dyn_cast(Access)) + return DP->getIncomingValue(ArgNo); + return cast(Access)->getDefiningAccess(); + } + using BaseT::operator++; + detachaccess_def_iterator &operator++() { + assert(Access && "Hit end of iterator"); + if (DetachPhi *DP = dyn_cast(Access)) { + if (++ArgNo >= DP->getNumIncomingValues()) { + ArgNo = 0; + Access = nullptr; + } + } else { + Access = nullptr; + } + return *this; + } + +private: + T *Access = nullptr; + unsigned ArgNo = 0; +}; + +inline detachaccess_def_iterator DetachAccess::defs_begin() { + return detachaccess_def_iterator(this); +} + +inline const_detachaccess_def_iterator DetachAccess::defs_begin() const { + return const_detachaccess_def_iterator(this); +} + +inline detachaccess_def_iterator DetachAccess::defs_end() { + return detachaccess_def_iterator(); +} + +inline const_detachaccess_def_iterator DetachAccess::defs_end() const { + return const_detachaccess_def_iterator(); +} + +/// \brief GraphTraits for a DetachAccess, which walks defs in the normal case, +/// and uses in the inverse case. 
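To make the intended use of this interface concrete, here is a minimal client sketch; it is illustrative only and not part of the patch. It assumes the usual LLVM headers and a function F under analysis, and the pass name InspectDetachSSA is invented for the example.

struct InspectDetachSSA : PassInfoMixin<InspectDetachSSA> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
    // Retrieve DetachSSA through the new-pass-manager analysis declared above.
    DetachSSA &DSSA = AM.getResult<DetachSSAAnalysis>(F).getDSSA();
    for (BasicBlock &BB : F) {
      // At most one DetachPhi exists per block; list its incoming blocks.
      if (DetachPhi *Phi = DSSA.getDetachAccess(&BB))
        for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
          errs() << "detach phi edge from "
                 << Phi->getIncomingBlock(I)->getName() << "\n";
      for (Instruction &I : BB)
        if (DetachUseOrDef *Acc = DSSA.getDetachAccess(&I))
          // A use or def whose defining access is live-on-entry is not
          // reached by any detach in this function.
          if (DSSA.isLiveOnEntryDef(Acc->getDefiningAccess()))
            errs() << I << " sees no prior detach\n";
    }
    return PreservedAnalyses::all();
  }
};

The GraphTraits specializations that follow let the standard graph utilities (for example depth_first) walk chains of defining accesses in the same way.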
+template <> struct GraphTraits { + using NodeRef = DetachAccess *; + using ChildIteratorType = detachaccess_def_iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { return N->defs_begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->defs_end(); } +}; + +template <> struct GraphTraits> { + using NodeRef = DetachAccess *; + using ChildIteratorType = DetachAccess::iterator; + + static NodeRef getEntryNode(NodeRef N) { return N; } + static ChildIteratorType child_begin(NodeRef N) { return N->user_begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->user_end(); } +}; + +} // End namespace llvm + +#endif // LLVM_ANALYSIS_DETACHSSA_H diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h index f0d11e9c16894e..c53ca11aaae3ea 100644 --- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h +++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h @@ -534,6 +534,9 @@ enum FunctionCodes { // 54 is unused. FUNC_CODE_OPERAND_BUNDLE = 55, // OPERAND_BUNDLE: [tag#, value...] FUNC_CODE_INST_UNOP = 56, // UNOP: [opcode, ty, opval] + FUNC_CODE_INST_DETACH = 57, // DETACH: [bb#, bb#] + FUNC_CODE_INST_REATTACH = 58, // REATTACH + FUNC_CODE_INST_SYNC = 59, // SYNC: [bb#] }; enum UseListCodes { diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index d1770bf6e4cead..3c9c9d4f76be62 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -282,6 +282,12 @@ class IRTranslator : public MachineFunctionPass { bool translateIndirectBr(const User &U, MachineIRBuilder &MIRBuilder); + bool translateDetach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateReattach(const User &U, MachineIRBuilder &MIRBuilder); + + bool translateSync(const User &U, MachineIRBuilder &MIRBuilder); + bool translateExtractValue(const User &U, MachineIRBuilder &MIRBuilder); bool translateInsertValue(const User &U, MachineIRBuilder &MIRBuilder); diff --git a/llvm/include/llvm/IR/BasicBlock.h b/llvm/include/llvm/IR/BasicBlock.h index 99eac33f742ec2..bcab517887c790 100644 --- a/llvm/include/llvm/IR/BasicBlock.h +++ b/llvm/include/llvm/IR/BasicBlock.h @@ -382,6 +382,7 @@ class BasicBlock final : public Value, // Basic blocks are data objects also /// /// Also note that this doesn't preserve any passes. To split blocks while /// keeping loop information consistent, use the SplitBlock utility function. + BasicBlock *splitBasicBlockWithTerminator(const Twine &BBName = ""); BasicBlock *splitBasicBlock(iterator I, const Twine &BBName = ""); BasicBlock *splitBasicBlock(Instruction *I, const Twine &BBName = "") { return splitBasicBlock(I->getIterator(), BBName); diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h index 9526d6287d2f83..8c56973b517d6c 100644 --- a/llvm/include/llvm/IR/DerivedTypes.h +++ b/llvm/include/llvm/IR/DerivedTypes.h @@ -245,6 +245,10 @@ class StructType : public CompositeType { /// Create an empty structure type. static StructType *get(LLVMContext &Context, bool isPacked = false); + /// Try to lookup a structure type by name, and create one if one does not + /// exist. + static StructType *getOrCreate(LLVMContext &Context, StringRef Name); + /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. 
Note that this method always returns /// a non-packed struct, and requires at least one element type. diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index fac2ff46c4531a..d5746c86329fd4 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -979,6 +979,26 @@ class IRBuilder : public IRBuilderBase, public Inserter { return Insert(new UnreachableInst(Context)); } + /// \brief Create a detach instruction, 'detach within SyncRegion, Detached, + // Continue'. + DetachInst *CreateDetach(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, MDNode *BranchWeights = nullptr) { + return Insert(addBranchMetadata(DetachInst::Create(Detached, Continue, + SyncRegion), + BranchWeights, nullptr)); + } + + /// \brief Create a reattach instruction, 'reattach within SyncRegion, + /// DetachContinue'. + ReattachInst *CreateReattach(BasicBlock *DetachContinue, Value *SyncRegion) { + return Insert(ReattachInst::Create(DetachContinue, SyncRegion)); + } + + /// \brief Create a sync instruction, 'sync within SyncRegion, Continue'. + SyncInst *CreateSync(BasicBlock *Continue, Value *SyncRegion) { + return Insert(SyncInst::Create(Continue, SyncRegion)); + } + //===--------------------------------------------------------------------===// // Instruction creation methods: Binary Operators //===--------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h index c5b4c6f71d7d8e..f068b39f959215 100644 --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -166,6 +166,18 @@ class InstVisitor { // Specific Instruction type classes... note that all of the casts are // necessary because we use the instruction classes as opaque types... // + RetTy visitReturnInst(ReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitBranchInst(BranchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSwitchInst(SwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitIndirectBrInst(IndirectBrInst &I) { DELEGATE(TerminatorInst);} + RetTy visitResumeInst(ResumeInst &I) { DELEGATE(TerminatorInst);} + RetTy visitUnreachableInst(UnreachableInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCleanupReturnInst(CleanupReturnInst &I) { DELEGATE(TerminatorInst);} + RetTy visitCatchReturnInst(CatchReturnInst &I) { DELEGATE(TerminatorInst); } + RetTy visitCatchSwitchInst(CatchSwitchInst &I) { DELEGATE(TerminatorInst);} + RetTy visitDetachInst(DetachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitReattachInst(ReattachInst &I) { DELEGATE(TerminatorInst);} + RetTy visitSyncInst(SyncInst &I) { DELEGATE(TerminatorInst);} RetTy visitICmpInst(ICmpInst &I) { DELEGATE(CmpInst);} RetTy visitFCmpInst(FCmpInst &I) { DELEGATE(CmpInst);} RetTy visitAllocaInst(AllocaInst &I) { DELEGATE(UnaryInstruction);} diff --git a/llvm/include/llvm/IR/Instruction.def b/llvm/include/llvm/IR/Instruction.def index 58e4e2e1d6cc50..80297e4c7ab603 100644 --- a/llvm/include/llvm/IR/Instruction.def +++ b/llvm/include/llvm/IR/Instruction.def @@ -135,89 +135,92 @@ HANDLE_TERM_INST ( 7, Unreachable , UnreachableInst) HANDLE_TERM_INST ( 8, CleanupRet , CleanupReturnInst) HANDLE_TERM_INST ( 9, CatchRet , CatchReturnInst) HANDLE_TERM_INST (10, CatchSwitch , CatchSwitchInst) - LAST_TERM_INST (10) +HANDLE_TERM_INST (11, Detach , DetachInst) +HANDLE_TERM_INST (12, Reattach , ReattachInst) +HANDLE_TERM_INST (13, Sync , SyncInst) + LAST_TERM_INST (13) // Standard unary operators... 
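As a quick illustration of the InstVisitor hooks added above, a visitor can now dispatch on the Tapir terminators directly. This is an illustrative sketch, not part of the patch; the visitor name and counters are invented for the example.

// Count the Tapir terminators in a function.
struct TapirTermCounter : public InstVisitor<TapirTermCounter> {
  unsigned Detaches = 0, Reattaches = 0, Syncs = 0;
  void visitDetachInst(DetachInst &DI) { ++Detaches; }
  void visitReattachInst(ReattachInst &RI) { ++Reattaches; }
  void visitSyncInst(SyncInst &SI) { ++Syncs; }
};

// Usage:
//   TapirTermCounter Counter;
//   Counter.visit(F);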
- FIRST_UNARY_INST(11)
-HANDLE_UNARY_INST(11, FNeg , UnaryOperator)
- LAST_UNARY_INST(11)
+ FIRST_UNARY_INST(14)
+HANDLE_UNARY_INST(14, FNeg , UnaryOperator)
+ LAST_UNARY_INST(14)

// Standard binary operators...
- FIRST_BINARY_INST(12)
-HANDLE_BINARY_INST(12, Add , BinaryOperator)
-HANDLE_BINARY_INST(13, FAdd , BinaryOperator)
-HANDLE_BINARY_INST(14, Sub , BinaryOperator)
-HANDLE_BINARY_INST(15, FSub , BinaryOperator)
-HANDLE_BINARY_INST(16, Mul , BinaryOperator)
-HANDLE_BINARY_INST(17, FMul , BinaryOperator)
-HANDLE_BINARY_INST(18, UDiv , BinaryOperator)
-HANDLE_BINARY_INST(19, SDiv , BinaryOperator)
-HANDLE_BINARY_INST(20, FDiv , BinaryOperator)
-HANDLE_BINARY_INST(21, URem , BinaryOperator)
-HANDLE_BINARY_INST(22, SRem , BinaryOperator)
-HANDLE_BINARY_INST(23, FRem , BinaryOperator)
+ FIRST_BINARY_INST(15)
+HANDLE_BINARY_INST(15, Add , BinaryOperator)
+HANDLE_BINARY_INST(16, FAdd , BinaryOperator)
+HANDLE_BINARY_INST(17, Sub , BinaryOperator)
+HANDLE_BINARY_INST(18, FSub , BinaryOperator)
+HANDLE_BINARY_INST(19, Mul , BinaryOperator)
+HANDLE_BINARY_INST(20, FMul , BinaryOperator)
+HANDLE_BINARY_INST(21, UDiv , BinaryOperator)
+HANDLE_BINARY_INST(22, SDiv , BinaryOperator)
+HANDLE_BINARY_INST(23, FDiv , BinaryOperator)
+HANDLE_BINARY_INST(24, URem , BinaryOperator)
+HANDLE_BINARY_INST(25, SRem , BinaryOperator)
+HANDLE_BINARY_INST(26, FRem , BinaryOperator)

// Logical operators (integer operands)
-HANDLE_BINARY_INST(24, Shl , BinaryOperator) // Shift left (logical)
-HANDLE_BINARY_INST(25, LShr , BinaryOperator) // Shift right (logical)
-HANDLE_BINARY_INST(26, AShr , BinaryOperator) // Shift right (arithmetic)
-HANDLE_BINARY_INST(27, And , BinaryOperator)
-HANDLE_BINARY_INST(28, Or , BinaryOperator)
-HANDLE_BINARY_INST(29, Xor , BinaryOperator)
- LAST_BINARY_INST(29)
+HANDLE_BINARY_INST(27, Shl , BinaryOperator) // Shift left (logical)
+HANDLE_BINARY_INST(28, LShr , BinaryOperator) // Shift right (logical)
+HANDLE_BINARY_INST(29, AShr , BinaryOperator) // Shift right (arithmetic)
+HANDLE_BINARY_INST(30, And , BinaryOperator)
+HANDLE_BINARY_INST(31, Or , BinaryOperator)
+HANDLE_BINARY_INST(32, Xor , BinaryOperator)
+ LAST_BINARY_INST(32)

// Memory operators...
- FIRST_MEMORY_INST(30)
-HANDLE_MEMORY_INST(30, Alloca, AllocaInst) // Stack management
-HANDLE_MEMORY_INST(31, Load , LoadInst ) // Memory manipulation instrs
-HANDLE_MEMORY_INST(32, Store , StoreInst )
-HANDLE_MEMORY_INST(33, GetElementPtr, GetElementPtrInst)
-HANDLE_MEMORY_INST(34, Fence , FenceInst )
-HANDLE_MEMORY_INST(35, AtomicCmpXchg , AtomicCmpXchgInst )
-HANDLE_MEMORY_INST(36, AtomicRMW , AtomicRMWInst )
- LAST_MEMORY_INST(36)
+ FIRST_MEMORY_INST(33)
+HANDLE_MEMORY_INST(33, Alloca, AllocaInst) // Stack management
+HANDLE_MEMORY_INST(34, Load , LoadInst ) // Memory manipulation instrs
+HANDLE_MEMORY_INST(35, Store , StoreInst )
+HANDLE_MEMORY_INST(36, GetElementPtr, GetElementPtrInst)
+HANDLE_MEMORY_INST(37, Fence , FenceInst )
+HANDLE_MEMORY_INST(38, AtomicCmpXchg , AtomicCmpXchgInst )
+HANDLE_MEMORY_INST(39, AtomicRMW , AtomicRMWInst )
+ LAST_MEMORY_INST(39)

// Cast operators ...
// NOTE: The order matters here because CastInst::isEliminableCastPair
// NOTE: (see Instructions.cpp) encodes a table based on this ordering.
- FIRST_CAST_INST(37)
-HANDLE_CAST_INST(37, Trunc , TruncInst ) // Truncate integers
-HANDLE_CAST_INST(38, ZExt , ZExtInst ) // Zero extend integers
-HANDLE_CAST_INST(39, SExt , SExtInst ) // Sign extend integers
-HANDLE_CAST_INST(40, FPToUI , FPToUIInst ) // floating point -> UInt
-HANDLE_CAST_INST(41, FPToSI , FPToSIInst ) // floating point -> SInt
-HANDLE_CAST_INST(42, UIToFP , UIToFPInst ) // UInt -> floating point
-HANDLE_CAST_INST(43, SIToFP , SIToFPInst ) // SInt -> floating point
-HANDLE_CAST_INST(44, FPTrunc , FPTruncInst ) // Truncate floating point
-HANDLE_CAST_INST(45, FPExt , FPExtInst ) // Extend floating point
-HANDLE_CAST_INST(46, PtrToInt, PtrToIntInst) // Pointer -> Integer
-HANDLE_CAST_INST(47, IntToPtr, IntToPtrInst) // Integer -> Pointer
-HANDLE_CAST_INST(48, BitCast , BitCastInst ) // Type cast
-HANDLE_CAST_INST(49, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast
- LAST_CAST_INST(49)
-
- FIRST_FUNCLETPAD_INST(50)
-HANDLE_FUNCLETPAD_INST(50, CleanupPad, CleanupPadInst)
-HANDLE_FUNCLETPAD_INST(51, CatchPad , CatchPadInst)
- LAST_FUNCLETPAD_INST(51)
+ FIRST_CAST_INST(40)
+HANDLE_CAST_INST(40, Trunc , TruncInst ) // Truncate integers
+HANDLE_CAST_INST(41, ZExt , ZExtInst ) // Zero extend integers
+HANDLE_CAST_INST(42, SExt , SExtInst ) // Sign extend integers
+HANDLE_CAST_INST(43, FPToUI , FPToUIInst ) // floating point -> UInt
+HANDLE_CAST_INST(44, FPToSI , FPToSIInst ) // floating point -> SInt
+HANDLE_CAST_INST(45, UIToFP , UIToFPInst ) // UInt -> floating point
+HANDLE_CAST_INST(46, SIToFP , SIToFPInst ) // SInt -> floating point
+HANDLE_CAST_INST(47, FPTrunc , FPTruncInst ) // Truncate floating point
+HANDLE_CAST_INST(48, FPExt , FPExtInst ) // Extend floating point
+HANDLE_CAST_INST(49, PtrToInt, PtrToIntInst) // Pointer -> Integer
+HANDLE_CAST_INST(50, IntToPtr, IntToPtrInst) // Integer -> Pointer
+HANDLE_CAST_INST(51, BitCast , BitCastInst ) // Type cast
+HANDLE_CAST_INST(52, AddrSpaceCast, AddrSpaceCastInst) // addrspace cast
+ LAST_CAST_INST(52)
+
+ FIRST_FUNCLETPAD_INST(53)
+HANDLE_FUNCLETPAD_INST(53, CleanupPad, CleanupPadInst)
+HANDLE_FUNCLETPAD_INST(54, CatchPad , CatchPadInst)
+ LAST_FUNCLETPAD_INST(54)

// Other operators...
- FIRST_OTHER_INST(52)
-HANDLE_OTHER_INST(52, ICmp , ICmpInst ) // Integer comparison instruction
-HANDLE_OTHER_INST(53, FCmp , FCmpInst ) // Floating point comparison instr.
-HANDLE_OTHER_INST(54, PHI , PHINode ) // PHI node instruction
-HANDLE_OTHER_INST(55, Call , CallInst ) // Call a function
-HANDLE_OTHER_INST(56, Select , SelectInst ) // select instruction
-HANDLE_USER_INST (57, UserOp1, Instruction) // May be used internally in a pass
-HANDLE_USER_INST (58, UserOp2, Instruction) // Internal to passes only
-HANDLE_OTHER_INST(59, VAArg , VAArgInst ) // vaarg instruction
-HANDLE_OTHER_INST(60, ExtractElement, ExtractElementInst)// extract from vector
-HANDLE_OTHER_INST(61, InsertElement, InsertElementInst) // insert into vector
-HANDLE_OTHER_INST(62, ShuffleVector, ShuffleVectorInst) // shuffle two vectors.
-HANDLE_OTHER_INST(63, ExtractValue, ExtractValueInst)// extract from aggregate
-HANDLE_OTHER_INST(64, InsertValue, InsertValueInst) // insert into aggregate
-HANDLE_OTHER_INST(65, LandingPad, LandingPadInst) // Landing pad instruction.
- LAST_OTHER_INST(65)
+ FIRST_OTHER_INST(55)
+HANDLE_OTHER_INST(55, ICmp , ICmpInst ) // Integer comparison instruction
+HANDLE_OTHER_INST(56, FCmp , FCmpInst ) // Floating point comparison instr.
+HANDLE_OTHER_INST(57, PHI , PHINode ) // PHI node instruction
+HANDLE_OTHER_INST(58, Call , CallInst ) // Call a function
+HANDLE_OTHER_INST(59, Select , SelectInst ) // select instruction
+HANDLE_USER_INST (60, UserOp1, Instruction) // May be used internally in a pass
+HANDLE_USER_INST (61, UserOp2, Instruction) // Internal to passes only
+HANDLE_OTHER_INST(62, VAArg , VAArgInst ) // vaarg instruction
+HANDLE_OTHER_INST(63, ExtractElement, ExtractElementInst)// extract from vector
+HANDLE_OTHER_INST(64, InsertElement, InsertElementInst) // insert into vector
+HANDLE_OTHER_INST(65, ShuffleVector, ShuffleVectorInst) // shuffle two vectors.
+HANDLE_OTHER_INST(66, ExtractValue, ExtractValueInst)// extract from aggregate
+HANDLE_OTHER_INST(67, InsertValue, InsertValueInst) // insert into aggregate
+HANDLE_OTHER_INST(68, LandingPad, LandingPadInst) // Landing pad instruction.
+ LAST_OTHER_INST(68)

#undef FIRST_TERM_INST
#undef HANDLE_TERM_INST
diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h
index 5e78cb1edf02b1..f2f161730c7084 100644
--- a/llvm/include/llvm/IR/Instruction.h
+++ b/llvm/include/llvm/IR/Instruction.h
@@ -545,6 +545,7 @@ class Instruction : public User,
    // This list should be kept in sync with the list in mayWriteToMemory for
    // all opcodes which don't have a memory location.
    case Instruction::Fence:
+    case Instruction::Sync: // Like Instruction::Fence
    case Instruction::CatchPad:
    case Instruction::CatchRet:
    case Instruction::Call:
diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 0ff8f56f213ad0..5557cb4fdae59c 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -4403,6 +4403,255 @@ class UnreachableInst : public Instruction {
  }
};

+//===----------------------------------------------------------------------===//
+// DetachInst Class
+//===----------------------------------------------------------------------===//
+
+//===---------------------------------------------------------------------------
+/// DetachInst - Detach instruction
+///
+class DetachInst : public TerminatorInst {
+  /// Ops list - The operands are ordered: Detached, Continue.
+  DetachInst(const DetachInst &DI);
+  void AssertOK();
+  // DetachInst constructors (where {D, C} are blocks and SR is a token):
+  //  DetachInst(BB *D, BB *C, Value *SR)          - 'detach SR, D, C'
+  //  DetachInst(BB *D, BB *C, Value *SR, Inst *I)
+  //    - 'detach SR, D, C', insert before I
+  //  DetachInst(BB *D, BB *C, Value *SR, BB *I)
+  //    - 'detach SR, D, C', insert at end
+  DetachInst(BasicBlock *Detached, BasicBlock *Continue,
+             Value *SyncRegion,
+             Instruction *InsertBefore = nullptr);
+  DetachInst(BasicBlock *Detached, BasicBlock *Continue,
+             Value *SyncRegion,
+             BasicBlock *InsertAtEnd);
+protected:
+  // Note: Instruction needs to be a friend here to call cloneImpl.
+ friend class Instruction; + DetachInst *cloneImpl() const; + +public: + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertBefore); + } + static DetachInst *Create(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(3) DetachInst(Detached, Continue, SyncRegion, InsertAtEnd); + } + + /// Provide fast operand accessors + DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-3>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-3>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 2; } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for detach!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for detach!"); + *(&Op<-1>() - idx) = (Value*)NewSucc; + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return (I->getOpcode() == Instruction::Detach); + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + inline BasicBlock* getDetached() const { return getSuccessor(0); } + inline BasicBlock* getContinue() const { return getSuccessor(1); } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(DetachInst, Value) + +//===----------------------------------------------------------------------===// +// ReattachInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// ReattachInst - Reattach instruction. This instruction terminates +/// a subCFG and has no successors. The DetachContinue field +/// maintains the continue block after the detach instruction +/// corresponding to this reattach. +/// +class ReattachInst : public TerminatorInst { + ReattachInst(const ReattachInst &RI); + void AssertOK(); + // ReattachInst constructors (where C is a block and SR is a token): + // ReattachInst(BB *C, Value *SR) - 'reattach SR, C' + // ReattachInst(BB *C, Value *SR, Inst *I) - 'reattach SR, C', insert before I + // ReattachInst(BB *C, Value *SR, BB *I) - 'reattach SR, C', insert at end + explicit ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + ReattachInst *cloneImpl() const; + +public: + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertBefore); + } + + static ReattachInst *Create(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock *InsertAtEnd) { + return new(2) ReattachInst(DetachContinue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. 
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-2>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-2>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + BasicBlock *getDetachContinue() const { + return cast_or_null((&Op<-1>())->get()); + } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Reattach; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for reattach!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && + "Successor # out of range for reattach!"); + *(&Op<-1>() - idx) = NewSucc; + } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ReattachInst, Value) + +//===----------------------------------------------------------------------===// +// SyncInst Class +//===----------------------------------------------------------------------===// + +//===--------------------------------------------------------------------------- +/// SyncInst - Sync instruction. +/// +class SyncInst : public TerminatorInst { + /// Ops list - A sync looks like an unconditional branch to its continuation. + SyncInst(const SyncInst &SI); + void AssertOK(); + // SyncInst constructor (where C is a block and SR is a token): + // SyncInst(BB *C, Value *SR) - 'sync SR, C' + // SyncInst(BB *C, Value *SR, Inst *I) - 'sync SR, C' insert before I + // SyncInst(BB *C, Value *SR, BB *I) - 'sync SR, C' insert at end + explicit SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore = nullptr); + SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd); +protected: + // Note: Instruction needs to be a friend here to call cloneImpl. + friend class Instruction; + SyncInst *cloneImpl() const; + +public: + static SyncInst *Create(BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore = nullptr) { + return new(2) SyncInst(Continue, SyncRegion, InsertBefore); + } + static SyncInst *Create(BasicBlock *Continue, + Value *SyncRegion, BasicBlock *InsertAtEnd) { + return new(2) SyncInst(Continue, SyncRegion, InsertAtEnd); + } + + /// Transparently provide more efficient getOperand methods. 
+ DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + + Value *getSyncRegion() const { + return Op<-2>(); + } + + void setSyncRegion(Value *SyncRegion) { + Op<-2>() = SyncRegion; + } + + unsigned getNumSuccessors() const { return 1; } + + // Methods for support type inquiry through isa, cast, and dyn_cast: + static inline bool classof(const Instruction *I) { + return I->getOpcode() == Instruction::Sync; + } + static inline bool classof(const Value *V) { + return isa(V) && classof(cast(V)); + } + + BasicBlock *getSuccessor(unsigned i) const { + assert(i < getNumSuccessors() && "Successor # out of range for sync!"); + return cast_or_null((&Op<-1>() - i)->get()); + } + void setSuccessor(unsigned idx, BasicBlock *NewSucc) { + assert(idx < getNumSuccessors() && "Successor # out of range for sync!"); + *(&Op<-1>() - idx) = NewSucc; + } +private: + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); +}; + +template <> +struct OperandTraits : public VariadicOperandTraits { +}; + +DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SyncInst, Value) + //===----------------------------------------------------------------------===// // TruncInst Class //===----------------------------------------------------------------------===// diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 64603d8ea03091..0eedd5e98f83e9 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -952,6 +952,13 @@ def int_coro_subfn_addr : Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_i8_ty], [IntrReadMem, IntrArgMemOnly, ReadOnly<0>, NoCapture<0>]>; +///===-------------------------- Tapir Intrinsics --------------------------===// +// +def int_syncregion_start : Intrinsic<[llvm_token_ty], [], + [IntrArgMemOnly]>; + +def int_detached_rethrow : Intrinsic<[], [], [Throws]>; + ///===-------------------------- Other Intrinsics --------------------------===// // def int_flt_rounds : Intrinsic<[llvm_i32_ty]>, diff --git a/llvm/include/llvm/IR/Value.def b/llvm/include/llvm/IR/Value.def index e2ddba0aa1596e..22ca38793f5278 100644 --- a/llvm/include/llvm/IR/Value.def +++ b/llvm/include/llvm/IR/Value.def @@ -103,6 +103,10 @@ HANDLE_MEMORY_VALUE(MemoryUse) HANDLE_MEMORY_VALUE(MemoryDef) HANDLE_MEMORY_VALUE(MemoryPhi) +HANDLE_MEMORY_VALUE(DetachUse) +HANDLE_MEMORY_VALUE(DetachDef) +HANDLE_MEMORY_VALUE(DetachPhi) + HANDLE_INSTRUCTION(Instruction) // Enum values starting at InstructionVal are used for Instructions; // don't add new values here! diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 037c0dbb56ecec..3843050f205dfc 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -61,6 +61,9 @@ void initializeGlobalISel(PassRegistry&); /// Initialize all passes linked into the CodeGen library. void initializeTarget(PassRegistry&); +/// Initialize all passes linked into the TapirOpts library. 
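Putting the new terminators, the IRBuilder helpers, and the llvm.syncregion.start intrinsic together, a lowering pass might emit a spawn skeleton roughly as follows. This is an illustrative sketch under assumptions (the block names, the enclosing function F, and the entry block are invented), not part of the patch.

// Emit: detach into %det, reattach to %cont, then sync before %sync.cont.
void emitSpawnSkeleton(Function &F, BasicBlock *Entry) {
  LLVMContext &Ctx = F.getContext();
  BasicBlock *Detached = BasicBlock::Create(Ctx, "det", &F);
  BasicBlock *Continue = BasicBlock::Create(Ctx, "cont", &F);
  BasicBlock *SyncCont = BasicBlock::Create(Ctx, "sync.cont", &F);

  IRBuilder<> B(Entry);
  // All detaches, reattaches, and syncs of one logical region share the
  // token produced by llvm.syncregion.start.
  Value *SR = B.CreateCall(
      Intrinsic::getDeclaration(F.getParent(), Intrinsic::syncregion_start));
  B.CreateDetach(Detached, Continue, SR);

  B.SetInsertPoint(Detached);
  // ... spawned work would be emitted here ...
  B.CreateReattach(Continue, SR);

  B.SetInsertPoint(Continue);
  B.CreateSync(SyncCont, SR);
  // ... code after the sync continues in SyncCont ...
}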
+void initializeTapirOpts(PassRegistry&); + void initializeAAEvalLegacyPassPass(PassRegistry&); void initializeAAResultsWrapperPassPass(PassRegistry&); void initializeADCELegacyPassPass(PassRegistry&); @@ -100,7 +103,9 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCallGraphWrapperPassPass(PassRegistry&); void initializeCallSiteSplittingLegacyPassPass(PassRegistry&); void initializeCalledValuePropagationLegacyPassPass(PassRegistry &); +void initializeCilkSanitizerPass(PassRegistry&); void initializeCodeGenPreparePass(PassRegistry&); +void initializeComprehensiveStaticInstrumentationPass(PassRegistry&); void initializeConstantHoistingLegacyPassPass(PassRegistry&); void initializeConstantMergeLegacyPassPass(PassRegistry&); void initializeConstantPropagationPass(PassRegistry&); @@ -119,6 +124,8 @@ void initializeDelinearizationPass(PassRegistry&); void initializeDemandedBitsWrapperPassPass(PassRegistry&); void initializeDependenceAnalysisPass(PassRegistry&); void initializeDependenceAnalysisWrapperPassPass(PassRegistry&); +void initializeDetachSSAPrinterLegacyPassPass(PassRegistry&); +void initializeDetachSSAWrapperPassPass(PassRegistry&); void initializeDetectDeadLanesPass(PassRegistry&); void initializeDivRemPairsLegacyPassPass(PassRegistry&); void initializeDomOnlyPrinterPass(PassRegistry&); @@ -219,6 +226,7 @@ void initializeLoopDeletionLegacyPassPass(PassRegistry&); void initializeLoopDistributeLegacyPass(PassRegistry&); void initializeLoopExtractorPass(PassRegistry&); void initializeLoopGuardWideningLegacyPassPass(PassRegistry&); +void initializeLoopFusePass(PassRegistry&); void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&); void initializeLoopInfoWrapperPassPass(PassRegistry&); void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&); @@ -230,6 +238,7 @@ void initializeLoopRerollPass(PassRegistry&); void initializeLoopRotateLegacyPassPass(PassRegistry&); void initializeLoopSimplifyCFGLegacyPassPass(PassRegistry&); void initializeLoopSimplifyPass(PassRegistry&); +void initializeLoopSpawningPass(PassRegistry&); void initializeLoopStrengthReducePass(PassRegistry&); void initializeLoopUnrollAndJamPass(PassRegistry&); void initializeLoopUnrollPass(PassRegistry&); @@ -244,6 +253,7 @@ void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&); void initializeLowerIntrinsicsPass(PassRegistry&); void initializeLowerInvokeLegacyPassPass(PassRegistry&); void initializeLowerSwitchPass(PassRegistry&); +void initializeLowerTapirToCilkPass(PassRegistry&); void initializeLowerTypeTestsPass(PassRegistry&); void initializeMIRCanonicalizerPass(PassRegistry &); void initializeMIRPrintingPassPass(PassRegistry&); diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 0851c2f8d265bc..8564d42e5609d3 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -52,6 +52,7 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/Scalarizer.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" @@ -131,6 +132,7 @@ namespace { (void) llvm::createLoopPredicationPass(); (void) llvm::createLoopSimplifyPass(); (void) llvm::createLoopSimplifyCFGPass(); + (void) llvm::createLoopSpawningPass(); (void) llvm::createLoopStrengthReducePass(); (void) llvm::createLoopRerollPass(); (void) llvm::createLoopUnrollPass(); @@ -142,6 
+144,7 @@ namespace { (void) llvm::createLowerExpectIntrinsicPass(); (void) llvm::createLowerInvokePass(); (void) llvm::createLowerSwitchPass(); + (void) llvm::createLowerTapirToCilkPass(false,false); (void) llvm::createNaryReassociatePass(); (void) llvm::createObjCARCAAWrapperPass(); (void) llvm::createObjCARCAPElimPass(); @@ -221,6 +224,11 @@ namespace { (void) llvm::createEliminateAvailableExternallyPass(); (void) llvm::createScalarizeMaskedMemIntrinPass(); (void) llvm::createWarnMissedTransformationsPass(); + (void) llvm::createSmallBlockPass(); + (void) llvm::createRedundantSpawnPass(); + (void) llvm::createSpawnRestructurePass(); + (void) llvm::createSyncEliminationPass(); + (void) llvm::createSpawnUnswitchPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/llvm/include/llvm/Transforms/CSI.h b/llvm/include/llvm/Transforms/CSI.h new file mode 100644 index 00000000000000..a357324d013b3e --- /dev/null +++ b/llvm/include/llvm/Transforms/CSI.h @@ -0,0 +1,610 @@ +//===-- CSI.h ------------------------instrumentation hooks --*- C++ -*----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: License +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_CSI_H +#define LLVM_TRANSFORMS_CSI_H + +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" + +namespace llvm { + +static const char *const CsiRtUnitInitName = "__csirt_unit_init"; +static const char *const CsiRtUnitCtorName = "csirt.unit_ctor"; +static const char *const CsiFunctionBaseIdName = "__csi_unit_func_base_id"; +static const char *const CsiFunctionExitBaseIdName = "__csi_unit_func_exit_base_id"; +static const char *const CsiBasicBlockBaseIdName = "__csi_unit_bb_base_id"; +static const char *const CsiCallsiteBaseIdName = "__csi_unit_callsite_base_id"; +static const char *const CsiLoadBaseIdName = "__csi_unit_load_base_id"; +static const char *const CsiStoreBaseIdName = "__csi_unit_store_base_id"; +static const char *const CsiUnitFedTableName = "__csi_unit_fed_table"; +static const char *const CsiFuncIdVariablePrefix = "__csi_func_id_"; +static const char *const CsiUnitFedTableArrayName = "__csi_unit_fed_tables"; +static const char *const CsiInitCallsiteToFunctionName = + "__csi_init_callsite_to_function"; +static const char *const CsiDisableInstrumentationName = + "__csi_disable_instrumentation"; + +static const int64_t CsiCallsiteUnknownTargetId = -1; +// See llvm/tools/clang/lib/CodeGen/CodeGenModule.h: +static const int CsiUnitCtorPriority = 65535; + +/// Maintains a mapping from CSI ID to static data for that ID. +class ForensicTable { +public: + ForensicTable() : BaseId(nullptr), IdCounter(0) {} + ForensicTable(Module &M, StringRef BaseIdName); + + /// The number of entries in this forensic table + uint64_t size() const { return IdCounter; } + + /// Get the local ID of the given Value. + uint64_t getId(const Value *V); + + /// The GlobalVariable holding the base ID for this forensic table. + GlobalVariable *baseId() const { return BaseId; } + + /// Converts a local to global ID conversion. 
+ /// + /// This is done by using the given IRBuilder to insert a load to the base ID + /// global variable followed by an add of the base value and the local ID. + /// + /// \returns A Value holding the global ID corresponding to the + /// given local ID. + Value *localToGlobalId(uint64_t LocalId, IRBuilder<> &IRB) const; + +protected: + /// The GlobalVariable holding the base ID for this FED table. + GlobalVariable *BaseId; + /// Counter of local IDs used so far. + uint64_t IdCounter; + /// Map of Value to Local ID. + DenseMap ValueToLocalIdMap; +}; + +/// Maintains a mapping from CSI ID to front-end data for that ID. +/// +/// The front-end data currently is the source location that a given +/// CSI ID corresponds to. +class FrontEndDataTable : public ForensicTable { +public: + FrontEndDataTable() : ForensicTable() {} + FrontEndDataTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this FED table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given Function to this FED table. + /// \returns The local ID of the Function. + uint64_t add(const Function &F); + + /// Add the given BasicBlock to this FED table. + /// \returns The local ID of the BasicBlock. + uint64_t add(const BasicBlock &BB); + + /// Add the given Instruction to this FED table. + /// \returns The local ID of the Instruction. + uint64_t add(const Instruction &I); + + /// Get the Type for a pointer to a FED table entry. + /// + /// A FED table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this FED table into the given Module. + /// + /// The FED table is constructed as a ConstantArray indexed by local + /// IDs. The runtime is responsible for performing the mapping that + /// allows the table to be indexed by global ID. + Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + int32_t Column; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the debug information to the table, assigning it the next + /// available ID. + /// + /// \returns The local ID of the appended information. + /// @{ + void add(uint64_t ID, const DILocation *Loc); + void add(uint64_t ID, const DISubprogram *Subprog); + /// @} + + /// Append the line and file information to the table, assigning it + /// the next available ID. + /// + /// \returns The new local ID of the DILocation. + void add(uint64_t ID, int32_t Line = -1, int32_t Column = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +/// Represents a property value passed to hooks. +class CsiProperty { +public: + CsiProperty() {} + + /// Return the coerced type of a property. + /// + /// TODO: Right now, this function simply returns a 64-bit integer. Although + /// this solution works for x86_64, it should be generalized to handle other + /// architectures in the future. + static Type *getCoercedType(LLVMContext &C, StructType *Ty) { + // Must match the definition of property type in csi.h + // return StructType::get(IntegerType::get(C, 64), + // nullptr); + // We return an integer type, rather than a struct type, to deal with x86_64 + // type coercion on struct bit fields. 
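+    // For illustration only: with the property layouts defined below, a
+    // function property with MaySpawn = 1 is passed to its hook as the i64
+    // constant 1, and a load/store property with Alignment = 8 and IsOnStack
+    // = 1 packs to 8 | (1 << 10) = 1032, assuming the low-to-high bit-field
+    // layout these unions already rely on.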
+ return IntegerType::get(C, 64); + } + + /// Return a constant value holding this property. + virtual Constant *getValueImpl(LLVMContext &C) const = 0; + + Constant *getValue(IRBuilder<> &IRB) const { + return getValueImpl(IRB.getContext()); + } +}; + +class CsiFuncProperty : public CsiProperty { +public: + CsiFuncProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiFuncExitProperty : public CsiProperty { +public: + CsiFuncExitProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.MaySpawn), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the MightDetach property. + void setMaySpawn(bool v) { + PropValue.Fields.MaySpawn = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned MaySpawn : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int MaySpawn; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiBBProperty : public CsiProperty { +public: + CsiBBProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. 
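+  /// For example, the property parameter of the basic-block hooks can be
+  /// declared as
+  ///
+  ///   Type *PropertyTy = CsiBBProperty::getType(C); // currently an i64
+  ///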
+ static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsLandingPad), + IntegerType::get(C, PropBits.IsEHPad), + IntegerType::get(C, PropBits.Padding))); + } + + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get(StructTy, + // ConstantInt::get(IntegerType::get(C, 64), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsLandingPad property. + void setIsLandingPad(bool v) { + PropValue.Fields.IsLandingPad = v; + } + + /// Set the value of the IsEHPad property. + void setIsEHPad(bool v) { + PropValue.Fields.IsEHPad = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsLandingPad : 1; + unsigned IsEHPad : 1; + uint64_t Padding : 62; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsLandingPad; + int IsEHPad; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, 1, (64-1-1) }; +}; + +class CsiCallProperty : public CsiProperty { +public: + CsiCallProperty() { + PropValue.Bits = 0; + } + + /// Return the Type of a property. + static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.IsIndirect), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // StructType *StructTy = getType(C); + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.IsIndirect), + // PropValue.IsIndirect), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + // TODO: This solution works for x86, but should be generalized to support + // other architectures in the future. + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the IsIndirect property. + void setIsIndirect(bool v) { + PropValue.Fields.IsIndirect = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned IsIndirect : 1; + uint64_t Padding : 63; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int IsIndirect; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 1, (64-1) }; +}; + +class CsiLoadStoreProperty : public CsiProperty { +public: + CsiLoadStoreProperty() { + PropValue.Bits = 0; + } + /// Return the Type of a property. 
+ static Type *getType(LLVMContext &C) { + // Must match the definition of property type in csi.h + return CsiProperty::getCoercedType( + C, StructType::get(IntegerType::get(C, PropBits.Alignment), + IntegerType::get(C, PropBits.IsVtableAccess), + IntegerType::get(C, PropBits.IsConstant), + IntegerType::get(C, PropBits.IsOnStack), + IntegerType::get(C, PropBits.MayBeCaptured), + IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + IntegerType::get(C, PropBits.Padding))); + } + /// Return a constant value holding this property. + Constant *getValueImpl(LLVMContext &C) const override { + // Must match the definition of property type in csi.h + // return ConstantStruct::get( + // StructTy, + // ConstantInt::get(IntegerType::get(C, PropBits.Alignment), + // PropValue.Alignment), + // ConstantInt::get(IntegerType::get(C, PropBits.IsVtableAccess), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsConstant), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.IsOnStack), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.MayBeCaptured), + // PropValue.IsVtableAccess), + // ConstantInt::get(IntegerType::get(C, PropBits.LoadReadBeforeWriteInBB), + // PropValue.LoadReadBeforeWriteInBB), + // ConstantInt::get(IntegerType::get(C, PropBits.Padding), 0), + // nullptr); + return ConstantInt::get(getType(C), PropValue.Bits); + } + + /// Set the value of the Alignment property. + void setAlignment(char v) { + PropValue.Fields.Alignment = v; + } + /// Set the value of the IsVtableAccess property. + void setIsVtableAccess(bool v) { + PropValue.Fields.IsVtableAccess = v; + } + /// Set the value of the IsConstant property. + void setIsConstant(bool v) { + PropValue.Fields.IsConstant = v; + } + /// Set the value of the IsOnStack property. + void setIsOnStack(bool v) { + PropValue.Fields.IsOnStack = v; + } + /// Set the value of the MayBeCaptured property. + void setMayBeCaptured(bool v) { + PropValue.Fields.MayBeCaptured = v; + } + /// Set the value of the LoadReadBeforeWriteInBB property. + void setLoadReadBeforeWriteInBB(bool v) { + PropValue.Fields.LoadReadBeforeWriteInBB = v; + } + +private: + typedef union { + // Must match the definition of property type in csi.h + struct { + unsigned Alignment : 8; + unsigned IsVtableAccess : 1; + unsigned IsConstant : 1; + unsigned IsOnStack : 1; + unsigned MayBeCaptured : 1; + unsigned LoadReadBeforeWriteInBB : 1; + uint64_t Padding : 53; + } Fields; + uint64_t Bits; + } Property; + + /// The underlying values of the properties. + Property PropValue; + + typedef struct { + int Alignment; + int IsVtableAccess; + int IsConstant; + int IsOnStack; + int MayBeCaptured; + int LoadReadBeforeWriteInBB; + int Padding; + } PropertyBits; + + /// The number of bits representing each property. + static constexpr PropertyBits PropBits = { 8, 1, 1, 1, 1, 1, (64-8-1-1-1-1-1) }; +}; + +struct CSIImpl { +public: + CSIImpl(Module &M, CallGraph *CG, + const CSIOptions &Options = CSIOptions()) + : M(M), DL(M.getDataLayout()), CG(CG), Options(Options), + CsiFuncEntry(nullptr), CsiFuncExit(nullptr), CsiBBEntry(nullptr), + CsiBBExit(nullptr), CsiBeforeCallsite(nullptr), + CsiAfterCallsite(nullptr), CsiBeforeRead(nullptr), + CsiAfterRead(nullptr), CsiBeforeWrite(nullptr), CsiAfterWrite(nullptr), + MemmoveFn(nullptr), MemcpyFn(nullptr), MemsetFn(nullptr), + InitCallsiteToFunction(nullptr), RTUnitInit(nullptr) + {} + + bool run(); + + /// Get the number of bytes accessed via the given address. 
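+  /// A hypothetical use when instrumenting a store (SI and DL are
+  /// illustrative names for the store instruction and the module's
+  /// DataLayout):
+  ///
+  ///   int NumBytes = getNumBytesAccessed(SI->getPointerOperand(), DL);
+  ///   // e.g. 4 for a store of an i32 value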
+ static int getNumBytesAccessed(Value *Addr, const DataLayout &DL); + + /// Members to extract properties of loads/stores. + static bool isVtableAccess(Instruction *I); + static bool addrPointsToConstantData(Value *Addr); + static bool isAtomic(Instruction *I); + +protected: + /// Initialize the CSI pass. + void initializeCsi(); + /// Finalize the CSI pass. + void finalizeCsi(); + + /// Initialize llvm::Functions for the CSI hooks. + /// @{ + void initializeLoadStoreHooks(); + void initializeFuncHooks(); + void initializeBasicBlockHooks(); + void initializeCallsiteHooks(); + void initializeMemIntrinsicsHooks(); + /// @} + + static StructType *getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable); + /// Initialize the front-end data table structures. + void initializeFEDTables(); + /// Collect unit front-end data table structures for finalization. + void collectUnitFEDTables(); + + virtual CallInst *createRTUnitInitCall(IRBuilder<> &IRB); + + // Get the local ID of the given function. + uint64_t getLocalFunctionID(Function &F); + /// Generate a function that stores global function IDs into a set + /// of externally-visible global variables. + void generateInitCallsiteToFunction(); + + /// Compute CSI properties on the given ordered list of loads and stores. + void computeLoadAndStoreProperties( + SmallVectorImpl> + &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores, + const DataLayout &DL); + + /// Insert calls to the instrumentation hooks. + /// @{ + void addLoadStoreInstrumentation(Instruction *I, Function *BeforeFn, + Function *AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, + CsiLoadStoreProperty &Prop); + void instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop, + const DataLayout &DL); + void instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I); + void instrumentCallsite(Instruction *I); + void instrumentBasicBlock(BasicBlock &BB); + void instrumentFunction(Function &F); + /// @} + + /// Insert a conditional call to the given hook function before the + /// given instruction. The condition is based on the value of + /// __csi_disable_instrumentation. + void insertConditionalHookCall(Instruction *I, Function *HookFunction, + ArrayRef HookArgs); + + /// Return true if the given function should not be instrumented. 
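+  /// (This is assumed to exclude at least the CSI hooks and the generated
+  /// unit constructor, so the instrumentation never instruments itself.)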
+ bool shouldNotInstrumentFunction(Function &F); + + Module &M; + const DataLayout &DL; + CallGraph *CG; + CSIOptions Options; + + FrontEndDataTable FunctionFED, FunctionExitFED, BasicBlockFED, CallsiteFED, + LoadFED, StoreFED; + + SmallVector UnitFedTables; + + // Instrumentation hooks + Function *CsiFuncEntry, *CsiFuncExit; + Function *CsiBBEntry, *CsiBBExit; + Function *CsiBeforeCallsite, *CsiAfterCallsite; + Function *CsiBeforeRead, *CsiAfterRead; + Function *CsiBeforeWrite, *CsiAfterWrite; + + Function *MemmoveFn, *MemcpyFn, *MemsetFn; + Function *InitCallsiteToFunction; + // GlobalVariable *DisableInstrGV; + + // Runtime unit initialization + Function *RTUnitInit; + + Type *IntptrTy; + DenseMap FuncOffsetMap; +}; + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_CSI_H diff --git a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h index 276306f686ffac..34170aff4f44ff 100644 --- a/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/llvm/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -113,8 +113,15 @@ class PassManagerBuilder { /// passes at the end of the main CallGraphSCC passes and before any /// function simplification passes run by CGPassManager. EP_CGSCCOptimizerLate, + + /// EP_TapirLate - This extension point allows adding passes just before + /// Tapir instructions are lowered to calls into a parallel runtime system. + EP_TapirLate, }; + /// Whether the Cilk Calls should be instrumented + bool InstrumentCilk; + /// The Optimization Level - Specify the basic optimization level. /// 0 = -O0, 1 = -O1, 2 = -O2, 3 = -O3 unsigned OptLevel; @@ -123,6 +130,12 @@ class PassManagerBuilder { /// 0 = none, 1 = -Os, 2 = -Oz unsigned SizeLevel; + /// The Pre-lowering to parallel runtime calls optimization level + /// 0 = -P0 = leave with detach instructions, 1 = no optimizations before conversion, 2 = optimize before conversion + unsigned ParallelLevel; + + bool Rhino; + /// LibraryInfo - Specifies information about the runtime library for the /// optimizer. If this is non-null, it is added to both the function and /// per-module pass pipeline. @@ -189,6 +202,7 @@ class PassManagerBuilder { void addPGOInstrPasses(legacy::PassManagerBase &MPM); void addFunctionSimplificationPasses(legacy::PassManagerBase &MPM); void addInstructionCombiningPass(legacy::PassManagerBase &MPM) const; + void prepopulateModulePassManager(legacy::PassManagerBase &MPM); public: /// populateFunctionPassManager - This fills in the function pass manager, diff --git a/llvm/include/llvm/Transforms/Instrumentation.h b/llvm/include/llvm/Transforms/Instrumentation.h index 017cab0a7750df..78dca4e1ef0ffd 100644 --- a/llvm/include/llvm/Transforms/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Instrumentation.h @@ -203,7 +203,26 @@ struct SanitizerCoverageOptions { ModulePass *createSanitizerCoverageModulePass( const SanitizerCoverageOptions &Options = SanitizerCoverageOptions()); -/// Calculate what to divide by to scale counts. 
+// Insert CilkSanitizer (Cilk determinacy race detection) instrumentation +ModulePass *createCilkSanitizerPass(); + +// Options for comprehensive static instrumentation +struct CSIOptions { + bool InstrumentFuncEntryExit = true; + bool InstrumentBasicBlocks = true; + bool InstrumentMemoryAccesses = true; + bool InstrumentCalls = true; + bool InstrumentAtomics = true; + bool InstrumentMemIntrinsics = true; + + CSIOptions() = default; +}; + +// Insert ComprehensiveStaticInstrumentation instrumentation +ModulePass *createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options = CSIOptions()); + +/// \brief Calculate what to divide by to scale counts. /// /// Given the maximum count, calculate a divisor that will scale all the /// weights to strictly less than std::numeric_limits::max(). diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h index 8fcf9296ba47c6..1808ba38ae7e8a 100644 --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -451,6 +451,12 @@ FunctionPass *createNaryReassociatePass(); // FunctionPass *createLoopDistributePass(); +//===----------------------------------------------------------------------===// +// +// LoopFuse - Fuse loops. +// +FunctionPass *createLoopFusePass(); + //===----------------------------------------------------------------------===// // // LoopLoadElimination - Perform loop-aware load elimination. diff --git a/llvm/include/llvm/Transforms/Scalar/LoopFuse.h b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h new file mode 100644 index 00000000000000..5b7011e3b432a5 --- /dev/null +++ b/llvm/include/llvm/Transforms/Scalar/LoopFuse.h @@ -0,0 +1,130 @@ +//===------------- LoopFuse.h - Loop Fusion Utility -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/EquivalenceClasses.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopVersioning.h" +#include + +namespace llvm { +/// \brief The pass class. +class LoopFuse : public FunctionPass { + +public: + // Kind of fusion made. + enum Kind { + NO_FUSION = 0, // Fusion was not made even to check dependence legality. + // This is when loops had failed basic structure checks. + REVERTED_FUSION, // Fusion was reverted due to failed dependence legality. + PURE_FUSION, // Fusion succeeded with removal of original loops. + VERSIONED_FUSION // Fusion succeeded with versioning due to runtime checks. + }; + +private: + // Analyses used. + LoopInfo *LI; + LoopAccessLegacyAnalysis *LAA; + DominatorTree *DT; + ScalarEvolution *SE; + + // FusionSwitcher - Branch instruction that controls switching between + // original and fused versions. 
This gets initialized to true when loops are + // multiversioned to check fusion legality. By default, it points to original + // version. + BranchInst *FusionSwitcher; + + Loop *FusedLoop; + + // LAI for FusedLoop. + const LoopAccessInfo *LAI; + + // Kind of fusion that happened. + Kind FusionKind = NO_FUSION; + + // CustomVMap: VMap of BBs for fused loop. The problem about having + // ValueToValueMapTy passed from a client is that it gets updated when the + // loops are removed based on fusion success and this is undesirable. Also + // a ValueToValueMapTy is used when both Values are present. So, only a + // normal llvm::Value* is maintained as map's value in contrast with + // ValueToValueMapTy's WeakVH. Clients can use this mapping as a VMap. + typedef std::map CustomVMap; + CustomVMap VMap; + + // Rewrite IncomingBlocks in PHIs of @Br's successor blocks from Br's parent + // to @To. + void RewritePHI(BranchInst *Br, BasicBlock *To); + + // Fuse loops - @L1 and @L2 and return the fused loop. + Loop *FuseLoops(Loop &L1, Loop &L2); + + // Legality and profitability checks. + bool DependenceLegal(Loop &L1, Loop &L2); + bool DefsUsedAcrossLoops(Loop &L1, Loop &L2); + bool IsLegalAndProfitable(Loop &L1, Loop &L2); + + // Removal routines based on fusion success. + void RemoveLoopCompletelyWithPreheader(Loop &L); + void RemoveFusionSwitcher(Loop &L); + + // Outside use updates. + void UpdateUsesOutsideLoop(Loop &L); + void AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock); + +public: + LoopFuse() : FunctionPass(ID) { + initializeLoopFusePass(*PassRegistry::getPassRegistry()); + } + + // Initialization interface when this pass is used as a utility. + LoopFuse(LoopInfo *_LI, LoopAccessLegacyAnalysis *_LAA, DominatorTree *_DT, + ScalarEvolution *_SE) + : FunctionPass(ID), LI(_LI), LAA(_LAA), DT(_DT), SE(_SE) {} + + Loop *getFusedLoop() { return FusedLoop; } + + const CustomVMap &getVMap() { return VMap; } + + unsigned getFusionKind() { return FusionKind; } + + // Interface; when this pass is used as a utility. + bool run(Loop &L1, Loop &L2); + + bool runOnFunction(Function &F) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + + AU.addPreserved(); + AU.addPreserved(); + AU.addPreserved(); + } + + static char ID; +}; +} // anonymous namespace diff --git a/llvm/include/llvm/Transforms/Scalar/SROA.h b/llvm/include/llvm/Transforms/Scalar/SROA.h index b36c6f492be12a..fcd43fad841f27 100644 --- a/llvm/include/llvm/Transforms/Scalar/SROA.h +++ b/llvm/include/llvm/Transforms/Scalar/SROA.h @@ -64,6 +64,7 @@ class SROALegacyPass; /// this form. By doing so, it will enable promotion of vector aggregates to /// SSA vector values. class SROA : public PassInfoMixin { + bool FunctionContainsDetach = false; LLVMContext *C = nullptr; DominatorTree *DT = nullptr; AssumptionCache *AC = nullptr; diff --git a/llvm/include/llvm/Transforms/Tapir.h b/llvm/include/llvm/Transforms/Tapir.h new file mode 100644 index 00000000000000..96626c283bf40c --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir.h @@ -0,0 +1,68 @@ +//===-- Tapir.h - Tapir Transformations -------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the Tapir transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_H +#define LLVM_TRANSFORMS_TAPIR_H + +namespace llvm { +class Pass; +class ModulePass; +class FunctionPass; + +//===----------------------------------------------------------------------===// +// +// LoopSpawning - Create a loop spawning pass. +// +Pass *createLoopSpawningPass(); + +//===----------------------------------------------------------------------===// +// +// SmallBlock - Do SmallBlock Pass +// +FunctionPass *createSmallBlockPass(); + +//===----------------------------------------------------------------------===// +// +// SyncElimination - TODO +// +FunctionPass *createSyncEliminationPass(); + +//===----------------------------------------------------------------------===// +// +// RedundantSpawn - Do RedundantSpawn Pass +// +FunctionPass *createRedundantSpawnPass(); + +//===----------------------------------------------------------------------===// +// +// SpawnRestructure - Do SpawnRestructure Pass +// +FunctionPass *createSpawnRestructurePass(); + +//===----------------------------------------------------------------------===// +// +// SpawnUnswitch - Do SpawnUnswitch Pass +// +FunctionPass *createSpawnUnswitchPass(); + +//===----------------------------------------------------------------------===// +// +// LowerTapirToCilk - Lower Tapir instructions to calls into the Cilk runtime. +// +ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts = false, + bool Instrument = false); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/CilkABI.h b/llvm/include/llvm/Transforms/Tapir/CilkABI.h new file mode 100644 index 00000000000000..6c6bd7f4b21f51 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/CilkABI.h @@ -0,0 +1,368 @@ +//===- CilkABI.h - Interface to the Intel Cilk Plus runtime ----*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface used to lower Tapir instructions (detach, +// reattach, and sync) into calls to the Intel Cilk Plus runtime (cilkrts).
+// +//===----------------------------------------------------------------------===// +#ifndef CILK_ABI_H_ +#define CILK_ABI_H_ + +#include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/TypeBuilder.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Debug.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include + +extern llvm::cl::opt fastCilk; + +namespace { + +typedef void *__CILK_JUMP_BUFFER[5]; + +struct __cilkrts_pedigree {}; +struct __cilkrts_stack_frame {}; +struct __cilkrts_worker {}; +struct global_state_t {}; + +enum { + __CILKRTS_ABI_VERSION = 1 +}; + +enum { + CILK_FRAME_STOLEN = 0x01, + CILK_FRAME_UNSYNCHED = 0x02, + CILK_FRAME_DETACHED = 0x04, + CILK_FRAME_EXCEPTION_PROBED = 0x08, + CILK_FRAME_EXCEPTING = 0x10, + CILK_FRAME_LAST = 0x80, + CILK_FRAME_EXITING = 0x0100, + CILK_FRAME_SUSPENDED = 0x8000, + CILK_FRAME_UNWINDING = 0x10000 +}; + +#define CILK_FRAME_VERSION (__CILKRTS_ABI_VERSION << 24) +#define CILK_FRAME_VERSION_MASK 0xFF000000 +#define CILK_FRAME_FLAGS_MASK 0x00FFFFFF +#define CILK_FRAME_VERSION_VALUE(_flags) (((_flags) & CILK_FRAME_VERSION_MASK) >> 24) +#define CILK_FRAME_MBZ (~ (CILK_FRAME_STOLEN | \ + CILK_FRAME_UNSYNCHED | \ + CILK_FRAME_DETACHED | \ + CILK_FRAME_EXCEPTION_PROBED | \ + CILK_FRAME_EXCEPTING | \ + CILK_FRAME_LAST | \ + CILK_FRAME_EXITING | \ + CILK_FRAME_SUSPENDED | \ + CILK_FRAME_UNWINDING | \ + CILK_FRAME_VERSION_MASK)) + + +typedef uint32_t cilk32_t; +typedef uint64_t cilk64_t; +typedef void (*__cilk_abi_f32_t)(void *data, cilk32_t low, cilk32_t high); +typedef void (*__cilk_abi_f64_t)(void *data, cilk64_t low, cilk64_t high); + +typedef void (__cilkrts_init)(); + +typedef void (__cilkrts_enter_frame_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_enter_frame_fast_1)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_leave_frame)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_rethrow)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_sync)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_detach)(__cilkrts_stack_frame *sf); +typedef void (__cilkrts_pop_frame)(__cilkrts_stack_frame *sf); +typedef int (__cilkrts_get_nworkers)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker)(); +typedef __cilkrts_worker *(__cilkrts_get_tls_worker_fast)(); +typedef __cilkrts_worker *(__cilkrts_bind_thread_1)(); + +typedef void (cilk_func)(__cilkrts_stack_frame *); + +typedef void (cilk_enter_begin)(uint32_t, __cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_helper_begin)(__cilkrts_stack_frame *, void *, void *); +typedef void (cilk_enter_end)(__cilkrts_stack_frame *, void *); +typedef void (cilk_detach_begin)(__cilkrts_stack_frame *); +typedef void (cilk_detach_end)(); +typedef void (cilk_spawn_prepare)(__cilkrts_stack_frame *); +typedef void (cilk_spawn_or_continue)(int); +typedef void (cilk_sync_begin)(__cilkrts_stack_frame *); +typedef void (cilk_sync_end)(__cilkrts_stack_frame *); +typedef void 
(cilk_leave_begin)(__cilkrts_stack_frame *); +typedef void (cilk_leave_end)(); +typedef void (__cilkrts_cilk_for_32)(__cilk_abi_f32_t body, void *data, + cilk32_t count, int grain); +typedef void (__cilkrts_cilk_for_64)(__cilk_abi_f64_t body, void *data, + cilk64_t count, int grain); + +#define CILKRTS_FUNC(name, CGF) Get__cilkrts_##name(CGF) + +#define DEFAULT_GET_CILKRTS_FUNC(name) \ + static llvm::Function *Get__cilkrts_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "__cilkrts_"#name, \ + llvm::TypeBuilder<__cilkrts_##name, false>::get(M.getContext()) \ + )); \ + } + +//DEFAULT_GET_CILKRTS_FUNC(get_nworkers) +#pragma GCC diagnostic ignored "-Wunused-function" +static llvm::Function *Get__cilkrts_get_nworkers(llvm::Module& M) { + llvm::LLVMContext &C = M.getContext(); + llvm::AttributeList AL; + AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex, + llvm::Attribute::ReadNone); + // AL = AL.addAttribute(C, llvm::AttributeSet::FunctionIndex, + // llvm::Attribute::InaccessibleMemOnly); + AL = AL.addAttribute(C, llvm::AttributeList::FunctionIndex, + llvm::Attribute::NoUnwind); + llvm::Function *F = llvm::cast( + M.getOrInsertFunction( + "__cilkrts_get_nworkers", + llvm::TypeBuilder<__cilkrts_get_nworkers, false>::get(C), + AL)); + return F; +} + +// TODO: set up these CILKRTS and CILK_CSI functions in a cleaner +// way so we don't need these pragmas. +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(init) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(sync) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(rethrow) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(leave_frame) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(get_tls_worker) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(get_tls_worker_fast) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(bind_thread_1) + +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(cilk_for_32) +#pragma GCC diagnostic ignored "-Wunused-function" +DEFAULT_GET_CILKRTS_FUNC(cilk_for_64) + +#define CILK_CSI_FUNC(name, CGF) Get_cilk_##name(CGF) + +#define GET_CILK_CSI_FUNC(name) \ + static llvm::Function *Get_cilk_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "cilk_"#name, \ + llvm::TypeBuilder::get(M.getContext()) \ + )); \ + } + +#define GET_CILK_CSI_FUNC2(name) \ + static llvm::Function *Get_cilk_##name(llvm::Module& M) { \ + return llvm::cast(M.getOrInsertFunction( \ + "cilk_"#name, \ + llvm::TypeBuilder::get(M.getContext()) \ + )); \ + } + +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_helper_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(enter_end) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(detach_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(detach_end) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC2(spawn_prepare) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC2(spawn_or_continue) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(sync_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(sync_end) +#pragma GCC diagnostic ignored "-Wunused-function" 
+GET_CILK_CSI_FUNC(leave_begin) +#pragma GCC diagnostic ignored "-Wunused-function" +GET_CILK_CSI_FUNC(leave_end) + + typedef std::map TypeBuilderCache; + +} // namespace + +namespace llvm { + +/// Specializations of llvm::TypeBuilder for: +/// __cilkrts_pedigree, +/// __cilkrts_worker, +/// __cilkrts_stack_frame +template +class TypeBuilder<__cilkrts_pedigree, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + StructType *ExistingTy = StructType::getOrCreate(C, "struct.__cilkrts_pedigree"); + cache[&C] = ExistingTy; + StructType *NewTy = StructType::create(C); + NewTy->setBody( + TypeBuilder::get(C), // rank + TypeBuilder<__cilkrts_pedigree*, X>::get(C) // next + ); + if (ExistingTy->isOpaque()) + ExistingTy->setBody(NewTy->elements()); + else + assert(ExistingTy->isLayoutIdentical(NewTy) && + "Conflicting definition of tye struct.__cilkrts_pedigree"); + return ExistingTy; + } + enum { + rank, + next + }; +}; + +template +class TypeBuilder<__cilkrts_worker, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + // Try looking up this type by name. + StructType *Ty = StructType::getOrCreate(C, "struct.__cilkrts_worker"); + assert(Ty->isOpaque() && + "Conflicting definition of type struct.__cilkrts_worker."); + cache[&C] = Ty; + Ty->setBody( + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // tail + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // head + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // exc + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // protected_tail + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // ltq_limit + TypeBuilder::get(C), // self + TypeBuilder::get(C), // g + TypeBuilder::get(C), // l + TypeBuilder::get(C), // reducer_map + TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // current_stack_frame + TypeBuilder<__cilkrts_stack_frame**, X>::get(C), // saved_protected_tail + TypeBuilder::get(C), // sysdep + TypeBuilder<__cilkrts_pedigree, X>::get(C) // pedigree + ); + return Ty; + } + enum { + tail, + head, + exc, + protected_tail, + ltq_limit, + self, + g, + l, + reducer_map, + current_stack_frame, + saved_protected_tail, + sysdep, + pedigree + }; +}; + +template +class TypeBuilder<__cilkrts_stack_frame, X> { +public: + static StructType *get(LLVMContext &C) { + static TypeBuilderCache cache; + TypeBuilderCache::iterator I = cache.find(&C); + if (I != cache.end()) + return I->second; + StructType *Ty = StructType::create(C, "struct.__cilkrts_stack_frame"); + cache[&C] = Ty; + Ty->setBody( + TypeBuilder::get(C), // flags + TypeBuilder::get(C), // size + TypeBuilder<__cilkrts_stack_frame*, X>::get(C), // call_parent + TypeBuilder<__cilkrts_worker*, X>::get(C), // worker + TypeBuilder::get(C), // except_data + TypeBuilder<__CILK_JUMP_BUFFER, X>::get(C), // ctx + TypeBuilder::get(C), // mxcsr + TypeBuilder::get(C), // fpcsr + TypeBuilder::get(C), // reserved + TypeBuilder<__cilkrts_pedigree, X>::get(C) // parent_pedigree + ); + return Ty; + } + enum { + flags, + size, + call_parent, + worker, + except_data, + ctx, + mxcsr, + fpcsr, + reserved, + parent_pedigree + }; +}; + +} // namespace llvm + + +//////////////////////////////////////////////////////////////////////////////// + +namespace llvm { +namespace cilk { + +Value *GetOrCreateWorker8(Function &F); +void createSync(SyncInst &inst, 
ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument = false); + +bool verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + bool error = true); + +bool populateDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + SmallPtrSetImpl &functionPieces, + SmallVectorImpl &reattachB, + SmallPtrSetImpl &ExitBlocks, + bool replace, bool error = true); + +Function *extractDetachBodyToFunction(DetachInst &Detach, + DominatorTree &DT, AssumptionCache &AC, + CallInst **call = nullptr); + +Function *createDetach(DetachInst &Detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC, + bool instrument = false); + +} // end of cilk namespace +} // end of llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h new file mode 100644 index 00000000000000..df6718c99418c1 --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/LoopSpawning.h @@ -0,0 +1,37 @@ +//===---- LoopSpawning.h ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass modifies Tapir loops to spawn their iterations efficiently. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H +#define LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H + +#include "llvm/ADT/MapVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BasicAliasAnalysis.h" +#include "llvm/Analysis/LoopAccessAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" + +namespace llvm { + +/// The LoopSpawning Pass. +struct LoopSpawningPass : public PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} + +#endif // LLVM_TRANSFORMS_TAPIR_LOOPSPAWNING_H diff --git a/llvm/include/llvm/Transforms/Tapir/Outline.h b/llvm/include/llvm/Transforms/Tapir/Outline.h new file mode 100644 index 00000000000000..a11ef83007556d --- /dev/null +++ b/llvm/include/llvm/Transforms/Tapir/Outline.h @@ -0,0 +1,88 @@ +//===- llvm/Transforms/Tapir/Outline.h - Outlining for Tapir -*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines helper functions for outlining portions of code containing +// Tapir instructions. 
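+//
+// For example (a sketch; the names here are illustrative), a caller that
+// wants to know which values a detached task captures can do:
+//
+//   ValueSet Inputs, Outputs;
+//   findInputsOutputs(TaskBlocks, Inputs, Outputs, &ExitBlocks);
+//
+// Inputs then holds the values that an outlined helper must take as
+// arguments.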
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_TAPIR_OUTLINE_H +#define LLVM_TRANSFORMS_TAPIR_OUTLINE_H + +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/ValueMapper.h" + +namespace llvm { + +typedef SetVector ValueSet; + +/// Find the inputs and outputs for a function outlined from the gives set of +/// basic blocks. +void findInputsOutputs(const SmallPtrSetImpl &Blocks, + ValueSet &Inputs, + ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks = + nullptr); + +/// Clone Blocks into NewFunc, transforming the old arguments into references to +/// VMap values. +/// +/// TODO: Fix the std::vector part of the type of this function. +void CloneIntoFunction(Function *NewFunc, const Function *OldFunc, + std::vector Blocks, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks = nullptr, + DISubprogram *SP = nullptr, + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// TODO: Fix the std::vector part of the type of this function. +Function *CreateHelper(const ValueSet &Inputs, + const ValueSet &Outputs, + std::vector Blocks, + BasicBlock *Header, + const BasicBlock *OldEntry, + const BasicBlock *OldExit, + ValueToValueMapTy &VMap, + Module *DestM, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks = nullptr, + const Instruction *InputSyncRegion = nullptr, + ClonedCodeInfo *CodeInfo = nullptr, + ValueMapTypeRemapper *TypeMapper = nullptr, + ValueMaterializer *Materializer = nullptr); + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. 
+void AddAlignmentAssumptions(const Function *Caller, + const ValueSet &Inputs, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, + DominatorTree *DT); + +} // End llvm namespace + +#endif diff --git a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h index 5b16a2c0d0b1a3..4bc6bdc3378a27 100644 --- a/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h +++ b/llvm/include/llvm/Transforms/Utils/BasicBlockUtils.h @@ -97,6 +97,7 @@ struct CriticalEdgeSplittingOptions { bool MergeIdenticalEdges = false; bool DontDeleteUselessPHIs = false; bool PreserveLCSSA = false; + bool SplitDetachContinue = false; CriticalEdgeSplittingOptions(DominatorTree *DT = nullptr, LoopInfo *LI = nullptr, @@ -117,6 +118,11 @@ struct CriticalEdgeSplittingOptions { PreserveLCSSA = true; return *this; } + + CriticalEdgeSplittingOptions &setSplitDetachContinue() { + SplitDetachContinue = true; + return *this; + } }; /// If this edge is a critical edge, insert a new node to split the critical diff --git a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h index fee492be2a9023..5e33ba151fc592 100644 --- a/llvm/include/llvm/Transforms/Utils/ModuleUtils.h +++ b/llvm/include/llvm/Transforms/Utils/ModuleUtils.h @@ -40,6 +40,13 @@ void appendToGlobalCtors(Module &M, Function *F, int Priority, void appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data = nullptr); +// Validate the result of Module::getOrInsertFunction called for an +// interface function of ComprehensiveStaticInstrumentation. If the +// instrumented module defines a function with the same name, their +// prototypes must match, otherwise getOrInsertFunction returns a +// bitcast. +Function *checkCsiInterfaceFunction(Constant *FuncOrBitcast); + // Validate the result of Module::getOrInsertFunction called for an interface // function of given sanitizer. If the instrumented module defines a function // with the same name, their prototypes must match, otherwise diff --git a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h index 5ddfbe2bf05881..5342bd1c418123 100644 --- a/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h +++ b/llvm/include/llvm/Transforms/Utils/PromoteMemToReg.h @@ -30,6 +30,7 @@ class AssumptionCache; /// ever one layer of bitcasts or GEPs between the alloca and the lifetime /// markers. bool isAllocaPromotable(const AllocaInst *AI); +bool isAllocaParallelPromotable(const AllocaInst *AI, DominatorTree &DT); /// Promote the specified list of alloca instructions into scalar /// registers, inserting PHI nodes as appropriate. diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h index d02607acbbb579..355422e0e4b46f 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdater.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdater.h @@ -54,6 +54,9 @@ class SSAUpdater { /// the vector. SmallVectorImpl *InsertedPHIs; + /// This keeps track of which values are defined in detached blocks. + void *VID = nullptr; + public: /// If InsertedPHIs is specified, it will be filled /// in with all PHI Nodes created by rewriting. @@ -106,6 +109,8 @@ class SSAUpdater { /// merge the appropriate values, and this value isn't live out of the block. Value *GetValueInMiddleOfBlock(BasicBlock *BB); + bool GetValueIsDetachedInBlock(BasicBlock *BB); + /// Rewrite a use of the symbolic value. 
/// /// This handles PHI nodes, which use their value in the corresponding diff --git a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h index cab0f3e7157578..2b2d7a168ae729 100644 --- a/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h +++ b/llvm/include/llvm/Transforms/Utils/SSAUpdaterImpl.h @@ -66,6 +66,9 @@ class SSAUpdaterImpl { // Marker for existing PHIs that match. PhiT *PHITag = nullptr; + // Flag to indicate that the AvailableVal would be used after a Reattach. + bool DetachedUse = false; + BBInfo(BlkT *ThisBB, ValT V) : BB(ThisBB), AvailableVal(V), DefBB(V ? this : nullptr) {} }; @@ -76,6 +79,10 @@ class SSAUpdaterImpl { SmallVectorImpl *InsertedPHIs; + using ValIsDetachedTy = DenseMap; + + ValIsDetachedTy *ValIsDetached; + using BlockListTy = SmallVectorImpl; using BBMapTy = DenseMap; @@ -84,8 +91,9 @@ class SSAUpdaterImpl { public: explicit SSAUpdaterImpl(UpdaterT *U, AvailableValsTy *A, - SmallVectorImpl *Ins) : - Updater(U), AvailableVals(A), InsertedPHIs(Ins) {} + SmallVectorImpl *Ins, + ValIsDetachedTy *D = nullptr) : + Updater(U), AvailableVals(A), InsertedPHIs(Ins), ValIsDetached(D) {} /// GetValue - Check to see if AvailableVals has an entry for the specified /// BB and if so, return it. If not, construct SSA form by first @@ -350,6 +358,10 @@ class SSAUpdaterImpl { (*AvailableVals)[Info->BB] = PHI; } + // Set of blocks with detached values that would be used except + // for Reattach. + SmallVector DetachedValBlocks; + // Now go back through the worklist in reverse order to fill in the // arguments for any new PHIs added in the forward traversal. for (typename BlockListTy::reverse_iterator I = BlockList->rbegin(), @@ -368,14 +380,34 @@ class SSAUpdaterImpl { if (!PHI) continue; + // TODO: Change this so we do not assume that a block has at + // most one Detach and Reattach predecessor. + BBInfo *DetachPredInfo = nullptr; + BBInfo *ReattachPredInfo = nullptr; // Iterate through the block's predecessors. for (unsigned p = 0; p != Info->NumPreds; ++p) { BBInfo *PredInfo = Info->Preds[p]; BlkT *Pred = PredInfo->BB; + if (Traits::BlockReattaches(Pred, Updater)) { + ReattachPredInfo = PredInfo; + continue; + } // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; Traits::AddPHIOperand(PHI, PredInfo->AvailableVal, Pred); + if (Traits::BlockDetaches(Pred, Updater)) + DetachPredInfo = PredInfo; + } + if (ReattachPredInfo) { + assert(DetachPredInfo && + "Reattach predecessor found with no corresponding Detach predecessor."); + // Available value from predecessor through a reattach is the + // same as that for the corresponding detach. + Traits::AddPHIOperand(PHI, DetachPredInfo->AvailableVal, + ReattachPredInfo->BB); + if (DetachPredInfo->AvailableVal != ReattachPredInfo->AvailableVal) + DetachedValBlocks.push_back(Info); } LLVM_DEBUG(dbgs() << " Inserted PHI: " << *PHI << "\n"); @@ -383,6 +415,9 @@ class SSAUpdaterImpl { // If the client wants to know about all new instructions, tell it. if (InsertedPHIs) InsertedPHIs->push_back(PHI); } + + // Mark any definitions that are detached from their use. 
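+    // (A value that reaches its continuation only along a reattach edge was
+    // produced inside the spawned sub-CFG; recording that fact is what backs
+    // later queries through the ValIsDetached map.)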
+ MarkDetachedDefs(&DetachedValBlocks); } /// FindExistingPHI - Look through the PHI nodes in a block to see if any of @@ -416,7 +451,21 @@ for (typename Traits::PHI_iterator I = Traits::PHI_begin(PHI), E = Traits::PHI_end(PHI); I != E; ++I) { ValT IncomingVal = I.getIncomingValue(); - BBInfo *PredInfo = BBMap[I.getIncomingBlock()]; + BlkT *BB = I.getIncomingBlock(); + + // Replace a reattach predecessor with the corresponding + // detach predecessor. + // + // TODO: Remove the implicit assumption here that each basic + // block has at most one reattach predecessor. + if (Traits::BlockReattaches(BB, Updater)) + for (typename Traits::PHI_iterator PI = Traits::PHI_begin(PHI), + PE = Traits::PHI_end(PHI); PI != PE; ++PI) + if (Traits::BlockDetaches(PI.getIncomingBlock(), Updater)) { + BB = PI.getIncomingBlock(); + break; + } + BBInfo *PredInfo = BBMap[BB]; // Skip to the nearest preceding definition. if (PredInfo->DefBB != PredInfo) PredInfo = PredInfo->DefBB; @@ -459,6 +508,30 @@ BBMap[BB]->AvailableVal = PHIVal; } } + + /// MarkDetachedDefs - Mark all definitions that reach the basic + /// blocks in WorkList as having detached uses. + void MarkDetachedDefs(SmallVector *WorkList) { + BBInfo *Info; + while (!WorkList->empty()) { + Info = WorkList->pop_back_val(); + Info->DetachedUse = true; + + ValT AvailableVal = Info->AvailableVal; + if (!AvailableVal) + continue; + + if (ValIsDetached) + (*ValIsDetached)[Info->BB] = true; + + if (Traits::ValueIsPHI(AvailableVal, Updater) || + Info->DefBB != Info) + for (unsigned p = 0; p != Info->NumPreds; ++p) + if (!Info->Preds[p]->DetachedUse) + WorkList->push_back(Info->Preds[p]); + } + } + }; } // end namespace llvm diff --git a/llvm/include/llvm/Transforms/Utils/TapirUtils.h b/llvm/include/llvm/Transforms/Utils/TapirUtils.h new file mode 100644 index 00000000000000..f8e4e98850c237 --- /dev/null +++ b/llvm/include/llvm/Transforms/Utils/TapirUtils.h @@ -0,0 +1,53 @@ +//===-- TapirUtils.h - Utility methods for Tapir ---------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines utility methods for handling code containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H +#define LLVM_TRANSFORMS_UTILS_TAPIRUTILS_H + +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" + +namespace llvm { + +class BasicBlock; +class DetachInst; +class DominatorTree; +class TerminatorInst; + +/// Move static allocas in a block into the specified entry block. Leave +/// lifetime markers behind for those static allocas. Returns true if the +/// cloned block still contains dynamic allocas, which cannot be moved. +bool MoveStaticAllocasInBlock( + BasicBlock *Entry, BasicBlock *Block, + SmallVectorImpl &ExitPoints); + +/// Serialize the sub-CFG detached by the specified detach +/// instruction. Removes the detach instruction and returns a pointer +/// to the branch instruction that replaces it. +BranchInst* SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT = nullptr); + +/// Get the entry basic block to the detached context that contains +/// the specified block.
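+/// (The detached context of a block is the sub-CFG spawned by a detach and
+/// closed off by its reattaches.)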
+const BasicBlock *GetDetachedCtx(const BasicBlock *BB); +BasicBlock *GetDetachedCtx(BasicBlock *BB); + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum); + +} // End llvm namespace + +#endif diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 3446aef399381f..9d1efdfeddc68e 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -24,6 +24,8 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/BasicAliasAnalysis.h" #include "llvm/Analysis/CFLAndersAliasAnalysis.h" @@ -139,6 +141,42 @@ ModRefInfo AAResults::getModRefInfo(Instruction *I, const CallBase *Call2) { } else if (I->isFenceLike()) { // If this is a fence, just return ModRef. return ModRefInfo::ModRef; + } else if (auto D = dyn_cast(I)) { + ModRefInfo Result = ModRefInfo::NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getDetached()); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + for (Instruction &DI : *BB) { + // Fail fast if we encounter an invalid CFG. + assert(!(D == &DI) && + "Detached CFG reaches its own Detach instruction."); + + // Ignore sync instructions in this analysis + if (isa(DI) || isa(DI)) + continue; + + if (isa(DI) || isa(DI) || + isa(DI) || isa(DI) || + DI.isFenceLike() || ImmutableCallSite(&DI)) + Result = ModRefInfo(Result | getModRefInfo(&DI, Call)); + if (&DI == Call.getInstruction()) + return ModRefInfo::NoModRef; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getContinue()) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + return Result; } else { // Otherwise, check if the call modifies or references the // location this memory access defines. The best we can say @@ -540,7 +578,90 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW, return ModRefInfo::ModRef; } -/// Return information about whether a particular call site modifies +ModRefInfo AAResults::getModRefInfo(const DetachInst *D, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(D->getSuccessor(0)); + while (!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + // Ignore sync instructions in this analysis + if (isa(I)) + continue; + + // Fail fast if we encounter an invalid CFG. + assert(!(D == &*I) && + "Invalid CFG found: Detached CFG reaches its own Detach instruction."); + + if (!Loc.Ptr) + Result = ModRefInfo(Result | getModRefInfo(&*I)); + else + Result = ModRefInfo(Result | getModRefInfo(&*I, Loc)); + + // Early-exit the moment we reach the top of the lattice. 
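+      // (ModRef is the top of the mod/ref lattice: once the detached region
+      // is known to both read and write the location, visiting more of it
+      // cannot change the answer.)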
+ if (Result == MRI_ModRef) + return Result; + } + + // Add successors + const TerminatorInst *T = BB->getTerminator(); + if (!isa(T) || + T->getSuccessor(0) != D->getSuccessor(1)) + for (unsigned idx = 0, max = T->getNumSuccessors(); idx < max; ++idx) + WorkList.push_back(T->getSuccessor(idx)); + } + + return Result; +} + +ModRefInfo AAResults::getModRefInfo(const SyncInst *S, + const MemoryLocation &Loc) { + ModRefInfo Result = MRI_NoModRef; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(S->getParent()); + while(!WorkList.empty()) { + const BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + const TerminatorInst *T = BB->getTerminator(); + if (isa(T)) { + Result = ModRefInfo(Result | getModRefInfo(T, Loc)); + + // Early-exit the moment we reach the top of the lattice. + if (Result == MRI_ModRef) + return Result; + } + + // Add predecessors + for (const_pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + const BasicBlock *Pred = *PI; + const TerminatorInst *PT = Pred->getTerminator(); + // Ignore reattached predecessors and predecessors that end in + // syncs, because this sync does not wait on those predecessors. + if (isa(PT) || isa(PT)) + continue; + // If this block is detached, ignore the predecessor that + // detaches it. + if (const DetachInst *Det = dyn_cast(PT)) + if (Det->getDetached() == BB) + continue; + + WorkList.push_back(Pred); + } + } + + return Result; +} + +/// \brief Return information about whether a particular call site modifies /// or reads the specified memory location \p MemLoc before instruction \p I /// in a BasicBlock. An ordered basic block \p OBB can be used to speed up /// instruction-ordering queries inside the BasicBlock containing \p I. diff --git a/llvm/lib/Analysis/Analysis.cpp b/llvm/lib/Analysis/Analysis.cpp index bb8742123a0f08..be402b1990f75b 100644 --- a/llvm/lib/Analysis/Analysis.cpp +++ b/llvm/lib/Analysis/Analysis.cpp @@ -85,6 +85,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) { initializeLCSSAVerificationPassPass(Registry); initializeMemorySSAWrapperPassPass(Registry); initializeMemorySSAPrinterLegacyPassPass(Registry); + initializeDetachSSAWrapperPassPass(Registry); + initializeDetachSSAPrinterLegacyPassPass(Registry); } void LLVMInitializeAnalysis(LLVMPassRegistryRef R) { diff --git a/llvm/lib/Analysis/CMakeLists.txt b/llvm/lib/Analysis/CMakeLists.txt index c57d8ef69d69b7..1742260bb24e52 100644 --- a/llvm/lib/Analysis/CMakeLists.txt +++ b/llvm/lib/Analysis/CMakeLists.txt @@ -25,6 +25,7 @@ add_llvm_library(LLVMAnalysis Delinearization.cpp DemandedBits.cpp DependenceAnalysis.cpp + DetachSSA.cpp DivergenceAnalysis.cpp DomPrinter.cpp DominanceFrontier.cpp diff --git a/llvm/lib/Analysis/DetachSSA.cpp b/llvm/lib/Analysis/DetachSSA.cpp new file mode 100644 index 00000000000000..545280e5c3e930 --- /dev/null +++ b/llvm/lib/Analysis/DetachSSA.cpp @@ -0,0 +1,1082 @@ +//===-- DetachSSA.cpp - Detach SSA Builder---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------===// +// +// This file implements the DetachSSA class. 
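For reference, a hedged sketch of how a client might call the two overloads defined above; both wrapper functions are illustrative, only the getModRefInfo calls come from the patch.

#include "llvm/Analysis/AliasAnalysis.h"
using namespace llvm;

// May the work spawned by this detach read or write Loc?
ModRefInfo detachedBodyModRef(AAResults &AA, const DetachInst *DI,
                              const MemoryLocation &Loc) {
  // Walks the blocks between the detach and its continuation, as above.
  return AA.getModRefInfo(DI, Loc);
}

// May the tasks this sync waits on read or write Loc?
ModRefInfo syncedWorkModRef(AAResults &AA, const SyncInst *SI,
                            const MemoryLocation &Loc) {
  // Walks backwards from the sync over reachable detaches, as above.
  return AA.getModRefInfo(SI, Loc);
}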
+// +//===----------------------------------------------------------------===// +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/GraphTraits.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/Analysis/IteratedDominanceFrontier.h" +#include "llvm/IR/AssemblyAnnotationWriter.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" + +#define DEBUG_TYPE "detachssa" +using namespace llvm; +INITIALIZE_PASS_BEGIN(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(DetachSSAWrapperPass, "detachssa", "Detach SSA", false, + true) + +INITIALIZE_PASS_BEGIN(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) +INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +INITIALIZE_PASS_END(DetachSSAPrinterLegacyPass, "print-detachssa", + "Detach SSA Printer", false, false) + +static cl::opt + VerifyDetachSSA("verify-detachssa", cl::init(false), cl::Hidden, + cl::desc("Verify DetachSSA in legacy printer pass.")); + +namespace llvm { +/// \brief An assembly annotator class to print Detach SSA information in +/// comments. +class DetachSSAAnnotatedWriter : public AssemblyAnnotationWriter { + friend class DetachSSA; + const DetachSSA *DSSA; + +public: + DetachSSAAnnotatedWriter(const DetachSSA *D) : DSSA(D) {} + + virtual void emitBasicBlockStartAnnot(const BasicBlock *BB, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(BB)) + OS << "; " << *DA << "\n"; + } + + virtual void emitInstructionAnnot(const Instruction *I, + formatted_raw_ostream &OS) { + if (DetachAccess *DA = DSSA->getDetachAccess(I)) + OS << "; " << *DA << "\n"; + } +}; + +struct RenamePassData { + DomTreeNode *DTN; + DomTreeNode::const_iterator ChildIt; + DetachAccess *IncomingVal; + + RenamePassData(DomTreeNode *D, DomTreeNode::const_iterator It, + DetachAccess *M) + : DTN(D), ChildIt(It), IncomingVal(M) {} + void swap(RenamePassData &RHS) { + std::swap(DTN, RHS.DTN); + std::swap(ChildIt, RHS.ChildIt); + std::swap(IncomingVal, RHS.IncomingVal); + } +}; +} // anonymous namespace + +namespace llvm { + +void DetachSSA::renameSuccessorPhis(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + // Pass through values to our successors + for (const BasicBlock *S : successors(BB)) { + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast(&Accesses->front()); + if (RenameAllUses) { + int PhiIndex = Phi->getBasicBlockIndex(BB); + assert(PhiIndex != -1 && "Incomplete phi during partial rename"); + Phi->setIncomingValue(PhiIndex, IncomingVal); + } else + Phi->addIncoming(IncomingVal, BB); + } +} + +/// \brief Rename a single basic block into DetachSSA form. +/// Uses the standard SSA renaming algorithm. +/// \returns The new incoming value. +DetachAccess *DetachSSA::renameBlock(BasicBlock *BB, DetachAccess *IncomingVal, + bool RenameAllUses) { + auto It = PerBlockAccesses.find(BB); + // Skip most processing if the list is empty. 
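A small, hedged example of building DetachSSA by hand (outside the pass manager) using the constructor, print, and verify entry points defined in this file; the helper name and header choices are illustrative.

#include "llvm/Analysis/DetachSSA.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

void dumpDetachSSA(Function &F) {
  DominatorTree DT(F);
  DetachSSA DSSA(F, &DT); // runs buildDetachSSA()
  DSSA.print(errs());     // annotated IR, via DetachSSAAnnotatedWriter
#ifndef NDEBUG
  DSSA.verifyDetachSSA();
#endif
}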
+ if (It != PerBlockAccesses.end()) { + AccessList *Accesses = It->second.get(); + for (DetachAccess &L : *Accesses) { + if (DetachUseOrDef *DUD = dyn_cast(&L)) { + if (DUD->getDefiningAccess() == nullptr || RenameAllUses) + DUD->setDefiningAccess(IncomingVal); + if (isa(&L)) + IncomingVal = &L; + } else { + IncomingVal = &L; + } + } + } + return IncomingVal; +} + +/// \brief This is the standard SSA renaming algorithm. +/// +/// We walk the dominator tree in preorder, renaming accesses, and then filling +/// in phi nodes in our successors. +void DetachSSA::renamePass(DomTreeNode *Root, DetachAccess *IncomingVal, + SmallPtrSetImpl &Visited, + bool SkipVisited, bool RenameAllUses) { + SmallVector WorkStack; + // Skip everything if we already renamed this block and we are skipping. + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. + bool AlreadyVisited = !Visited.insert(Root->getBlock()).second; + if (SkipVisited && AlreadyVisited) + return; + + IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses); + renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses); + WorkStack.push_back({Root, Root->begin(), IncomingVal}); + + while (!WorkStack.empty()) { + DomTreeNode *Node = WorkStack.back().DTN; + DomTreeNode::const_iterator ChildIt = WorkStack.back().ChildIt; + IncomingVal = WorkStack.back().IncomingVal; + + if (ChildIt == Node->end()) { + WorkStack.pop_back(); + } else { + DomTreeNode *Child = *ChildIt; + ++WorkStack.back().ChildIt; + BasicBlock *BB = Child->getBlock(); + // Note: You can't sink this into the if, because we need it to occur + // regardless of whether we skip blocks or not. + AlreadyVisited = !Visited.insert(BB).second; + if (SkipVisited && AlreadyVisited) { + // We already visited this during our renaming, which can happen when + // being asked to rename multiple blocks. Figure out the incoming val, + // which is the last def. + // Incoming value can only change if there is a block def, and in that + // case, it's the last block def in the list. + if (auto *BlockDefs = getWritableBlockDefs(BB)) + IncomingVal = &*BlockDefs->rbegin(); + } else + IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses); + renameSuccessorPhis(BB, IncomingVal, RenameAllUses); + WorkStack.push_back({Child, Child->begin(), IncomingVal}); + } + } +} + +/// \brief This handles unreachable block accesses by deleting phi nodes in +/// unreachable blocks, and marking all other unreachable DetachAccess's as +/// being uses of the live on entry definition. +void DetachSSA::markUnreachableAsLiveOnEntry(BasicBlock *BB) { + assert(!DT->isReachableFromEntry(BB) && + "Reachable block found while handling unreachable blocks"); + + // Make sure phi nodes in our reachable successors end up with a + // LiveOnEntryDef for our incoming edge, even though our block is forward + // unreachable. We could just disconnect these blocks from the CFG fully, + // but we do not right now. 
+ for (const BasicBlock *S : successors(BB)) { + if (!DT->isReachableFromEntry(S)) + continue; + auto It = PerBlockAccesses.find(S); + // Rename the phi nodes in our successor block + if (It == PerBlockAccesses.end() || !isa(It->second->front())) + continue; + AccessList *Accesses = It->second.get(); + auto *Phi = cast(&Accesses->front()); + Phi->addIncoming(LiveOnEntryDef.get(), BB); + } + + auto It = PerBlockAccesses.find(BB); + if (It == PerBlockAccesses.end()) + return; + + auto &Accesses = It->second; + for (auto AI = Accesses->begin(), AE = Accesses->end(); AI != AE;) { + auto Next = std::next(AI); + // If we have a phi, just remove it. We are going to replace all + // users with live on entry. + if (auto *UseOrDef = dyn_cast(AI)) + UseOrDef->setDefiningAccess(LiveOnEntryDef.get()); + else + Accesses->erase(AI); + AI = Next; + } +} + +DetachSSA::DetachSSA(Function &Func, DominatorTree *DT) + : DT(DT), F(Func), + NextID(INVALID_DETACHACCESS_ID) { + buildDetachSSA(); +} + +DetachSSA::~DetachSSA() { + // Drop all our references + for (const auto &Pair : PerBlockAccesses) + for (DetachAccess &DA : *Pair.second) + DA.dropAllReferences(); +} + +DetachSSA::AccessList *DetachSSA::getOrCreateAccessList(const BasicBlock *BB) { + auto Res = PerBlockAccesses.insert(std::make_pair(BB, nullptr)); + + if (Res.second) + Res.first->second = make_unique(); + return Res.first->second.get(); +} +DetachSSA::DefsList *DetachSSA::getOrCreateDefsList(const BasicBlock *BB) { + auto Res = PerBlockDefs.insert(std::make_pair(BB, nullptr)); + + if (Res.second) + Res.first->second = make_unique(); + return Res.first->second.get(); +} + +// /// This class is a batch walker of all DetachUse's in the program, and points +// /// their defining access at the thing that actually clobbers them. Because it +// /// is a batch walker that touches everything, it does not operate like the +// /// other walkers. This walker is basically performing a top-down SSA renaming +// /// pass, where the version stack is used as the cache. This enables it to be +// /// significantly more time and detach efficient than using the regular walker, +// /// which is walking bottom-up. +// class DetachSSA::OptimizeUses { +// public: +// OptimizeUses(DetachSSA *DSSA, DetachSSAWalker *Walker, AliasAnalysis *AA, +// DominatorTree *DT) +// : DSSA(DSSA), Walker(Walker), AA(AA), DT(DT) { +// Walker = DSSA->getWalker(); +// } + +// void optimizeUses(); + +// private: +// /// This represents where a given detachlocation is in the stack. +// struct MemlocStackInfo { +// // This essentially is keeping track of versions of the stack. Whenever +// // the stack changes due to pushes or pops, these versions increase. +// unsigned long StackEpoch; +// unsigned long PopEpoch; +// // This is the lower bound of places on the stack to check. It is equal to +// // the place the last stack walk ended. +// // Note: Correctness depends on this being initialized to 0, which densemap +// // does +// unsigned long LowerBound; +// const BasicBlock *LowerBoundBlock; +// // This is where the last walk for this detach location ended. 
+// unsigned long LastKill; +// bool LastKillValid; +// }; +// void optimizeUsesInBlock(const BasicBlock *, unsigned long &, unsigned long &, +// SmallVectorImpl &, +// DenseMap &); +// DetachSSA *DSSA; +// DetachSSAWalker *Walker; +// AliasAnalysis *AA; +// DominatorTree *DT; +// }; + +// /// Optimize the uses in a given block This is basically the SSA renaming +// /// algorithm, with one caveat: We are able to use a single stack for all +// /// DetachUses. This is because the set of *possible* reaching DetachDefs is +// /// the same for every DetachUse. The *actual* clobbering DetachDef is just +// /// going to be some position in that stack of possible ones. +// /// +// /// We track the stack positions that each DetachLocation needs +// /// to check, and last ended at. This is because we only want to check the +// /// things that changed since last time. The same DetachLocation should +// /// get clobbered by the same store (getModRefInfo does not use invariantness or +// /// things like this, and if they start, we can modify DetachLocOrCall to +// /// include relevant data) +// void DetachSSA::OptimizeUses::optimizeUsesInBlock( +// const BasicBlock *BB, unsigned long &StackEpoch, unsigned long &PopEpoch, +// SmallVectorImpl &VersionStack, +// DenseMap &LocStackInfo) { + +// /// If no accesses, nothing to do. +// DetachSSA::AccessList *Accesses = DSSA->getWritableBlockAccesses(BB); +// if (Accesses == nullptr) +// return; + +// // Pop everything that doesn't dominate the current block off the stack, +// // increment the PopEpoch to account for this. +// while (true) { +// assert( +// !VersionStack.empty() && +// "Version stack should have liveOnEntry sentinel dominating everything"); +// BasicBlock *BackBlock = VersionStack.back()->getBlock(); +// if (DT->dominates(BackBlock, BB)) +// break; +// while (VersionStack.back()->getBlock() == BackBlock) +// VersionStack.pop_back(); +// ++PopEpoch; +// } + +// for (DetachAccess &DA : *Accesses) { +// auto *MU = dyn_cast(&DA); +// if (!MU) { +// VersionStack.push_back(&DA); +// ++StackEpoch; +// continue; +// } + +// if (isUseTriviallyOptimizableToLiveOnEntry(*AA, MU->getDetachInst())) { +// MU->setDefiningAccess(DSSA->getLiveOnEntryDef(), true); +// continue; +// } + +// DetachLocOrCall UseMLOC(MU); +// auto &LocInfo = LocStackInfo[UseMLOC]; +// // If the pop epoch changed, it means we've removed stuff from top of +// // stack due to changing blocks. We may have to reset the lower bound or +// // last kill info. +// if (LocInfo.PopEpoch != PopEpoch) { +// LocInfo.PopEpoch = PopEpoch; +// LocInfo.StackEpoch = StackEpoch; +// // If the lower bound was in something that no longer dominates us, we +// // have to reset it. +// // We can't simply track stack size, because the stack may have had +// // pushes/pops in the meantime. +// // XXX: This is non-optimal, but only is slower cases with heavily +// // branching dominator trees. To get the optimal number of queries would +// // be to make lowerbound and lastkill a per-loc stack, and pop it until +// // the top of that stack dominates us. This does not seem worth it ATM. +// // A much cheaper optimization would be to always explore the deepest +// // branch of the dominator tree first. This will guarantee this resets on +// // the smallest set of blocks. +// if (LocInfo.LowerBoundBlock && LocInfo.LowerBoundBlock != BB && +// !DT->dominates(LocInfo.LowerBoundBlock, BB)) { +// // Reset the lower bound of things to check. 
+// // TODO: Some day we should be able to reset to last kill, rather than +// // 0. +// LocInfo.LowerBound = 0; +// LocInfo.LowerBoundBlock = VersionStack[0]->getBlock(); +// LocInfo.LastKillValid = false; +// } +// } else if (LocInfo.StackEpoch != StackEpoch) { +// // If all that has changed is the StackEpoch, we only have to check the +// // new things on the stack, because we've checked everything before. In +// // this case, the lower bound of things to check remains the same. +// LocInfo.PopEpoch = PopEpoch; +// LocInfo.StackEpoch = StackEpoch; +// } +// if (!LocInfo.LastKillValid) { +// LocInfo.LastKill = VersionStack.size() - 1; +// LocInfo.LastKillValid = true; +// } + +// // At this point, we should have corrected last kill and LowerBound to be +// // in bounds. +// assert(LocInfo.LowerBound < VersionStack.size() && +// "Lower bound out of range"); +// assert(LocInfo.LastKill < VersionStack.size() && +// "Last kill info out of range"); +// // In any case, the new upper bound is the top of the stack. +// unsigned long UpperBound = VersionStack.size() - 1; + +// if (UpperBound - LocInfo.LowerBound > MaxCheckLimit) { +// DEBUG(dbgs() << "DetachSSA skipping optimization of " << *MU << " (" +// << *(MU->getDetachInst()) << ")" +// << " because there are " << UpperBound - LocInfo.LowerBound +// << " stores to disambiguate\n"); +// // Because we did not walk, LastKill is no longer valid, as this may +// // have been a kill. +// LocInfo.LastKillValid = false; +// continue; +// } +// bool FoundClobberResult = false; +// while (UpperBound > LocInfo.LowerBound) { +// if (isa(VersionStack[UpperBound])) { +// // For phis, use the walker, see where we ended up, go there +// Instruction *UseInst = MU->getDetachInst(); +// DetachAccess *Result = Walker->getClobberingDetachAccess(UseInst); +// // We are guaranteed to find it or something is wrong +// while (VersionStack[UpperBound] != Result) { +// assert(UpperBound != 0); +// --UpperBound; +// } +// FoundClobberResult = true; +// break; +// } + +// DetachDef *MD = cast(VersionStack[UpperBound]); +// // If the lifetime of the pointer ends at this instruction, it's live on +// // entry. +// if (!UseMLOC.IsCall && lifetimeEndsAt(MD, UseMLOC.getLoc(), *AA)) { +// // Reset UpperBound to liveOnEntryDef's place in the stack +// UpperBound = 0; +// FoundClobberResult = true; +// break; +// } +// if (instructionClobbersQuery(MD, MU, UseMLOC, *AA)) { +// FoundClobberResult = true; +// break; +// } +// --UpperBound; +// } +// // At the end of this loop, UpperBound is either a clobber, or lower bound +// // PHI walking may cause it to be < LowerBound, and in fact, < LastKill. +// if (FoundClobberResult || UpperBound < LocInfo.LastKill) { +// MU->setDefiningAccess(VersionStack[UpperBound], true); +// // We were last killed now by where we got to +// LocInfo.LastKill = UpperBound; +// } else { +// // Otherwise, we checked all the new ones, and now we know we can get to +// // LastKill. +// MU->setDefiningAccess(VersionStack[LocInfo.LastKill], true); +// } +// LocInfo.LowerBound = VersionStack.size() - 1; +// LocInfo.LowerBoundBlock = BB; +// } +// } + +// /// Optimize uses to point to their actual clobbering definitions. +// void DetachSSA::OptimizeUses::optimizeUses() { +// SmallVector VersionStack; +// DenseMap LocStackInfo; +// VersionStack.push_back(DSSA->getLiveOnEntryDef()); + +// unsigned long StackEpoch = 1; +// unsigned long PopEpoch = 1; +// // We perform a non-recursive top-down dominator tree walk. 
+// for (const auto *DomNode : depth_first(DT->getRootNode())) +// optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack, +// LocStackInfo); +// } + +void DetachSSA::placePHINodes( + const SmallPtrSetImpl &DefiningBlocks, + const DenseMap &BBNumbers) { + // Determine where our DetachPhi's should go + ForwardIDFCalculator IDFs(*DT); + IDFs.setDefiningBlocks(DefiningBlocks); + SmallVector IDFBlocks; + IDFs.calculate(IDFBlocks); + + std::sort(IDFBlocks.begin(), IDFBlocks.end(), + [&BBNumbers](const BasicBlock *A, const BasicBlock *B) { + return BBNumbers.lookup(A) < BBNumbers.lookup(B); + }); + + // Now place DetachPhi nodes. + for (auto &BB : IDFBlocks) + createDetachPhi(BB); +} + +void DetachSSA::buildDetachSSA() { + BasicBlock &StartingPoint = F.getEntryBlock(); + LiveOnEntryDef = make_unique(F.getContext(), nullptr, nullptr, + &StartingPoint, NextID++); + DenseMap BBNumbers; + unsigned NextBBNum = 0; + + // We maintain lists of detach accesses per block, trading memory for time. We + // could just look up the detach access for every possible instruction in the + // stream. + SmallPtrSet DefiningBlocks; + // Go through each block, figure out where defs occur, and chain together all + // the accesses. + for (BasicBlock &B : F) { + BBNumbers[&B] = NextBBNum++; + bool InsertIntoDef = false; + AccessList *Accesses = nullptr; + DefsList *Defs = nullptr; + if (isa(B.getTerminator()) || + isa(B.getTerminator())) { + DetachUseOrDef *DUD = new DetachDef(B.getContext(), nullptr, + B.getTerminator(), &B, + NextID++); + ValueToDetachAccess[B.getTerminator()] = DUD; + + if (!Accesses) + Accesses = getOrCreateAccessList(&B); + Accesses->push_back(DUD); + InsertIntoDef = true; + if (!Defs) + Defs = getOrCreateDefsList(&B); + Defs->push_back(*DUD); + } + if (InsertIntoDef) + DefiningBlocks.insert(&B); + } + placePHINodes(DefiningBlocks, BBNumbers); + + // Now do regular SSA renaming on the DetachDef/DetachUse. Visited will get + // filled in with all blocks. + SmallPtrSet Visited; + renamePass(DT->getRootNode(), LiveOnEntryDef.get(), Visited); + + // CachingWalker *Walker = getWalkerImpl(); + + // // We're doing a batch of updates; don't drop useful caches between them. + // Walker->setAutoResetWalker(false); + // OptimizeUses(this, Walker, AA, DT).optimizeUses(); + // Walker->setAutoResetWalker(true); + // Walker->resetClobberWalker(); + + // Mark the uses in unreachable blocks as live on entry, so that they go + // somewhere. + for (auto &BB : F) + if (!Visited.count(&BB)) + markUnreachableAsLiveOnEntry(&BB); +} + +// This is a helper function used by the creation routines. It places NewAccess +// into the access and defs lists for a given basic block, at the given +// insertion point. +void DetachSSA::insertIntoListsForBlock(DetachAccess *NewAccess, + const BasicBlock *BB, + InsertionPlace Point) { + auto *Accesses = getOrCreateAccessList(BB); + if (Point == Beginning) { + // If it's a phi node, it goes first, otherwise, it goes after any phi + // nodes. 
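buildDetachSSA above creates a DetachDef for every block terminated by a detach or reattach and joins them with DetachPhis on the iterated dominance frontier. The sketch below walks from a detach's def back toward the live-on-entry sentinel; it assumes getDetachAccess, getDefiningAccess, and isLiveOnEntryDef behave like their MemorySSA counterparts and is illustrative only.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

void walkDefsFrom(DetachSSA &DSSA, DetachInst *DI) {
  DetachAccess *DA = DSSA.getDetachAccess(DI);
  while (DA && !DSSA.isLiveOnEntryDef(DA)) {
    errs() << *DA << "\n";
    auto *DUD = dyn_cast<DetachUseOrDef>(DA);
    if (!DUD)
      break;                       // reached a DetachPhi; stop at the merge
    DA = DUD->getDefiningAccess(); // hop to the previous detach/reattach
  }
}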
+ if (isa(NewAccess)) { + Accesses->push_front(NewAccess); + auto *Defs = getOrCreateDefsList(BB); + Defs->push_front(*NewAccess); + } else { + auto AI = find_if_not( + *Accesses, [](const DetachAccess &DA) { return isa(DA); }); + Accesses->insert(AI, NewAccess); + if (!isa(NewAccess)) { + auto *Defs = getOrCreateDefsList(BB); + auto DI = find_if_not( + *Defs, [](const DetachAccess &DA) { return isa(DA); }); + Defs->insert(DI, *NewAccess); + } + } + } else { + Accesses->push_back(NewAccess); + if (!isa(NewAccess)) { + auto *Defs = getOrCreateDefsList(BB); + Defs->push_back(*NewAccess); + } + } + BlockNumberingValid.erase(BB); +} + +void DetachSSA::insertIntoListsBefore(DetachAccess *What, const BasicBlock *BB, + AccessList::iterator InsertPt) { + auto *Accesses = getWritableBlockAccesses(BB); + bool WasEnd = InsertPt == Accesses->end(); + Accesses->insert(AccessList::iterator(InsertPt), What); + if (!isa(What)) { + auto *Defs = getOrCreateDefsList(BB); + // If we got asked to insert at the end, we have an easy job, just shove it + // at the end. If we got asked to insert before an existing def, we also get + // an terator. If we got asked to insert before a use, we have to hunt for + // the next def. + if (WasEnd) { + Defs->push_back(*What); + } else if (isa(InsertPt)) { + Defs->insert(InsertPt->getDefsIterator(), *What); + } else { + while (InsertPt != Accesses->end() && !isa(InsertPt)) + ++InsertPt; + // Either we found a def, or we are inserting at the end + if (InsertPt == Accesses->end()) + Defs->push_back(*What); + else + Defs->insert(InsertPt->getDefsIterator(), *What); + } + } + BlockNumberingValid.erase(BB); +} + +// Move What before Where in the IR. The end result is that What will belong to +// the right lists and have the right Block set, but will not otherwise be +// correct. It will not have the right defining access, and if it is a def, +// things below it will not properly be updated. +void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB, + AccessList::iterator Where) { + // Keep it in the lookup tables, remove from the lists + removeFromLists(What, false); + What->setBlock(BB); + insertIntoListsBefore(What, BB, Where); +} + +void DetachSSA::moveTo(DetachUseOrDef *What, BasicBlock *BB, + InsertionPlace Point) { + removeFromLists(What, false); + What->setBlock(BB); + insertIntoListsForBlock(What, BB, Point); +} + +DetachPhi *DetachSSA::createDetachPhi(BasicBlock *BB) { + assert(!getDetachAccess(BB) && "DetachPhi already exists for this BB"); + DetachPhi *Phi = new DetachPhi(BB->getContext(), BB, NextID++); + // Phi's always are placed at the front of the block. 
+ insertIntoListsForBlock(Phi, BB, Beginning); + ValueToDetachAccess[BB] = Phi; + return Phi; +} + +// DetachUseOrDef *DetachSSA::createDefinedAccess(Instruction *I, +// DetachAccess *Definition) { +// assert(!isa(I) && "Cannot create a defined access for a PHI"); +// DetachUseOrDef *NewAccess = createNewAccess(I); +// assert( +// NewAccess != nullptr && +// "Tried to create a detach access for a non-detach touching instruction"); +// NewAccess->setDefiningAccess(Definition); +// return NewAccess; +// } + +// /// \brief Helper function to create new detach accesses +// DetachUseOrDef *DetachSSA::createNewAccess(Instruction *I) { +// bool Def = isa(I); +// bool Use = isa(I); + +// if (!Def && !Use) +// return nullptr; + +// DetachUseOrDef *DUD; +// if (Def) +// DUD = new DetachDef(I->getContext, nullptr, I, +// cast(I)->getContinue(), NextID++); +// else if (Use) +// DUD = new DetachUse(I->getContext, nullptr, I, I->getParent()); +// ValueToDetachAccess[I] = DUD; +// return DUD; +// } + +/// \brief Returns true if \p Replacer dominates \p Replacee . +bool DetachSSA::dominatesUse(const DetachAccess *Replacer, + const DetachAccess *Replacee) const { + if (isa(Replacee)) + return DT->dominates(Replacer->getBlock(), Replacee->getBlock()); + const auto *DP = cast(Replacee); + // For a phi node, the use occurs in the predecessor block of the phi node. + // Since we may occur multiple times in the phi node, we have to check each + // operand to ensure Replacer dominates each operand where Replacee occurs. + for (const Use &Arg : DP->operands()) { + if (Arg.get() != Replacee && + !DT->dominates(Replacer->getBlock(), DP->getIncomingBlock(Arg))) + return false; + } + return true; +} + +/// \brief Properly remove \p DA from all of DetachSSA's lookup tables. +void DetachSSA::removeFromLookups(DetachAccess *DA) { + assert(DA->use_empty() && + "Trying to remove detach access that still has uses"); + BlockNumbering.erase(DA); + if (DetachUseOrDef *MUD = dyn_cast(DA)) + MUD->setDefiningAccess(nullptr); + // // Invalidate our walker's cache if necessary + // if (!isa(DA)) + // Walker->invalidateInfo(DA); + // The call below to erase will destroy DA, so we can't change the order we + // are doing things here + Value *DAInst; + if (DetachUseOrDef *DUD = dyn_cast(DA)) { + DAInst = DUD->getDAInst(); + } else { + DAInst = DA->getBlock(); + } + auto VDA = ValueToDetachAccess.find(DAInst); + if (VDA->second == DA) + ValueToDetachAccess.erase(VDA); +} + +/// \brief Properly remove \p DA from all of DetachSSA's lists. +/// +/// Because of the way the intrusive list and use lists work, it is important to +/// do removal in the right order. +/// ShouldDelete defaults to true, and will cause the detach access to also be +/// deleted, not just removed. +void DetachSSA::removeFromLists(DetachAccess *DA, bool ShouldDelete) { + // The access list owns the reference, so we erase it from the non-owning list + // first. + if (!isa(DA)) { + auto DefsIt = PerBlockDefs.find(DA->getBlock()); + std::unique_ptr &Defs = DefsIt->second; + Defs->remove(*DA); + if (Defs->empty()) + PerBlockDefs.erase(DefsIt); + } + + // The erase call here will delete it. If we don't want it deleted, we call + // remove instead. 
+ auto AccessIt = PerBlockAccesses.find(DA->getBlock()); + std::unique_ptr &Accesses = AccessIt->second; + if (ShouldDelete) + Accesses->erase(DA); + else + Accesses->remove(DA); + + if (Accesses->empty()) + PerBlockAccesses.erase(AccessIt); +} + +void DetachSSA::print(raw_ostream &OS) const { + DetachSSAAnnotatedWriter Writer(this); + F.print(OS, &Writer); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void DetachSSA::dump() const { print(dbgs()); } +#endif + +void DetachSSA::verifyDetachSSA() const { + verifyDefUses(F); + verifyDomination(F); + verifyOrdering(F); + // Walker->verify(this); +} + +/// \brief Verify that the order and existence of DetachAccesses matches the +/// order and existence of detach affecting instructions. +void DetachSSA::verifyOrdering(Function &F) const { + // Walk all the blocks, comparing what the lookups think and what the access + // lists think, as well as the order in the blocks vs the order in the access + // lists. + SmallVector ActualAccesses; + SmallVector ActualDefs; + for (BasicBlock &B : F) { + const AccessList *AL = getBlockAccesses(&B); + const auto *DL = getBlockDefs(&B); + DetachAccess *Phi = getDetachAccess(&B); + if (Phi) { + ActualAccesses.push_back(Phi); + ActualDefs.push_back(Phi); + } + + for (Instruction &I : B) { + DetachAccess *DA = getDetachAccess(&I); + assert((!DA || (AL && (isa(DA) || DL))) && + "We have detach affecting instructions " + "in this block but they are not in the " + "access list or defs list"); + if (DA) { + ActualAccesses.push_back(DA); + if (isa(DA)) + ActualDefs.push_back(DA); + } + } + // Either we hit the assert, really have no accesses, or we have both + // accesses and an access list. + // Same with defs. + if (!AL && !DL) + continue; + assert(AL->size() == ActualAccesses.size() && + "We don't have the same number of accesses in the block as on the " + "access list"); + assert((DL || ActualDefs.size() == 0) && + "Either we should have a defs list, or we should have no defs"); + assert((!DL || DL->size() == ActualDefs.size()) && + "We don't have the same number of defs in the block as on the " + "def list"); + auto ALI = AL->begin(); + auto AAI = ActualAccesses.begin(); + while (ALI != AL->end() && AAI != ActualAccesses.end()) { + assert(&*ALI == *AAI && "Not the same accesses in the same order"); + ++ALI; + ++AAI; + } + ActualAccesses.clear(); + if (DL) { + auto DLI = DL->begin(); + auto ADI = ActualDefs.begin(); + while (DLI != DL->end() && ADI != ActualDefs.end()) { + assert(&*DLI == *ADI && "Not the same defs in the same order"); + ++DLI; + ++ADI; + } + } + ActualDefs.clear(); + } +} + +/// \brief Verify the domination properties of DetachSSA by checking that each +/// definition dominates all of its uses. +void DetachSSA::verifyDomination(Function &F) const { +#ifndef NDEBUG + for (BasicBlock &B : F) { + // Phi nodes are attached to basic blocks + if (DetachPhi *DP = getDetachAccess(&B)) + for (const Use &U : DP->uses()) + assert(dominates(DP, U) && "Detach PHI does not dominate it's uses"); + + for (Instruction &I : B) { + DetachAccess *MD = dyn_cast_or_null(getDetachAccess(&I)); + if (!MD) + continue; + + for (const Use &U : MD->uses()) + assert(dominates(MD, U) && "Detach Def does not dominate it's uses"); + } + } +#endif +} + +/// \brief Verify the def-use lists in DetachSSA, by verifying that \p Use +/// appears in the use list of \p Def. 
+ +void DetachSSA::verifyUseInDefs(DetachAccess *Def, DetachAccess *Use) const { +#ifndef NDEBUG + if (!Def) + assert(isLiveOnEntryDef(Use) && + "Null def but use not point to live on entry def"); + else + assert(is_contained(Def->users(), Use) && + "Did not find use in def's use list"); +#endif +} + +/// \brief Verify the immediate use information, by walking all the detach +/// accesses and verifying that, for each use, it appears in the +/// appropriate def's use list +void DetachSSA::verifyDefUses(Function &F) const { + for (BasicBlock &B : F) { + // Phi nodes are attached to basic blocks + if (DetachPhi *Phi = getDetachAccess(&B)) { + assert(Phi->getNumOperands() == static_cast(std::distance( + pred_begin(&B), pred_end(&B))) && + "Incomplete DetachPhi Node"); + for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I) + verifyUseInDefs(Phi->getIncomingValue(I), Phi); + } + + for (Instruction &I : B) { + if (DetachUseOrDef *DA = getDetachAccess(&I)) { + verifyUseInDefs(DA->getDefiningAccess(), DA); + } + } + } +} + +DetachUseOrDef *DetachSSA::getDetachAccess(const Instruction *I) const { + return cast_or_null(ValueToDetachAccess.lookup(I)); +} + +DetachPhi *DetachSSA::getDetachAccess(const BasicBlock *BB) const { + return cast_or_null(ValueToDetachAccess.lookup(cast(BB))); +} + +/// Perform a local numbering on blocks so that instruction ordering can be +/// determined in constant time. +/// TODO: We currently just number in order. If we numbered by N, we could +/// allow at least N-1 sequences of insertBefore or insertAfter (and at least +/// log2(N) sequences of mixed before and after) without needing to invalidate +/// the numbering. +void DetachSSA::renumberBlock(const BasicBlock *B) const { + // The pre-increment ensures the numbers really start at 1. + unsigned long CurrentNumber = 0; + const AccessList *AL = getBlockAccesses(B); + assert(AL != nullptr && "Asking to renumber an empty block"); + for (const auto &I : *AL) + BlockNumbering[&I] = ++CurrentNumber; + BlockNumberingValid.insert(B); +} + +/// \brief Determine, for two detach accesses in the same block, +/// whether \p Dominator dominates \p Dominatee. +/// \returns True if \p Dominator dominates \p Dominatee. +bool DetachSSA::locallyDominates(const DetachAccess *Dominator, + const DetachAccess *Dominatee) const { + + const BasicBlock *DominatorBlock = Dominator->getBlock(); + + assert((DominatorBlock == Dominatee->getBlock()) && + "Asking for local domination when accesses are in different blocks!"); + // A node dominates itself. + if (Dominatee == Dominator) + return true; + + // When Dominatee is defined on function entry, it is not dominated by another + // detach access. + if (isLiveOnEntryDef(Dominatee)) + return false; + + // When Dominator is defined on function entry, it dominates the other detach + // access. 
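A hedged sketch of inspecting the DetachPhi that merges detach state at a join block. getDetachAccess(const BasicBlock*) and getIncomingValue come from this file; the indexed getIncomingBlock(unsigned) accessor is assumed to mirror MemoryPhi and may differ.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

void printPhiIncoming(DetachSSA &DSSA, BasicBlock *Join) {
  if (DetachPhi *Phi = DSSA.getDetachAccess(Join))
    for (unsigned I = 0, E = Phi->getNumIncomingValues(); I != E; ++I)
      errs() << Phi->getIncomingBlock(I)->getName() << " -> "
             << *Phi->getIncomingValue(I) << "\n";
}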
+ if (isLiveOnEntryDef(Dominator)) + return true; + + if (!BlockNumberingValid.count(DominatorBlock)) + renumberBlock(DominatorBlock); + + unsigned long DominatorNum = BlockNumbering.lookup(Dominator); + // All numbers start with 1 + assert(DominatorNum != 0 && "Block was not numbered properly"); + unsigned long DominateeNum = BlockNumbering.lookup(Dominatee); + assert(DominateeNum != 0 && "Block was not numbered properly"); + return DominatorNum < DominateeNum; +} + +bool DetachSSA::dominates(const DetachAccess *Dominator, + const DetachAccess *Dominatee) const { + if (Dominator == Dominatee) + return true; + + if (isLiveOnEntryDef(Dominatee)) + return false; + + if (Dominator->getBlock() != Dominatee->getBlock()) + return DT->dominates(Dominator->getBlock(), Dominatee->getBlock()); + return locallyDominates(Dominator, Dominatee); +} + +bool DetachSSA::dominates(const DetachAccess *Dominator, + const Use &Dominatee) const { + if (DetachPhi *DP = dyn_cast(Dominatee.getUser())) { + BasicBlock *UseBB = DP->getIncomingBlock(Dominatee); + // The def must dominate the incoming block of the phi. + if (UseBB != Dominator->getBlock()) + return DT->dominates(Dominator->getBlock(), UseBB); + // If the UseBB and the DefBB are the same, compare locally. + return locallyDominates(Dominator, cast(Dominatee)); + } + // If it's not a PHI node use, the normal dominates can already handle it. + return dominates(Dominator, cast(Dominatee.getUser())); +} + +void DetachAccess::print(raw_ostream &OS) const { + switch (getValueID()) { + case DetachPhiVal: return static_cast(this)->print(OS); + case DetachDefVal: return static_cast(this)->print(OS); + case DetachUseVal: return static_cast(this)->print(OS); + } + llvm_unreachable("invalid value id"); +} + +void DetachDef::print(raw_ostream &OS) const { + DetachAccess *UO = getDefiningAccess(); + + OS << getID() << " = DetachDef("; + if (UO && UO->getID()) + OS << UO->getID(); + OS << ')'; +} + +void DetachPhi::print(raw_ostream &OS) const { + bool First = true; + OS << getID() << " = DetachPhi("; + for (const auto &Op : operands()) { + BasicBlock *BB = getIncomingBlock(Op); + DetachAccess *DA = cast(Op); + if (!First) + OS << ','; + else + First = false; + + OS << '{'; + if (BB->hasName()) + OS << BB->getName(); + else + BB->printAsOperand(OS, false); + OS << ','; + if (unsigned ID = DA->getID()) + OS << ID; + OS << '}'; + } + OS << ')'; +} + +void DetachUse::print(raw_ostream &OS) const { + DetachAccess *UO = getDefiningAccess(); + OS << "DetachUse("; + if (UO && UO->getID()) + OS << UO->getID(); + OS << ')'; +} + +void DetachAccess::dump() const { +// Cannot completely remove virtual function even in release mode. 
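The dominance queries above admit a simple "must execute before" check between two detach-relevant instructions; the wrapper below is only a sketch.

#include "llvm/Analysis/DetachSSA.h"
using namespace llvm;

bool mustPrecede(DetachSSA &DSSA, Instruction *A, Instruction *B) {
  DetachAccess *DA = DSSA.getDetachAccess(A);
  DetachAccess *DB = DSSA.getDetachAccess(B);
  return DA && DB && DSSA.dominates(DA, DB);
}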
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + print(dbgs()); + dbgs() << "\n"; +#endif +} + +char DetachSSAPrinterLegacyPass::ID = 0; + +DetachSSAPrinterLegacyPass::DetachSSAPrinterLegacyPass() : FunctionPass(ID) { + initializeDetachSSAPrinterLegacyPassPass(*PassRegistry::getPassRegistry()); +} + +void DetachSSAPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired(); + AU.addPreserved(); +} + +bool DetachSSAPrinterLegacyPass::runOnFunction(Function &F) { + auto &DSSA = getAnalysis().getDSSA(); + DSSA.print(dbgs()); + if (VerifyDetachSSA) + DSSA.verifyDetachSSA(); + return false; +} + +AnalysisKey DetachSSAAnalysis::Key; + +DetachSSAAnalysis::Result DetachSSAAnalysis::run(Function &F, + FunctionAnalysisManager &AM) { + auto &DT = AM.getResult(F); + return DetachSSAAnalysis::Result(make_unique(F, &DT)); +} + +PreservedAnalyses DetachSSAPrinterPass::run(Function &F, + FunctionAnalysisManager &AM) { + OS << "DetachSSA for function: " << F.getName() << "\n"; + AM.getResult(F).getDSSA().print(OS); + + return PreservedAnalyses::all(); +} + +PreservedAnalyses DetachSSAVerifierPass::run(Function &F, + FunctionAnalysisManager &AM) { + AM.getResult(F).getDSSA().verifyDetachSSA(); + + return PreservedAnalyses::all(); +} + +char DetachSSAWrapperPass::ID = 0; + +DetachSSAWrapperPass::DetachSSAWrapperPass() : FunctionPass(ID) { + initializeDetachSSAWrapperPassPass(*PassRegistry::getPassRegistry()); +} + +void DetachSSAWrapperPass::releaseMemory() { DSSA.reset(); } + +void DetachSSAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequiredTransitive(); +} + +bool DetachSSAWrapperPass::runOnFunction(Function &F) { + auto &DT = getAnalysis().getDomTree(); + DSSA.reset(new DetachSSA(F, &DT)); + return false; +} + +void DetachSSAWrapperPass::verifyAnalysis() const { DSSA->verifyDetachSSA(); } + +void DetachSSAWrapperPass::print(raw_ostream &OS, const Module *M) const { + DSSA->print(OS); +} +} // namespace llvm + +void DetachPhi::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} + +void DetachDef::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} + +void DetachUse::deleteMe(DerivedUser *Self) { + delete static_cast(Self); +} diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp index 6a5567ed765bb2..7df5d9a8c03da8 100644 --- a/llvm/lib/Analysis/MemorySSA.cpp +++ b/llvm/lib/Analysis/MemorySSA.cpp @@ -137,7 +137,7 @@ class MemoryLocOrCall { IsCall = false; // There is no such thing as a memorylocation for a fence inst, and it is // unique in that regard. 
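A hedged sketch of consuming the analysis from a legacy pass, mirroring how the printer pass above requires the wrapper and calls getDSSA(); the pass name is illustrative and the usual INITIALIZE_PASS registration boilerplate is omitted.

#include "llvm/Analysis/DetachSSA.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
struct DetachSSAUserPass : public FunctionPass {
  static char ID;
  DetachSSAUserPass() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.setPreservesAll();
    AU.addRequired<DetachSSAWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    DetachSSA &DSSA = getAnalysis<DetachSSAWrapperPass>().getDSSA();
    DSSA.print(errs());
    return false;
  }
};
} // anonymous namespace
char DetachSSAUserPass::ID = 0;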
-    if (!isa<FenceInst>(Inst))
+    if (!isa<FenceInst>(Inst) && !isa<SyncInst>(Inst))
       Loc = MemoryLocation::get(Inst);
   }
 }
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index eab7ec81953609..6b4e0e0207fcf9 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -859,6 +859,9 @@ lltok::Kind LLLexer::LexIdentifier() {
   INSTKEYWORD(invoke, Invoke);
   INSTKEYWORD(resume, Resume);
   INSTKEYWORD(unreachable, Unreachable);
+  INSTKEYWORD(detach, Detach);
+  INSTKEYWORD(reattach, Reattach);
+  INSTKEYWORD(sync, Sync);
   INSTKEYWORD(alloca, Alloca);
   INSTKEYWORD(load, Load);
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index ee634505581e81..6c4cd4207c61cf 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -5577,6 +5577,9 @@ int LLParser::ParseInstruction(Instruction *&Inst, BasicBlock *BB,
     Inst->setFastMathFlags(FMF);
     return false;
   }
+  case lltok::kw_detach: return ParseDetach(Inst, PFS);
+  case lltok::kw_reattach: return ParseReattach(Inst, PFS);
+  case lltok::kw_sync: return ParseSync(Inst, PFS);
   // Binary Operators.
   case lltok::kw_add:
   case lltok::kw_sub:
@@ -5776,6 +5779,89 @@ bool LLParser::ParseBr(Instruction *&Inst, PerFunctionState &PFS) {
   return false;
 }
+
+/// ParseDetach
+///   ::= 'detach' within SyncRegion ',' TypeAndValue ',' TypeAndValue
+bool LLParser::ParseDetach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc, Loc2;
+  Value *SR;
+  BasicBlock *Op1, *Op2;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after detach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for detach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after detach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op1, Loc, PFS) ||
+      ParseToken(lltok::comma, "expected ',' after detached destination") ||
+      ParseTypeAndBasicBlock(Op2, Loc2, PFS))
+    return true;
+
+  Inst = DetachInst::Create(Op1, Op2, SR);
+  return false;
+}
+
+/// ParseReattach
+///   ::= 'reattach' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseReattach(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after reattach"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for reattach");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after reattach scope"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = ReattachInst::Create(Op, SR);
+  return false;
+}
+
+/// ParseSync
+///   ::= 'sync' within SyncRegion ',' TypeAndValue
+bool LLParser::ParseSync(Instruction *&Inst, PerFunctionState &PFS) {
+  LocTy Loc;
+  Value *SR;
+  BasicBlock *Op;
+
+  if (ParseToken(lltok::kw_within, "expected 'within' after sync"))
+    return true;
+
+  if (Lex.getKind() != lltok::kw_none && Lex.getKind() != lltok::LocalVar &&
+      Lex.getKind() != lltok::LocalVarID)
+    return TokError("expected scope value for sync");
+
+  if (ParseValue(Type::getTokenTy(Context), SR, PFS))
+    return true;
+
+  if (ParseToken(lltok::comma, "expected ',' after scope in sync"))
+    return true;
+
+  if (ParseTypeAndBasicBlock(Op, Loc, PFS))
+    return true;
+
+  Inst = SyncInst::Create(Op,
SR); + return false; +} + /// ParseSwitch /// Instruction /// ::= 'switch' TypeAndValue ',' TypeAndValue '[' JumpTable ']' diff --git a/llvm/lib/AsmParser/LLParser.h b/llvm/lib/AsmParser/LLParser.h index 5a0fc297265d4d..2b53bbea557b4d 100644 --- a/llvm/lib/AsmParser/LLParser.h +++ b/llvm/lib/AsmParser/LLParser.h @@ -571,6 +571,9 @@ namespace llvm { bool ParseCatchSwitch(Instruction *&Inst, PerFunctionState &PFS); bool ParseCatchPad(Instruction *&Inst, PerFunctionState &PFS); bool ParseCleanupPad(Instruction *&Inst, PerFunctionState &PFS); + bool ParseDetach(Instruction *&Inst, PerFunctionState &PFS); + bool ParseReattach(Instruction *&Inst, PerFunctionState &PFS); + bool ParseSync(Instruction *&Inst, PerFunctionState &PFS); bool ParseUnaryOp(Instruction *&Inst, PerFunctionState &PFS, unsigned Opc, unsigned OperandType); diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h index c2e2795a9467be..d21527f347a211 100644 --- a/llvm/lib/AsmParser/LLToken.h +++ b/llvm/lib/AsmParser/LLToken.h @@ -344,6 +344,11 @@ enum Kind { kw_insertvalue, kw_blockaddress, + // Tapir types + kw_detach, + kw_reattach, + kw_sync, + // Metadata types. kw_distinct, diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index fe051e7a91256d..1173e1e8792616 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -4231,6 +4231,59 @@ Error BitcodeReader::parseFunctionBody(Function *F) { I = new UnreachableInst(Context); InstructionList.push_back(I); break; + case bitc::FUNC_CODE_INST_DETACH: { // DETACH: [bb#, bb#, val] + if (Record.size() != 3) + return error("Invalid record"); + BasicBlock *Detached = getBasicBlock(Record[0]); + if (!Detached) + return error("Invalid record"); + + BasicBlock *Continue = getBasicBlock(Record[1]); + if (!Continue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 2, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = DetachInst::Create(Detached, Continue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_REATTACH: { // REATTACH: [bb#, val] + if (Record.size() != 2) + return error("Invalid record"); + + BasicBlock *DetachContinue = getBasicBlock(Record[0]); + if (!DetachContinue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 1, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = ReattachInst::Create(DetachContinue, SyncRegion); + InstructionList.push_back(I); + break; + } + case bitc::FUNC_CODE_INST_SYNC: { // Sync: [bb#, val] + if (Record.size() != 1) + return error("Invalid record"); + BasicBlock *Continue = getBasicBlock(Record[0]); + if (!Continue) + return error("Invalid record"); + + Value *SyncRegion = + getValue(Record, 1, NextValueNo, Type::getTokenTy(Context)); + if (!SyncRegion) + return error("Invalid record"); + + I = SyncInst::Create(Continue, SyncRegion); + InstructionList.push_back(I); + break; + } case bitc::FUNC_CODE_INST_PHI: { // PHI: [ty, val0,bb0, ...] 
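Putting the parser pieces together, a hedged round-trip example of the textual syntax accepted by ParseDetach, ParseReattach, and ParseSync above. The IR body is an illustrative minimal spawn/sync; the intrinsic name corresponds to Intrinsic::syncregion_start referenced later in the patch.

#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"
using namespace llvm;

bool roundTripTapir() {
  const char *IR =
      "define void @spawn_one() {\n"
      "entry:\n"
      "  %sr = call token @llvm.syncregion.start()\n"
      "  detach within %sr, label %body, label %cont\n"
      "body:\n"
      "  reattach within %sr, label %cont\n"
      "cont:\n"
      "  sync within %sr, label %exit\n"
      "exit:\n"
      "  ret void\n"
      "}\n"
      "declare token @llvm.syncregion.start()\n";
  LLVMContext Ctx;
  SMDiagnostic Err;
  return parseAssemblyString(IR, Err, Ctx) != nullptr;
}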
if (Record.size() < 1 || ((Record.size()-1)&1)) return error("Invalid record"); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp index ba4f932e2e6db8..26d032ffe47c1d 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -2780,6 +2780,31 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I, Code = bitc::FUNC_CODE_INST_UNREACHABLE; AbbrevToUse = FUNCTION_INST_UNREACHABLE_ABBREV; break; + case Instruction::Detach: + { + Code = bitc::FUNC_CODE_INST_DETACH; + const DetachInst &DI = cast(I); + Vals.push_back(VE.getValueID(DI.getSuccessor(0))); + Vals.push_back(VE.getValueID(DI.getSuccessor(1))); + pushValue(DI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Reattach: + { + Code = bitc::FUNC_CODE_INST_REATTACH; + const ReattachInst &RI = cast(I); + Vals.push_back(VE.getValueID(RI.getSuccessor(0))); + pushValue(RI.getSyncRegion(), InstID, Vals); + } + break; + case Instruction::Sync: + { + Code = bitc::FUNC_CODE_INST_SYNC; + const SyncInst &SI = cast(I); + Vals.push_back(VE.getValueID(SI.getSuccessor(0))); + pushValue(SI.getSyncRegion(), InstID, Vals); + } + break; case Instruction::PHI: { const PHINode &PN = cast(I); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 95f6274aa068be..a451527c5bb472 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -476,6 +476,62 @@ bool IRTranslator::translateIndirectBr(const User &U, return true; } +bool IRTranslator::translateDetach(const User &U, + MachineIRBuilder &MIRBuilder) { + const DetachInst &DetInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Detach like an unconditional branch to the detached successor. + const BasicBlock &DetTgt = *cast(DetInst.getDetached()); + MachineBasicBlock &TgtBB = getMBB(DetTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the detached successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link detached successor. + CurBB.addSuccessor(&getMBB(*cast(DetInst.getDetached()))); + return true; +} + +bool IRTranslator::translateReattach(const User &U, + MachineIRBuilder &MIRBuilder) { + const ReattachInst &ReatInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Reattach like an unconditional branch to its successor. + const BasicBlock &ReatTgt = *cast(ReatInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(ReatTgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the reattach successor is the layout successor, fallthrough. + if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Reattach instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(ReatInst.getSuccessor(0)))); + return true; +} + +bool IRTranslator::translateSync(const User &U, MachineIRBuilder &MIRBuilder) { + const SyncInst &SInst = cast(U); + + // Lowering of Tapir instructions should have happened already. At this + // stage, treat Sync like an unconditional branch to its successor. + const BasicBlock &STgt = *cast(SInst.getSuccessor(0)); + MachineBasicBlock &TgtBB = getMBB(STgt); + MachineBasicBlock &CurBB = MIRBuilder.getMBB(); + + // If the sync successor is the layout successor, fallthrough. 
+ if (!CurBB.isLayoutSuccessor(&TgtBB)) + MIRBuilder.buildBr(TgtBB); + + // Link the Sync instruction's successor. + CurBB.addSuccessor(&getMBB(*cast(SInst.getSuccessor(0)))); + return true; +} + bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) { const LoadInst &LI = cast(U); diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp index 542491eabbf29c..a6fef51aa3098d 100644 --- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp +++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp @@ -299,6 +299,16 @@ class SSAUpdaterTraits { return NewDef->getOperand(0).getReg(); } + static bool BlockReattaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + + static bool BlockDetaches(MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return false; + } + /// CreateEmptyPHI - Create a PHI instruction that defines a new register. /// Add it into the specified block and return the register. static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds, @@ -344,6 +354,12 @@ class SSAUpdaterTraits { static unsigned GetPHIValue(MachineInstr *PHI) { return PHI->getOperand(0).getReg(); } + + static void MarkDetachedDef(unsigned Val, MachineBasicBlock *BB, + MachineSSAUpdater *Updater) { + return; + } + }; } // end namespace llvm diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp index cdc597db640166..43e4fd352c6ddb 100644 --- a/llvm/lib/CodeGen/MachineSink.cpp +++ b/llvm/lib/CodeGen/MachineSink.cpp @@ -555,6 +555,28 @@ bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr &MI, return false; } +static inline bool hasSetJmpPred( MachineBasicBlock *bl0 ) { + +// llvm::errs() << "\n"; +// bl0->dump(); +// llvm::errs() << "\n"; + + for( auto bl : bl0->predecessors() ) { +// llvm::errs() << " \n"; + auto term = bl->getFirstTerminator(); + while( term != bl->end() ) { + auto mc = (*term).getDesc(); +// if (mc.Opcode != 777) continue; + if (mc.Opcode == 777) { return true; } +// llvm::errs() << " flags:" << mc.Flags << " opc:" << mc.Opcode << "\n"; +// term->dump(); + term++; + } +// llvm::errs() << " \n"; + } + return false; +} + /// Get the sorted sequence of successors for this MachineBasicBlock, possibly /// computing it if it was not already cached. SmallVector & @@ -565,7 +587,7 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (Succs != AllSuccessors.end()) return Succs->second; - SmallVector AllSuccs(MBB->succ_begin(), + SmallPtrSet AllSuccs0(MBB->succ_begin(), MBB->succ_end()); // Handle cases where sinking can happen but where the sink point isn't a @@ -582,7 +604,43 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB, if (DTChild->getIDom()->getBlock() == MI.getParent() && // Skip MBBs already added to the AllSuccs vector above. 
!MBB->isSuccessor(DTChild->getBlock())) - AllSuccs.push_back(DTChild->getBlock()); + AllSuccs0.insert(DTChild->getBlock()); + + ///* + bool unstable = true; + while(unstable) { + unstable = false; + SmallPtrSet toRemove; + for( auto bl0 : AllSuccs0 ) { + //if (hasSetJmpPred(bl0)) assert(bl0->hasAddressTaken()); + if (toRemove.count(bl0) == 0 && (hasSetJmpPred(bl0) || bl0->hasAddressTaken()) ) { + SmallVector Q; + Q.push_back(bl0); + toRemove.insert(bl0); + while( Q.size() > 0 ) { + auto f = Q.back(); + Q.pop_back(); + //llvm::errs() << "saw and removing: " << f->getFullName() << "$BB#" << f->getNumber() << "\n"; + for( auto a : f->successors() ) { + if ( toRemove.count(a) > 0 || AllSuccs0.count(a) == 0 ) continue; + toRemove.insert(a); + Q.push_back(a); + } + } + unstable = true; + } + } + for (auto b : toRemove) { + AllSuccs0.erase(b); + } + } // */ + + //MBB->dump(); + //llvm::errs() << "CHECK CHILDREN FOR " << MBB->getFullName() << "$BB#" << MBB->getNumber() << ": " << "|{"; + //for( auto a : AllSuccs0 ) llvm::errs() << a->getFullName() << "$BB#" << a->getNumber() << ","; + //llvm::errs() << "}\n"; + SmallVector AllSuccs(AllSuccs0.begin(), + AllSuccs0.end()); // Sort Successors according to their loop depth or block frequency info. std::stable_sort( diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index bfeb3d1bc2b91f..7fa157cc1bac4b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2671,6 +2671,66 @@ void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) { DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } +void SelectionDAGBuilder::visitDetach(const DetachInst &I) { + MachineBasicBlock *DetachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Detached = FuncInfo.MBBMap[I.getSuccessor(0)]; + //MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(1)]; + + // Update machine-CFG edges. + DetachMBB->addSuccessor(Detached); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Detached != NextBlock(DetachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Detached))); + + return; + +} + +void SelectionDAGBuilder::visitReattach(const ReattachInst &I) { + MachineBasicBlock *ReattachMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + ReattachMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. + if (Continue != NextBlock(ReattachMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + +void SelectionDAGBuilder::visitSync(const SyncInst &I) { + MachineBasicBlock *SyncMBB = FuncInfo.MBB; + + // Update machine-CFG edges. + MachineBasicBlock *Continue = FuncInfo.MBBMap[I.getSuccessor(0)]; + + // Update machine-CFG edges. + SyncMBB->addSuccessor(Continue); + + // If this is not a fall-through branch or optimizations are switched off, + // emit the branch. 
+ if (Continue != NextBlock(SyncMBB) || TM.getOptLevel() == CodeGenOpt::None) + DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), + MVT::Other, getControlRoot(), + DAG.getBasicBlock(Continue))); + + return; +} + + void SelectionDAGBuilder::visitFSub(const User &I) { // -0.0 - X --> fneg Type *Ty = I.getType(); @@ -6375,6 +6435,10 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // MachineFunction in SelectionDAGISel::PrepareEHLandingPad. We can safely // delete it now. return nullptr; + // Tapir intrinsics + // Lower the starting point of a sync region to a no-op. + case Intrinsic::syncregion_start: + return nullptr; } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 5f9cdb69daf72d..b0cc4725884aa8 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -57,6 +57,7 @@ class ConstantInt; class ConstrainedFPIntrinsic; class DbgValueInst; class DataLayout; +class DetachInst; class DIExpression; class DILocalVariable; class DILocation; @@ -72,11 +73,13 @@ class LLVMContext; class LoadInst; class MachineBasicBlock; class PHINode; +class ReattachInst; class ResumeInst; class ReturnInst; class SDDbgValue; class StoreInst; class SwitchInst; +class SyncInst; class TargetLibraryInfo; class TargetMachine; class Type; @@ -825,6 +828,9 @@ class SelectionDAGBuilder { void visitCatchRet(const CatchReturnInst &I); void visitCatchPad(const CatchPadInst &I); void visitCleanupPad(const CleanupPadInst &CPI); + void visitDetach(const DetachInst& I); + void visitReattach(const ReattachInst& I); + void visitSync(const SyncInst& I); BranchProbability getEdgeProbability(const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index e8619037564245..4edebace9622ea 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -1455,6 +1455,9 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const { case CatchSwitch: return 0; case CleanupPad: return 0; case FNeg: return ISD::FNEG; + case Detach: return 0; + case Reattach: return 0; + case Sync: return 0; case Add: return ISD::ADD; case FAdd: return ISD::FADD; case Sub: return ISD::SUB; diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index a5dc623e1a30fe..adead5e5dc1d62 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -3637,6 +3637,29 @@ void AssemblyWriter::printInstruction(const Instruction &I) { writeOperand(BI.getSuccessor(0), true); Out << ", "; writeOperand(BI.getSuccessor(1), true); + } else if (isa(I)) { + // Special case detach instruction to get formatting nice and correct + const DetachInst &DI(cast(I)); + Out << " within "; + writeOperand(DI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(DI.getDetached(), true); + Out << ", "; + writeOperand(DI.getContinue(), true); + } else if (isa(I)) { + // Special case reattach instruction to get formatting nice and correct + const ReattachInst &RI(cast(I)); + Out << " within "; + writeOperand(RI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + writeOperand(RI.getSuccessor(0), true); + } else if (isa(I)) { + // Special case sync instruction to get formatting nice and correct + const SyncInst &SI(cast(I)); + Out << " within "; + writeOperand(SI.getSyncRegion(), /*PrintType=*/false); + Out << ", "; + 
writeOperand(SI.getSuccessor(0), true); } else if (isa(I)) { const SwitchInst& SI(cast(I)); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 375924360dda83..213c8deedc0bd7 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -443,6 +443,48 @@ BasicBlock *BasicBlock::splitBasicBlock(iterator I, const Twine &BBName) { return New; } +BasicBlock *BasicBlock::splitBasicBlockWithTerminator(const Twine &BBName) { + auto term = getTerminator(); + assert(term && "Can't use splitBasicBlock on degenerate BB!"); + assert(term->getNumSuccessors() == 1 && "Number of successors must be 1"); + + BasicBlock *New = BasicBlock::Create(getContext(), BBName, getParent(), + this->getNextNode()); + + // Save DebugLoc of split point before invalidating iterator. + DebugLoc Loc = term->getDebugLoc(); + // Move all of the specified instructions from the original basic block into + // the new basic block. + auto suc = term->getSuccessor(0); + term->setSuccessor(0, New); + + // Add a branch instruction to the newly formed basic block. + BranchInst *BI = BranchInst::Create(suc, New); + BI->setDebugLoc(Loc); + + // Now we must loop through all of the successors of the New block (which + // _were_ the successors of the 'this' block), and update any PHI nodes in + // successors. If there were PHI nodes in the successors, then they need to + // know that incoming branches will be from New, not from Old. + // + for (succ_iterator I = succ_begin(New), E = succ_end(New); I != E; ++I) { + // Loop over any phi nodes in the basic block, updating the BB field of + // incoming values... + BasicBlock *Successor = *I; + PHINode *PN; + for (BasicBlock::iterator II = Successor->begin(); + (PN = dyn_cast(II)); ++II) { + int IDX = PN->getBasicBlockIndex(this); + while (IDX != -1) { + PN->setIncomingBlock((unsigned)IDX, New); + IDX = PN->getBasicBlockIndex(this); + } + } + } + + return New; +} + void BasicBlock::replaceSuccessorsPhiUsesWith(BasicBlock *New) { Instruction *TI = getTerminator(); if (!TI) diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index d861b5288592ca..57d3923622991b 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -302,6 +302,9 @@ const char *Instruction::getOpcodeName(unsigned OpCode) { case CatchRet: return "catchret"; case CatchPad: return "catchpad"; case CatchSwitch: return "catchswitch"; + case Detach: return "detach"; + case Reattach: return "reattach"; + case Sync: return "sync"; // Standard unary operators... 
case FNeg: return "fneg"; @@ -510,6 +513,7 @@ bool Instruction::mayReadFromMemory() const { case Instruction::VAArg: case Instruction::Load: case Instruction::Fence: // FIXME: refine definition of mayReadFromMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::AtomicCmpXchg: case Instruction::AtomicRMW: case Instruction::CatchPad: @@ -528,6 +532,7 @@ bool Instruction::mayWriteToMemory() const { switch (getOpcode()) { default: return false; case Instruction::Fence: // FIXME: refine definition of mayWriteToMemory + case Instruction::Sync: // Like Instruction::Fence case Instruction::Store: case Instruction::VAArg: case Instruction::AtomicCmpXchg: diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index 06b46724a87f80..81bb40423e8234 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -973,6 +973,180 @@ UnreachableInst::UnreachableInst(LLVMContext &Context, BasicBlock *InsertAtEnd) : Instruction(Type::getVoidTy(Context), Instruction::Unreachable, nullptr, 0, InsertAtEnd) {} +//===----------------------------------------------------------------------===// +// DetachInst Implementation +//===----------------------------------------------------------------------===// + +void DetachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertBefore) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +DetachInst::DetachInst(BasicBlock *Detached, BasicBlock *Continue, + Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Detached->getContext()), + Instruction::Detach, + OperandTraits::op_end(this) - 3, 3, + InsertAtEnd) { + Op<-1>() = Detached; + Op<-2>() = Continue; + Op<-3>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +DetachInst::DetachInst(const DetachInst &DI) + : TerminatorInst(Type::getVoidTy(DI.getContext()), Instruction::Detach, + OperandTraits::op_end(this) - + DI.getNumOperands(), + DI.getNumOperands()) { + Op<-1>() = DI.Op<-1>(); + Op<-2>() = DI.Op<-2>(); + Op<-3>() = DI.Op<-3>(); + assert(DI.getNumOperands() == 3 && "Detach must have 3 operands!"); + SubclassOptionalData = DI.SubclassOptionalData; +} + +BasicBlock *DetachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned DetachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void DetachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// ReattachInst Implementation +//===----------------------------------------------------------------------===// + +void ReattachInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(BasicBlock *DetachContinue, Value *SyncRegion, + BasicBlock 
*InsertAtEnd) + : TerminatorInst(Type::getVoidTy(DetachContinue->getContext()), + Instruction::Reattach, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = DetachContinue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +ReattachInst::ReattachInst(const ReattachInst &RI) + : TerminatorInst(Type::getVoidTy(RI.getContext()), Instruction::Reattach, + OperandTraits::op_end(this) - + RI.getNumOperands(), + RI.getNumOperands()) { + Op<-1>() = RI.Op<-1>(); + Op<-2>() = RI.Op<-2>(); + assert(RI.getNumOperands() == 2 && "Reattach must have 2 operands!"); + SubclassOptionalData = RI.SubclassOptionalData; +} + +unsigned ReattachInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} + +BasicBlock *ReattachInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} + +void ReattachInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + +//===----------------------------------------------------------------------===// +// SyncInst Implementation +//===----------------------------------------------------------------------===// + +void SyncInst::AssertOK() { + assert(getSyncRegion()->getType()->isTokenTy() && + "Sync region must be a token!"); +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + Instruction *InsertBefore) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertBefore) { + Op<-1>() = Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + +SyncInst::SyncInst(BasicBlock *Continue, Value *SyncRegion, + BasicBlock *InsertAtEnd) + : TerminatorInst(Type::getVoidTy(Continue->getContext()), Instruction::Sync, + OperandTraits::op_end(this) - 2, 2, + InsertAtEnd) { + Op<-1>() = Continue; + Op<-2>() = SyncRegion; +#ifndef NDEBUG + AssertOK(); +#endif +} + + +SyncInst::SyncInst(const SyncInst &SI) : + TerminatorInst(Type::getVoidTy(SI.getContext()), Instruction::Sync, + OperandTraits::op_end(this) - SI.getNumOperands(), + SI.getNumOperands()) { + Op<-1>() = SI.Op<-1>(); + Op<-2>() = SI.Op<-2>(); + assert(SI.getNumOperands() == 2 && "Sync must have 2 operands!"); + SubclassOptionalData = SI.SubclassOptionalData; +} + +BasicBlock *SyncInst::getSuccessorV(unsigned idx) const { + return getSuccessor(idx); +} +unsigned SyncInst::getNumSuccessorsV() const { + return getNumSuccessors(); +} +void SyncInst::setSuccessorV(unsigned idx, BasicBlock *B) { + setSuccessor(idx, B); +} + //===----------------------------------------------------------------------===// // BranchInst Implementation //===----------------------------------------------------------------------===// @@ -4000,3 +4174,15 @@ UnreachableInst *UnreachableInst::cloneImpl() const { LLVMContext &Context = getContext(); return new UnreachableInst(Context); } + +DetachInst *DetachInst::cloneImpl() const { + return new(getNumOperands()) DetachInst(*this); +} + +ReattachInst *ReattachInst::cloneImpl() const { + return new(getNumOperands()) ReattachInst(*this); +} + +SyncInst *SyncInst::cloneImpl() const { + return new(getNumOperands()) SyncInst(*this); +} diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp index 0fb079c5ab7395..6c7255f4319e50 100644 --- a/llvm/lib/IR/Type.cpp +++ b/llvm/lib/IR/Type.cpp @@ -366,6 +366,13 @@ StructType *StructType::get(LLVMContext &Context, ArrayRef ETypes, return ST; } +StructType *StructType::getOrCreate(LLVMContext &Context, StringRef Name) { + StructType *Ty = Context.pImpl->NamedStructTypes.lookup(Name); + if (!Ty) + Ty = 
StructType::create(Context, Name); + return Ty; +} + void StructType::setBody(ArrayRef Elements, bool isPacked) { assert(isOpaque() && "Struct body already set!"); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 30e77b92009f0f..00fdc08b066e2c 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -105,6 +105,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +//#include "llvm/Transforms/Tapir/CilkABI.h" #include #include #include @@ -342,6 +343,12 @@ class Verifier : public InstVisitor, VerifierSupport { BB.printAsOperand(*OS, true, MST); *OS << "\n"; } + // if (const DetachInst* Det = dyn_cast(&I->back())) { + // if (!cilk::verifyDetachedCFG(*Det, DT)) { + // OS << "Invalid end to detached CFG\n"; + // return true; + // } + // } return false; } diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 5ec94ea6f40ab0..2d935a1074dd88 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -30,6 +30,7 @@ #include "llvm/Analysis/CallGraph.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/DependenceAnalysis.h" +#include "llvm/Analysis/DetachSSA.h" #include "llvm/Analysis/DominanceFrontier.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/IVUsers.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 771d2f5b212ae9..c24cf8e33375d1 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -112,6 +112,7 @@ FUNCTION_ANALYSIS("branch-prob", BranchProbabilityAnalysis()) FUNCTION_ANALYSIS("domtree", DominatorTreeAnalysis()) FUNCTION_ANALYSIS("postdomtree", PostDominatorTreeAnalysis()) FUNCTION_ANALYSIS("demanded-bits", DemandedBitsAnalysis()) +FUNCTION_ANALYSIS("detachssa", DetachSSAAnalysis()) FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis()) FUNCTION_ANALYSIS("loops", LoopAnalysis()) FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis()) @@ -202,6 +203,7 @@ FUNCTION_PASS("print", AssumptionPrinterPass(dbgs())) FUNCTION_PASS("print", BlockFrequencyPrinterPass(dbgs())) FUNCTION_PASS("print", BranchProbabilityPrinterPass(dbgs())) FUNCTION_PASS("print", DependenceAnalysisPrinterPass(dbgs())) +FUNCTION_PASS("print", DetachSSAPrinterPass(dbgs())) FUNCTION_PASS("print", DominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", PostDominatorTreePrinterPass(dbgs())) FUNCTION_PASS("print", DemandedBitsPrinterPass(dbgs())) @@ -224,6 +226,7 @@ FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) FUNCTION_PASS("verify", VerifierPass()) +FUNCTION_PASS("verify", DetachSSAVerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) FUNCTION_PASS("verify", MemorySSAVerifierPass()) diff --git a/llvm/lib/Transforms/CMakeLists.txt b/llvm/lib/Transforms/CMakeLists.txt index 74db9e53304da9..c39bc7e368d379 100644 --- a/llvm/lib/Transforms/CMakeLists.txt +++ b/llvm/lib/Transforms/CMakeLists.txt @@ -8,3 +8,4 @@ add_subdirectory(Vectorize) add_subdirectory(Hello) add_subdirectory(ObjCARC) add_subdirectory(Coroutines) +add_subdirectory(Tapir) diff --git a/llvm/lib/Transforms/IPO/LLVMBuild.txt b/llvm/lib/Transforms/IPO/LLVMBuild.txt index 54ce23876e66b4..e0d6b8353fc3a7 100644 --- a/llvm/lib/Transforms/IPO/LLVMBuild.txt +++ b/llvm/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO 
parent = Transforms library_name = ipo -required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TransformUtils Vectorize Instrumentation +required_libraries = AggressiveInstCombine Analysis BitReader BitWriter Core InstCombine IRReader Linker Object ProfileData Scalar Support TapirOpts TransformUtils Vectorize Instrumentation diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp index 9764944dc3329e..6f0c86f64fd304 100644 --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -41,6 +41,8 @@ #include "llvm/Transforms/Scalar/InstSimplifyPass.h" #include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h" #include "llvm/Transforms/Utils.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Vectorize.h" using namespace llvm; @@ -100,6 +102,10 @@ static cl::opt EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false), cl::Hidden, cl::desc("Enable Unroll And Jam Pass")); +static cl::opt EnableLoopFuse( + "enable-loop-fuse", cl::init(false), cl::Hidden, + cl::desc("Enable the new, experimental LoopFusion Pass")); + static cl::opt EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden, cl::desc("Enable preparation for ThinLTO.")); @@ -161,8 +167,11 @@ static cl::opt cl::desc("Enable control height reduction optimization (CHR)")); PassManagerBuilder::PassManagerBuilder() { + InstrumentCilk = false; OptLevel = 2; SizeLevel = 0; + ParallelLevel = 0; + Rhino = false; LibraryInfo = nullptr; Inliner = nullptr; DisableUnrollLoops = false; @@ -423,6 +432,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses( MPM.add(createControlHeightReductionLegacyPass()); } +// void PassManagerBuilder::prepopulateModulePassManager( void PassManagerBuilder::populateModulePassManager( legacy::PassManagerBase &MPM) { if (!PGOSampleUse.empty()) { @@ -442,6 +452,15 @@ void PassManagerBuilder::populateModulePassManager( Inliner = nullptr; } + if (ParallelLevel > 0) { + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + } + // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly // creates a CGSCC pass manager, but we don't want to add extensions into // that pass manager. To prevent this we insert a no-op module pass to reset @@ -498,6 +517,15 @@ void PassManagerBuilder::populateModulePassManager( if (PrepareForThinLTOUsingPGOSampleProfile) DisableUnrollLoops = true; + bool RerunAfterTapirLowering = false; + bool TapirHasBeenLowered = (ParallelLevel == 0); + if (ParallelLevel == 3) // -fdetach + MPM.add(createLowerTapirToCilkPass(false, InstrumentCilk)); + + do { + RerunAfterTapirLowering = + !TapirHasBeenLowered && (ParallelLevel > 0) && !PrepareForThinLTO; + // Infer attributes about declarations if possible. MPM.add(createInferFunctionAttrsLegacyPass()); @@ -745,6 +773,45 @@ void PassManagerBuilder::populateModulePassManager( // resulted in single-entry-single-exit or empty blocks. Clean up the CFG. MPM.add(createCFGSimplificationPass()); + if (RerunAfterTapirLowering || (ParallelLevel == 0)) + // Add passes to run just before Tapir lowering. 
+ addExtensionsToPM(EP_TapirLate, MPM); + + if (!TapirHasBeenLowered) { + // First handle Tapir loops. + MPM.add(createIndVarSimplifyPass()); + + // Re-rotate loops in all our loop nests. These may have fallout out of + // rotated form due to GVN or other transformations, and loop spawning + // relies on the rotated form. Disable header duplication at -Oz. + MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + + MPM.add(createLoopSpawningPass()); + + // The LoopSpawning pass may leave cruft around. Clean it up. + MPM.add(createLoopDeletionPass()); + MPM.add(createCFGSimplificationPass()); + addInstructionCombiningPass(MPM); + addExtensionsToPM(EP_Peephole, MPM); + + // Now lower Tapir to Cilk runtime calls. + // + // TODO: Make this sequence of passes check the library info for the Cilk + // RTS. + + MPM.add(createInferFunctionAttrsLegacyPass()); + // MPM.add(createUnifyFunctionExitNodesPass()); + MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); + // The lowering pass may leave cruft around. Clean it up. + MPM.add(createCFGSimplificationPass()); + MPM.add(createInferFunctionAttrsLegacyPass()); + MPM.add(createMergeFunctionsPass()); + MPM.add(createBarrierNoopPass()); + + TapirHasBeenLowered = true; + } + } while (RerunAfterTapirLowering); + addExtensionsToPM(EP_OptimizerLast, MPM); if (PrepareForLTO) { @@ -754,6 +821,58 @@ void PassManagerBuilder::populateModulePassManager( } } +// void PassManagerBuilder::populateModulePassManager(legacy::PassManagerBase& MPM) { +// if (ParallelLevel != 0) { +// switch (ParallelLevel) { +// case 1: //fcilkplus +// case 2: //ftapir +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 3: //fdetach +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// prepopulateModulePassManager(MPM); +// addExtensionsToPM(EP_TapirLate, MPM); +// break; +// case 0: llvm_unreachable("invalid"); +// } + +// MPM.add(createBarrierNoopPass()); + +// if (OptLevel > 0) { +// MPM.add(createIndVarSimplifyPass()); + +// // Re-rotate loops in all our loop nests. These may have fallout out of +// // rotated form due to GVN or other transformations, and loop spawning +// // relies on the rotated form. Disable header duplication at -Oz. +// MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1)); + +// MPM.add(createLoopSpawningPass()); + +// // The LoopSpawning pass may leave cruft around. Clean it up. +// MPM.add(createLoopDeletionPass()); +// MPM.add(createCFGSimplificationPass()); +// addInstructionCombiningPass(MPM); +// addExtensionsToPM(EP_Peephole, MPM); +// } + +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createUnifyFunctionExitNodesPass()); +// MPM.add(createLowerTapirToCilkPass(ParallelLevel == 2, InstrumentCilk)); +// // The lowering pass may leave cruft around. Clean it up. +// MPM.add(createCFGSimplificationPass()); +// // if (ParallelLevel != 3) MPM.add(createInferFunctionAttrsLegacyPass()); +// MPM.add(createInferFunctionAttrsLegacyPass()); +// if (OptLevel != 0) MPM.add(createMergeFunctionsPass()); +// MPM.add(createBarrierNoopPass()); +// } +// prepopulateModulePassManager(MPM); +// if (ParallelLevel == 0) +// addExtensionsToPM(EP_TapirLate, MPM); +// addExtensionsToPM(EP_OptimizerLast, MPM); +// } + void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) { // Load sample profile before running the LTO optimization pipeline. 
if (!PGOSampleUse.empty()) { diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index aeb25d530d71b3..3f15930c467c57 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3908,6 +3908,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { Intrinsic::lifetime_end, *this)) return nullptr; break; + case Intrinsic::syncregion_start: { + int NumUsers = 0; + for (User *U : II->users()) + if (isa(U) || isa(U) || isa(U)) + ++NumUsers; + if (!NumUsers) + return eraseInstFromFunction(CI); + break; + } case Intrinsic::assume: { Value *IIOperand = II->getArgOperand(0); // Remove an assume if it is followed by an identical assume. diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 76ab614090faa8..c1fe6ff1c54ae3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1539,6 +1539,7 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) { if (StoreBB == DestBB || OtherBB == DestBB) return false; + assert(OtherBB); // Verify that the other block ends in a branch and is not otherwise empty. BasicBlock::iterator BBI(OtherBB->getTerminator()); BranchInst *OtherBr = dyn_cast(BBI); diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index fef051aa1b7c35..421d4346c4593c 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -3085,6 +3085,11 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) { // We can only sink load instructions if there is nothing between the load and // the end of block that could change the value. if (I->mayReadFromMemory()) { + // We can't generally move an instruction that reads from memory past a + // detach or reattach. + if (isa(I->getParent()->getTerminator()) || + isa(I->getParent()->getTerminator())) + return false; for (BasicBlock::iterator Scan = I->getIterator(), E = I->getParent()->end(); Scan != E; ++Scan) @@ -3185,8 +3190,10 @@ bool InstCombiner::run() { // If the user is one of our immediate successors, and if that successor // only has us as a predecessors (we'd have to split the critical edge - // otherwise), we can keep going. - if (UserIsSuccessor && UserParent->getUniquePredecessor()) { + // otherwise), we can keep going. Don't do this if the successor + // follows through a sync instruction, because that's a pessimization. + if (UserIsSuccessor && UserParent->getUniquePredecessor() && + !isa(BB->getTerminator())) { // Okay, the CFG is simple enough, try to sink this instruction. 
if (TryToSinkInstruction(I, UserParent)) { LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n'); diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index f1558c75cb90bf..dd57e8a31e9587 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1194,6 +1194,11 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { if (PreviouslySeenAllocaInfo != ProcessedAllocas.end()) return PreviouslySeenAllocaInfo->getSecond(); + bool FunctionContainsDetach = false; + { + for (const BasicBlock &BB : *(AI.getParent()->getParent())) + FunctionContainsDetach |= isa(BB.getTerminator()); + } bool IsInteresting = (AI.getAllocatedType()->isSized() && // alloca() may be called with 0 size, ignore it. @@ -1201,6 +1206,8 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) { // We are only interested in allocas not promotable to registers. // Promotable allocas are common under -O0. (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) && + (!ClSkipPromotableAllocas || + (!FunctionContainsDetach || !isAllocaParallelPromotable(&AI, *DT))) && // inalloca allocas are not treated as static, and we don't want // dynamic alloca instrumentation for them as well. !AI.isUsedWithInAlloca() && diff --git a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt index 94461849d5094e..0b41031ae18280 100644 --- a/llvm/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/llvm/lib/Transforms/Instrumentation/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMInstrumentation BoundsChecking.cpp CGProfile.cpp ControlHeightReduction.cpp + CilkSanitizer.cpp DataFlowSanitizer.cpp GCOVProfiling.cpp MemorySanitizer.cpp @@ -15,6 +16,7 @@ add_llvm_library(LLVMInstrumentation ThreadSanitizer.cpp EfficiencySanitizer.cpp HWAddressSanitizer.cpp + ComprehensiveStaticInstrumentation.cpp ADDITIONAL_HEADER_DIRS ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms diff --git a/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp new file mode 100644 index 00000000000000..62b3e0b1ed5710 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/CilkSanitizer.cpp @@ -0,0 +1,1164 @@ +//===- CilkSanitizer.cpp - determinacy race detector for Cilk/Tapir -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of CilkSan, a determinacy race detector for Cilk +// programs. +// +// This instrumentation pass inserts calls to the runtime library before +// appropriate memory accesses. 
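+// In addition to loads and stores, the pass instruments detach, task entry
+// and exit, detach continuation, and sync points so that the runtime can
+// track the parallel structure of the execution.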
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/DetachSSA.h" +#include "llvm/Analysis/MemorySSA.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/Module.h" +#include "llvm/ProfileData/InstrProf.h" +#include "llvm/Transforms/CSI.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilksan" + +STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); +STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); +STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); +STATISTIC(NumOmittedReadsBeforeWrite, + "Number of reads ignored due to following writes"); +STATISTIC(NumOmittedReadsFromConstants, + "Number of reads from constant data"); +STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing"); +STATISTIC(NumInstrumentedDetaches, "Number of instrumented detaches"); +STATISTIC(NumInstrumentedDetachExits, "Number of instrumented detach exits"); +STATISTIC(NumInstrumentedSyncs, "Number of instrumented syncs"); + +static const char *const CsanDetachBaseIdName = "__csan_unit_detach_base_id"; +static const char *const CsanTaskBaseIdName = "__csan_unit_task_base_id"; +static const char *const CsanTaskExitBaseIdName = + "__csan_unit_task_exit_base_id"; +static const char *const CsanDetachContinueBaseIdName = + "__csan_unit_detach_continue_base_id"; +static const char *const CsanSyncBaseIdName = "__csan_unit_sync_base_id"; +static const char *const CsiUnitObjTableName = "__csi_unit_obj_table"; +static const char *const CsiUnitObjTableArrayName = "__csi_unit_obj_tables"; + +/// Maintains a mapping from CSI ID of a load or store to the source information +/// of the object accessed by that load or store. +class ObjectTable : public ForensicTable { +public: + ObjectTable() : ForensicTable() {} + ObjectTable(Module &M, StringRef BaseIdName) + : ForensicTable(M, BaseIdName) {} + + /// The number of entries in this table + uint64_t size() const { return LocalIdToSourceLocationMap.size(); } + + /// Add the given instruction to this table. + /// \returns The local ID of the Instruction. + uint64_t add(Instruction &I, Value *Addr, const DataLayout &DL); + + /// Get the Type for a pointer to a table entry. + /// + /// A table entry is just a source location. + static PointerType *getPointerType(LLVMContext &C); + + /// Insert this table into the given Module. + /// + /// The table is constructed as a ConstantArray indexed by local IDs. The + /// runtime is responsible for performing the mapping that allows the table to + /// be indexed by global ID. 
+ Constant *insertIntoModule(Module &M) const; + +private: + struct SourceLocation { + StringRef Name; + int32_t Line; + StringRef Filename; + StringRef Directory; + }; + + /// Map of local ID to SourceLocation. + DenseMap LocalIdToSourceLocationMap; + + /// Create a struct type to match the "struct SourceLocation" type. + /// (and the source_loc_t type in csi.h). + static StructType *getSourceLocStructType(LLVMContext &C); + + /// Append the line and file information to the table. + void add(uint64_t ID, int32_t Line = -1, + StringRef Filename = "", StringRef Directory = "", + StringRef Name = ""); +}; + +namespace { + +struct CilkSanitizerImpl : public CSIImpl { + // CilkSanitizerImpl(Module &M, CallGraph *CG, + // function_ref GetDSSA, + // function_ref GetMSSA) + // : CSIImpl(M, CG), GetDSSA(GetDSSA), GetMSSA(GetMSSA) { + CilkSanitizerImpl(Module &M, CallGraph *CG, + function_ref GetDomTree, + const TargetLibraryInfo *TLI) + : CSIImpl(M, CG), GetDomTree(GetDomTree), TLI(TLI), + CsanFuncEntry(nullptr), CsanFuncExit(nullptr), CsanRead(nullptr), + CsanWrite(nullptr), CsanDetach(nullptr), CsanDetachContinue(nullptr), + CsanTaskEntry(nullptr), CsanTaskExit(nullptr), CsanSync(nullptr) { + // Even though we're doing our own instrumentation, we want the CSI setup + // for the instrumentation of function entry/exit, memory accesses (i.e., + // loads and stores), atomics, memory intrinsics. We also want call sites, + // for extracting debug information. + Options.InstrumentBasicBlocks = false; + // Options.InstrumentCalls = false; + Options.InstrumentMemoryAccesses = false; + Options.InstrumentMemIntrinsics = false; + } + bool run(); + + static StructType *getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType); + static Constant *objTableToUnitObjTable(Module &M, + StructType *UnitObjTableType, + ObjectTable &ObjTable); + + // Methods for handling FED tables + void initializeCsanFEDTables(); + void collectUnitFEDTables(); + + // Methods for handling object tables + void initializeCsanObjectTables(); + void collectUnitObjectTables(); + + CallInst *createRTUnitInitCall(IRBuilder<> &IRB) override; + + // Initialize custom hooks for CilkSanitizer + void initializeCsanHooks(); + + // Insert hooks at relevant program points + bool instrumentLoadOrStore(Instruction *I, const DataLayout &DL); + bool instrumentAtomic(Instruction *I, const DataLayout &DL); + bool instrumentMemIntrinsic(Instruction *I, const DataLayout &DL); + bool instrumentCallsite(Instruction *I, DominatorTree *DT); + bool instrumentDetach(DetachInst *DI, DominatorTree *DT); + bool instrumentSync(SyncInst *SI); + bool instrumentFunction(Function &F); + void chooseInstructionsToInstrument( + SmallVectorImpl &Local, + SmallVectorImpl &All, + const DataLayout &DL); + +private: + // Analysis results + // function_ref GetDSSA; + // function_ref GetMSSA; + function_ref GetDomTree; + const TargetLibraryInfo *TLI; + + // Instrumentation hooks + Function *CsanFuncEntry, *CsanFuncExit; + Function *CsanRead, *CsanWrite; + Function *CsanLargeRead, *CsanLargeWrite; + Function *CsanDetach, *CsanDetachContinue; + Function *CsanTaskEntry, *CsanTaskExit; + Function *CsanSync; + + // CilkSanitizer FED tables + FrontEndDataTable DetachFED, TaskFED, TaskExitFED, DetachContinueFED, + SyncFED; + + // CilkSanitizer custom forensic tables + ObjectTable LoadObj, StoreObj; + + SmallVector UnitObjTables; + +}; + +/// CilkSanitizer: instrument the code in module to find races. 
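+/// Legacy pass-manager wrapper for the CilkSanitizer instrumentation.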
+struct CilkSanitizer : public ModulePass { + static char ID; // Pass identification, replacement for typeid. + CilkSanitizer() : ModulePass(ID) { + initializeCilkSanitizerPass(*PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "CilkSanitizer"; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnModule(Module &M); +}; +} // namespace + +char CilkSanitizer::ID = 0; + +INITIALIZE_PASS_BEGIN( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) +INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(DetachSSAWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass) +INITIALIZE_PASS_END( + CilkSanitizer, "csan", + "CilkSanitizer: detects determinacy races in Cilk programs.", + false, false) + +void CilkSanitizer::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + // AU.addRequired(); + // AU.addRequired(); +} + +ModulePass *llvm::createCilkSanitizerPass() { + return new CilkSanitizer(); +} + +uint64_t ObjectTable::add(Instruction &I, + Value *Addr, + const DataLayout &DL) { + uint64_t ID = getId(&I); + Value *Obj = GetUnderlyingObject(Addr, DL); + + // First, if the underlying object is a global variable, get that variable's + // debug information. + if (GlobalVariable *GV = dyn_cast(Obj)) { + SmallVector DbgGVExprs; + GV->getDebugInfo(DbgGVExprs); + for (auto *GVE : DbgGVExprs) { + auto *DGV = GVE->getVariable(); + if (DGV->getName() != "") { + add(ID, DGV->getLine(), DGV->getFilename(), DGV->getDirectory(), + DGV->getName()); + return ID; + } + } + add(ID); + return ID; + } + + // Next, if this is an alloca instruction, look for a llvm.dbg.declare + // intrinsic. + if (isa(Obj)) { + if (auto *DDI = FindAllocaDbgDeclare(Obj)) { + auto *LV = DDI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + } + + // Otherwise just examine the llvm.dbg.value intrinsics for this object. 
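+  // This path covers objects that are neither globals nor allocas with a
+  // dbg.declare, such as pointers that reach this function as arguments.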
+ SmallVector DbgValues; + findDbgValues(DbgValues, Obj); + for (auto *DVI : DbgValues) { + auto *LV = DVI->getVariable(); + if (LV->getName() != "") { + add(ID, LV->getLine(), LV->getFilename(), LV->getDirectory(), + LV->getName()); + return ID; + } + } + + add(ID); + return ID; +} + +PointerType *ObjectTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *ObjectTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void ObjectTable::add(uint64_t ID, int32_t Line, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Filename, Directory}; +} + +Constant *ObjectTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *TableType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector TableEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_object_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_object_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. 
+ TableEntries.push_back(ConstantStruct::get(TableType, Name, Line, File)); + } + + ArrayType *TableArrayType = ArrayType::get(TableType, TableEntries.size()); + Constant *Table = ConstantArray::get(TableArrayType, TableEntries); + GlobalVariable *GV = + new GlobalVariable(M, TableArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitObjTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +bool CilkSanitizerImpl::run() { + initializeCsi(); + initializeCsanFEDTables(); + initializeCsanObjectTables(); + initializeCsanHooks(); + + for (Function &F : M) { + DEBUG(dbgs() << "Instrumenting " << F.getName() << "\n"); + instrumentFunction(F); + } + + collectUnitFEDTables(); + collectUnitObjectTables(); + finalizeCsi(); + return true; +} + +void CilkSanitizerImpl::initializeCsanFEDTables() { + DetachFED = FrontEndDataTable(M, CsanDetachBaseIdName); + TaskFED = FrontEndDataTable(M, CsanTaskBaseIdName); + TaskExitFED = FrontEndDataTable(M, CsanTaskExitBaseIdName); + DetachContinueFED = FrontEndDataTable(M, CsanDetachContinueBaseIdName); + SyncFED = FrontEndDataTable(M, CsanSyncBaseIdName); +} + +void CilkSanitizerImpl::initializeCsanObjectTables() { + LoadObj = ObjectTable(M, CsiLoadBaseIdName); + StoreObj = ObjectTable(M, CsiStoreBaseIdName); +} + +void CilkSanitizerImpl::collectUnitFEDTables() { + CSIImpl::collectUnitFEDTables(); + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csanrt.c and the + // csan_instrumentation_counts_t in csan.h. + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, TaskExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, DetachContinueFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, SyncFED)); +} + +// Create a struct type to match the unit_obj_entry_t type in csanrt.c. 
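+// Each unit object table pairs the number of entries with a pointer to the
+// entry array itself.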
+StructType *CilkSanitizerImpl::getUnitObjTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + EntryPointerType); +} + +Constant *CilkSanitizerImpl::objTableToUnitObjTable( + Module &M, StructType *UnitObjTableType, ObjectTable &ObjTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), ObjTable.size()); + // Constant *BaseIdPtr = + // ConstantExpr::getPointerCast(FedTable.baseId(), + // Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = ObjTable.insertIntoModule(M); + return ConstantStruct::get(UnitObjTableType, NumEntries, + InsertedTable); +} + +void CilkSanitizerImpl::collectUnitObjectTables() { + LLVMContext &C = M.getContext(); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, LoadObj)); + UnitObjTables.push_back( + objTableToUnitObjTable(M, UnitObjTableType, StoreObj)); +} + +CallInst *CilkSanitizerImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + StructType *UnitObjTableType = + getUnitObjTableType(C, ObjectTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + PointerType::get(UnitObjTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *FEDTable = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *FEDGV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, FEDTable, + CsiUnitFedTableArrayName); + + ArrayType *UnitObjTableArrayType = + ArrayType::get(UnitObjTableType, UnitObjTables.size()); + Constant *ObjTable = ConstantArray::get(UnitObjTableArrayType, UnitObjTables); + GlobalVariable *ObjGV = new GlobalVariable(M, UnitObjTableArrayType, false, + GlobalValue::InternalLinkage, ObjTable, + CsiUnitObjTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(FEDGV->getValueType(), FEDGV, GepArgs), + ConstantExpr::getGetElementPtr(ObjGV->getValueType(), ObjGV, GepArgs), + InitCallsiteToFunction}); +} + +void CilkSanitizerImpl::initializeCsanHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + Type *LargeNumBytesType = IntptrTy; + Type *IDType = IRB.getInt64Ty(); + + CsanFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_func_entry", RetType, + /* func_id */ IDType, + /* stack_ptr */ AddrType, + FuncPropertyTy)); + CsanFuncExit = checkCsiInterfaceFunction( + 
M.getOrInsertFunction("__csan_func_exit", RetType, + /* func_exit_id */ IDType, + /* func_id */ IDType, + FuncExitPropertyTy)); + + CsanRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_load", RetType, IDType, + AddrType, NumBytesType, LoadPropertyTy)); + CsanWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_store", RetType, IDType, + AddrType, NumBytesType, StorePropertyTy)); + CsanLargeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_load", RetType, IDType, + AddrType, LargeNumBytesType, LoadPropertyTy)); + CsanLargeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_large_store", RetType, IDType, + AddrType, LargeNumBytesType, StorePropertyTy)); + // CsanWrite = checkCsiInterfaceFunction( + // M.getOrInsertFunction("__csan_atomic_exchange", RetType, IDType, + // AddrType, NumBytesType, StorePropertyTy)); + + CsanDetach = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach", RetType, + /* detach_id */ IDType)); + CsanTaskEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task", RetType, + /* task_id */ IDType, + /* detach_id */ IDType, + /* stack_ptr */ AddrType)); + CsanTaskExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_task_exit", RetType, + /* task_exit_id */ IDType, + /* task_id */ IDType, + /* detach_id */ IDType)); + CsanDetachContinue = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_detach_continue", RetType, + /* detach_continue_id */ IDType, + /* detach_id */ IDType)); + CsanSync = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csan_sync", RetType, IDType)); +} + +// Do not instrument known races/"benign races" that come from compiler +// instrumentatin. The user has no way of suppressing them. +static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) { + // Peel off GEPs and BitCasts. + Addr = Addr->stripInBoundsOffsets(); + + if (GlobalVariable *GV = dyn_cast(Addr)) { + if (GV->hasSection()) { + StringRef SectionName = GV->getSection(); + // Check if the global is in the PGO counters section. + auto OF = Triple(M->getTargetTriple()).getObjectFormat(); + if (SectionName.endswith( + getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false))) + return false; + } + + // Check if the global is private gcov data. + if (GV->getName().startswith("__llvm_gcov") || + GV->getName().startswith("__llvm_gcda")) + return false; + } + + // Do not instrument acesses from different address spaces; we cannot deal + // with them. + if (Addr) { + Type *PtrTy = cast(Addr->getType()->getScalarType()); + if (PtrTy->getPointerAddressSpace() != 0) + return false; + } + + return true; +} + +// Examine the uses of a given AllocaInst to determine if some use is detached. 
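+// The address of such an alloca can be visible to a spawned task, so accesses
+// to it may participate in a race and must not be dropped from instrumentation.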
+static bool MightHaveDetachedUse(const AllocaInst *AI) { + const BasicBlock *AllocaCtx = GetDetachedCtx(AI->getParent()); + SmallVector Worklist; + SmallSet Visited; + + for (const Use &U : AI->uses()) { + Visited.insert(&U); + Worklist.push_back(&U); + } + + while (!Worklist.empty()) { + const Use *U = Worklist.pop_back_val(); + Instruction *I = cast(U->getUser()); + if (AllocaCtx != GetDetachedCtx(I->getParent())) + return true; + + switch (I->getOpcode()) { + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + case Instruction::AddrSpaceCast: + for (Use &UU : I->uses()) + if (Visited.insert(&UU).second) + Worklist.push_back(&UU); + break; + default: + break; + } + } + return false; +} + +void CilkSanitizerImpl::chooseInstructionsToInstrument( + SmallVectorImpl &Local, SmallVectorImpl &All, + const DataLayout &DL) { + SmallSet WriteTargets; + // Iterate from the end. + for (Instruction *I : reverse(Local)) { + if (StoreInst *Store = dyn_cast(I)) { + Value *Addr = Store->getPointerOperand(); + if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr)) + continue; + WriteTargets.insert(Addr); + } else { + LoadInst *Load = cast(I); + Value *Addr = Load->getPointerOperand(); + if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr)) + continue; + if (WriteTargets.count(Addr)) { + // We will write to this temp, so no reason to analyze the read. + NumOmittedReadsBeforeWrite++; + continue; + } + if (addrPointsToConstantData(Addr)) { + // Addr points to some constant data -- it can not race with any writes. + NumOmittedReadsFromConstants++; + continue; + } + } + Value *Addr = isa(*I) + ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + continue; + } + All.push_back(I); + } + Local.clear(); +} + +bool CilkSanitizerImpl::instrumentFunction(Function &F) { + if (F.empty() || shouldNotInstrumentFunction(F)) + return false; + + DominatorTree *DT = &GetDomTree(F); + // DetachSSA &DSSA = GetDSSA(F); + // MemorySSA &MSSA = GetMSSA(F); + + SmallVector AllLoadsAndStores; + SmallVector LocalLoadsAndStores; + SmallVector AtomicAccesses; + SmallVector MemIntrinCalls; + SmallVector Callsites; + SmallVector Detaches; + SmallVector Syncs; + bool Res = false; + bool HasCalls = false; + bool MaySpawn = false; + + // TODO: Consider modifying this to choose instrumentation to insert based on + // fibrils, not basic blocks. 
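+  // Single pass over the function: record the detaches and syncs, and collect
+  // the memory accesses that are worth instrumenting.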
+ for (BasicBlock &BB : F) { + // Record the Tapir instructions found + if (DetachInst *DI = dyn_cast(BB.getTerminator())) { + MaySpawn = true; + Detaches.push_back(DI); + } else if (SyncInst *SI = dyn_cast(BB.getTerminator())) + Syncs.push_back(SI); + + // Record the memory accesses in the basic block + for (Instruction &Inst : BB) { + if (isa(Inst) || isa(Inst)) + LocalLoadsAndStores.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) + AtomicAccesses.push_back(&Inst); + else if (isa(Inst) || isa(Inst)) { + if (CallInst *CI = dyn_cast(&Inst)) + maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI); + if (isa(Inst)) + MemIntrinCalls.push_back(&Inst); + if (!isa(Inst)) { + if (!isa(Inst)) + Callsites.push_back(&Inst); + HasCalls = true; + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, + DL); + } + } + } + chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL); + } + + uint64_t LocalId = getLocalFunctionID(F); + + for (auto Inst : AllLoadsAndStores) + Res |= instrumentLoadOrStore(Inst, DL); + + for (auto Inst : AtomicAccesses) + Res |= instrumentAtomic(Inst, DL); + + for (auto Inst : MemIntrinCalls) + Res |= instrumentMemIntrinsic(Inst, DL); + + for (auto Inst : Callsites) + Res |= instrumentCallsite(Inst, DT); + + for (auto Inst : Detaches) + Res |= instrumentDetach(Inst, DT); + + for (auto Inst : Syncs) + Res |= instrumentSync(Inst); + + if ((Res || HasCalls)) { + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + CsiFuncProperty FuncEntryProp; + FuncEntryProp.setMaySpawn(MaySpawn); + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + // TODO: Determine if we actually want the frame pointer, not the stack + // pointer. + // Value *StackSave = IRB.CreateCall( + // Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + // IRB.CreateCall(CsanFuncEntry, {FuncId, StackSave, FuncEntryProp.getValue(IRB)}); + Value *FrameAddr = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + {IRB.getInt32(0)}); + IRB.CreateCall(CsanFuncEntry, {FuncId, FrameAddr, FuncEntryProp.getValue(IRB)}); + + EscapeEnumerator EE(F, "csan_cleanup", true); + while (IRBuilder<> *AtExit = EE.Next()) { + // uint64_t ExitLocalId = FunctionExitFED.add(F); + uint64_t ExitLocalId = FunctionExitFED.add(*AtExit->GetInsertPoint()); + Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, *AtExit); + CsiFuncExitProperty FuncExitProp; + FuncExitProp.setMaySpawn(MaySpawn); + AtExit->CreateCall(CsanFuncExit, + {ExitCsiId, FuncId, FuncExitProp.getValue(*AtExit)}); + } + } + return Res; +} + +bool CilkSanitizerImpl::instrumentLoadOrStore(Instruction *I, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa(*I); + Value *Addr = IsWrite + ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + + // swifterror memory addresses are mem2reg promoted by instruction selection. + // As such they cannot have regular uses like an instrumentation function and + // it makes no sense to track them as memory. + if (Addr->isSwiftError()) + return false; + + int NumBytesAccessed = getNumBytesAccessed(Addr, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + const unsigned Alignment = IsWrite + ? 
cast(I)->getAlignment() + : cast(I)->getAlignment(); + CsiLoadStoreProperty Prop; + Prop.setAlignment(Alignment); + if (IsWrite) { + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + } else { + uint64_t LocalId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, Addr, DL); + assert(LocalId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanRead, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedReads++; + } + return true; +} + +bool CilkSanitizerImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) { + IRBuilder<> IRB(I); + CsiLoadStoreProperty Prop; + Value *Addr; + if (AtomicRMWInst *RMWI = dyn_cast(I)) { + Addr = RMWI->getPointerOperand(); + } else if (AtomicCmpXchgInst *CASI = dyn_cast(I)) { + Addr = CASI->getPointerOperand(); + } else { + return false; + } + + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + return false; + } + + int NumBytesAccessed = getNumBytesAccessed(Addr, DL); + if (-1 == NumBytesAccessed) { + // Ignore accesses with bad sizes. + NumAccessesWithBadSize++; + return false; + } + + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.getInt32(NumBytesAccessed), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanWrite, Args); + IRB.SetInstDebugLocation(Call); + NumInstrumentedWrites++; + return true; +} + +bool CilkSanitizerImpl::instrumentMemIntrinsic(Instruction *I, + const DataLayout &DL) { + CsiLoadStoreProperty Prop; + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + // Check if we need to instrument the memset. + Value *Addr = M->getArgOperand(0); + Value *Obj = GetUnderlyingObject(Addr, DL); + if (isa(Obj) && + !PointerMayBeCaptured(Addr, true, true) && + !MightHaveDetachedUse(cast(Obj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). 
+ NumOmittedNonCaptured++; + return false; + } + + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + uint64_t LocalId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, Addr, DL); + assert(LocalId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + Value *Args[] = {CsiId, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *Call = IRB.CreateCall(CsanLargeWrite, Args); + IRB.SetInstDebugLocation(Call); + return true; + + } else if (MemTransferInst *M = dyn_cast(I)) { + if (ConstantInt *CI = dyn_cast(M->getArgOperand(3))) + Prop.setAlignment(CI->getZExtValue()); + Value *StoreAddr = M->getArgOperand(0); + Value *LoadAddr = M->getArgOperand(1); + bool Instrumented = false; + + // First check if we need to instrument the store. + Value *SObj = GetUnderlyingObject(StoreAddr, DL); + if (isa(SObj) && + !PointerMayBeCaptured(StoreAddr, true, true) && + !MightHaveDetachedUse(cast(SObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + } else { + // Instrument the store + uint64_t StoreId = StoreFED.add(*I); + uint64_t StoreObjId = StoreObj.add(*I, StoreAddr, DL); + assert(StoreId == StoreObjId && + "Store received different ID's in FED and object tables."); + Value *StoreCsiId = StoreFED.localToGlobalId(StoreId, IRB); + Value *StoreArgs[] = {StoreCsiId, + IRB.CreatePointerCast(StoreAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *WriteCall = IRB.CreateCall(CsanLargeWrite, StoreArgs); + IRB.SetInstDebugLocation(WriteCall); + Instrumented = true; + } + Value *LObj = GetUnderlyingObject(LoadAddr, DL); + if (isa(LObj) && + !PointerMayBeCaptured(LoadAddr, true, true) && + !MightHaveDetachedUse(cast(LObj))) { + // The variable is addressable but not captured, so it cannot be + // referenced from a different thread and participate in a data race + // (see llvm/Analysis/CaptureTracking.h for details). + NumOmittedNonCaptured++; + } else { + // Instrument the load + uint64_t LoadId = LoadFED.add(*I); + uint64_t LoadObjId = LoadObj.add(*I, LoadAddr, DL); + assert(LoadId == LoadObjId && + "Load received different ID's in FED and object tables."); + Value *LoadCsiId = StoreFED.localToGlobalId(LoadId, IRB); + Value *LoadArgs[] = {LoadCsiId, + IRB.CreatePointerCast(LoadAddr, IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false), + Prop.getValue(IRB)}; + Instruction *ReadCall = IRB.CreateCall(CsanLargeRead, LoadArgs); + IRB.SetInstDebugLocation(ReadCall); + Instrumented = true; + } + return Instrumented; + } + return false; +} + +bool CilkSanitizerImpl::instrumentCallsite(Instruction *I, DominatorTree *DT) { + // Exclude calls to the syncregion.start intrinsic. 
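+  // Lifetime markers are skipped for the same reason: neither they nor
+  // syncregion.start perform any work at run time, so there is no call to
+  // instrument.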
+ if (IntrinsicInst *II = dyn_cast(I)) + if (Intrinsic::syncregion_start == II->getIntrinsicID() || + Intrinsic::lifetime_start == II->getIntrinsicID() || + Intrinsic::lifetime_end == II->getIntrinsicID()) + return false; + + bool IsInvoke = isa(I); + + Function *Called = NULL; + if (CallInst *CI = dyn_cast(I)) + Called = CI->getCalledFunction(); + else if (InvokeInst *II = dyn_cast(I)) + Called = II->getCalledFunction(); + + IRBuilder<> IRB(I); + uint64_t LocalId = CallsiteFED.add(*I); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = NULL; + GlobalVariable *FuncIdGV = NULL; + if (Called) { + Module *M = I->getParent()->getParent()->getParent(); + std::string GVName = + CsiFuncIdVariablePrefix + Called->getName().str(); + FuncIdGV = dyn_cast(M->getOrInsertGlobal(GVName, + IRB.getInt64Ty())); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + FuncId = IRB.CreateLoad(FuncIdGV); + } else { + // Unknown targets (i.e. indirect calls) are always unknown. + FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId); + } + assert(FuncId != NULL); + CsiCallProperty Prop; + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, CsiBeforeCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block + // and the exception block. This also means we have to recompute + // the callsite and function IDs in each basic block so that we + // can use it for the after hook. + + // TODO: Do we want the "after" hook for this callsite to come + // before or after the BB entry hook? Currently it is inserted + // before BB entry because instrumentCallsite is called after + // instrumentBasicBlock. + + // TODO: If a destination of an invoke has multiple predecessors, then we + // must split that destination. + InvokeInst *II = dyn_cast(I); + BasicBlock *NormalBB = II->getNormalDest(); + unsigned SuccNum = GetSuccessorNumber(II->getParent(), NormalBB); + if (isCriticalEdge(II, SuccNum)) + NormalBB = SplitCriticalEdge(II, SuccNum, + CriticalEdgeSplittingOptions(DT)); + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock *UnwindBB = II->getUnwindDest(); + IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } else { + // Simple call instruction; there is only one "after" position. 
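//===--- Sketch (not part of the patch): why invokes need two after-hooks -===//
// A call inside a try block lowers to an invoke with a normal and an unwind
// successor, and control may resume along either edge, so the after-call hook
// emitted above has to appear in both destinations (splitting a critical edge
// on the normal path when necessary). may_throw() is a stand-in callee.
void may_throw();
void caller() {
  try {
    may_throw();   // invoke: after-hook in the normal destination...
  } catch (...) {  // ...and again on the unwind path before handler code runs
  }
}
//===--------------------------------------------------------------------===//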
+ Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*Iter, CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } + + return true; +} + +bool CilkSanitizerImpl::instrumentDetach(DetachInst *DI, + DominatorTree *DT) { + // Instrument the detach instruction itself + Value *DetachID; + { + IRBuilder<> IRB(DI); + uint64_t LocalID = DetachFED.add(*DI); + DetachID = DetachFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanDetach, {DetachID}); + IRB.SetInstDebugLocation(Call); + } + NumInstrumentedDetaches++; + + // Find the detached block, continuation, and associated reattaches. + BasicBlock *DetachedBlock = DI->getDetached(); + BasicBlock *ContinueBlock = DI->getContinue(); + SmallVector TaskExits; + // TODO: Extend this loop to find EH exits of the detached task. + for (BasicBlock *Pred : predecessors(ContinueBlock)) + if (isa(Pred->getTerminator())) + TaskExits.push_back(Pred); + + // Instrument the entry and exit points of the detached task. + { + // Instrument the entry point of the detached task. + IRBuilder<> IRB(&*DetachedBlock->getFirstInsertionPt()); + uint64_t LocalID = TaskFED.add(*DetachedBlock); + Value *TaskID = TaskFED.localToGlobalId(LocalID, IRB); + // TODO: Determine if we actually want the frame pointer, not the stack + // pointer. + // Value *StackSave = IRB.CreateCall( + // Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + // Instruction *Call = IRB.CreateCall(CsanTaskEntry, + // {TaskID, DetachID, StackSave}); + Value *FrameAddr = IRB.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + {IRB.getInt32(0)}); + Instruction *Call = IRB.CreateCall(CsanTaskEntry, + {TaskID, DetachID, FrameAddr}); + IRB.SetInstDebugLocation(Call); + + // Instrument the exit points of the detached tasks. + for (BasicBlock *TaskExit : TaskExits) { + IRBuilder<> IRB(TaskExit->getTerminator()); + uint64_t LocalID = TaskExitFED.add(*TaskExit->getTerminator()); + Value *TaskExitID = TaskExitFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanTaskExit, + {TaskExitID, TaskID, DetachID}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedDetachExits++; + } + } + + // Instrument the continuation of the detach. + { + if (isCriticalContinueEdge(DI, 1)) + ContinueBlock = SplitCriticalEdge( + DI, 1, + CriticalEdgeSplittingOptions(DT).setSplitDetachContinue()); + + IRBuilder<> IRB(&*ContinueBlock->getFirstInsertionPt()); + uint64_t LocalID = DetachContinueFED.add(*ContinueBlock); + Value *ContinueID = DetachContinueFED.localToGlobalId(LocalID, IRB); + Instruction *Call = IRB.CreateCall(CsanDetachContinue, + {ContinueID, DetachID}); + IRB.SetInstDebugLocation(Call); + } + return true; +} + +bool CilkSanitizerImpl::instrumentSync(SyncInst *SI) { + IRBuilder<> IRB(SI); + // Get the ID of this sync. + uint64_t LocalID = SyncFED.add(*SI); + Value *SyncID = SyncFED.localToGlobalId(LocalID, IRB); + // Insert instrumentation before the sync. 
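//===--- Sketch (not part of the patch): hook placement for one detach ----===//
// Summary of what instrumentDetach() above emits, using the hook handles named
// in this file (the concrete runtime symbol names are defined elsewhere):
//
//   spawner:       CsanDetach(detach_id)                  just before the detach
//   detached task: CsanTaskEntry(task_id, detach_id, frame_address)
//                  ... task body ...
//                  CsanTaskExit(exit_id, task_id, detach_id) before each reattach
//   continuation:  CsanDetachContinue(continue_id, detach_id) at the continue
//                  block, after splitting a critical continue edge if needed
//===--------------------------------------------------------------------===//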
+ Instruction *Call = IRB.CreateCall(CsanSync, {SyncID}); + IRB.SetInstDebugLocation(Call); + NumInstrumentedSyncs++; + return true; +} + +bool CilkSanitizer::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + // auto GetDSSA = [this](Function &F) -> DetachSSA & { + // return this->getAnalysis(F).getDSSA(); + // }; + // auto GetMSSA = [this](Function &F) -> MemorySSA & { + // return this->getAnalysis(F).getMSSA(); + // }; + + CallGraph *CG = &getAnalysis().getCallGraph(); + const TargetLibraryInfo *TLI = + &getAnalysis().getTLI(); + auto GetDomTree = [this](Function &F) -> DominatorTree & { + return this->getAnalysis(F).getDomTree(); + }; + + // return CilkSanitizerImpl(M, CG, GetDSSA, GetMSSA).run(); + return CilkSanitizerImpl(M, CG, GetDomTree, TLI).run(); +} diff --git a/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp new file mode 100644 index 00000000000000..1446eb4b8e7dd3 --- /dev/null +++ b/llvm/lib/Transforms/Instrumentation/ComprehensiveStaticInstrumentation.cpp @@ -0,0 +1,982 @@ +//===-- ComprehensiveStaticInstrumentation.cpp - instrumentation hooks ----===// +// +// The LLVM Compiler Infrastructure +// +// TODO: License +//===----------------------------------------------------------------------===// +// +// This file is part of CSI, a framework that provides comprehensive static +// instrumentation. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/CSI.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CaptureTracking.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "csi" + +static cl::opt ClInstrumentFuncEntryExit( + "csi-instrument-func-entry-exit", cl::init(true), + cl::desc("Instrument function entry and exit"), cl::Hidden); +static cl::opt ClInstrumentBasicBlocks( + "csi-instrument-basic-blocks", cl::init(true), + cl::desc("Instrument basic blocks"), cl::Hidden); +static cl::opt ClInstrumentMemoryAccesses( + "csi-instrument-memory-accesses", cl::init(true), + cl::desc("Instrument memory accesses"), cl::Hidden); +static cl::opt ClInstrumentCalls( + "csi-instrument-function-calls", cl::init(true), + cl::desc("Instrument function calls"), cl::Hidden); +static cl::opt ClInstrumentAtomics( + "csi-instrument-atomics", cl::init(true), + cl::desc("Instrument atomics"), cl::Hidden); +static cl::opt ClInstrumentMemIntrinsics( + "csi-instrument-memintrinsics", cl::init(true), + cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden); + +namespace { + +static CSIOptions OverrideFromCL(CSIOptions Options) { + Options.InstrumentFuncEntryExit |= ClInstrumentFuncEntryExit; + Options.InstrumentBasicBlocks |= ClInstrumentBasicBlocks; + Options.InstrumentMemoryAccesses |= ClInstrumentMemoryAccesses; + Options.InstrumentCalls |= ClInstrumentCalls; + Options.InstrumentAtomics |= ClInstrumentAtomics; + Options.InstrumentMemIntrinsics |= ClInstrumentMemIntrinsics; + return Options; +} + +/// The Comprehensive Static Instrumentation pass. 
+/// Inserts calls to user-defined hooks at predefined points in the IR. +struct ComprehensiveStaticInstrumentation : public ModulePass { + static char ID; // Pass identification, replacement for typeid. + + ComprehensiveStaticInstrumentation( + const CSIOptions &Options = CSIOptions()) + : ModulePass(ID), Options(OverrideFromCL(Options)) { + initializeComprehensiveStaticInstrumentationPass( + *PassRegistry::getPassRegistry()); + } + StringRef getPassName() const override { + return "ComprehensiveStaticInstrumentation"; + } + bool runOnModule(Module &M) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + CSIOptions Options; +}; // struct ComprehensiveStaticInstrumentation +} // anonymous namespace + +char ComprehensiveStaticInstrumentation::ID = 0; + +INITIALIZE_PASS(ComprehensiveStaticInstrumentation, "csi", + "ComprehensiveStaticInstrumentation pass", false, false) + +ModulePass *llvm::createComprehensiveStaticInstrumentationPass( + const CSIOptions &Options) { + return new ComprehensiveStaticInstrumentation(Options); +} + +/// Return the first DILocation in the given basic block, or nullptr +/// if none exists. +static const DILocation *getFirstDebugLoc(const BasicBlock &BB) { + for (const Instruction &Inst : BB) + if (const DILocation *Loc = Inst.getDebugLoc()) + return Loc; + + return nullptr; +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Instruction *Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented->getFunction()->getSubprogram(); + if (Subprog) { + if (Instrumented->getDebugLoc()) { + Call->setDebugLoc(Instrumented->getDebugLoc()); + } else { + LLVMContext &C = Instrumented->getFunction()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(BasicBlock &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getParent()->getSubprogram(); + if (Subprog) { + if (const DILocation *FirstDebugLoc = getFirstDebugLoc(Instrumented)) + Call->setDebugLoc(FirstDebugLoc); + else { + LLVMContext &C = Instrumented.getParent()->getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } + } +} + +/// Set DebugLoc on the call instruction to a CSI hook, based on the +/// debug information of the instrumented instruction. +static void setInstrumentationDebugLoc(Function &Instrumented, + Instruction *Call) { + DISubprogram *Subprog = Instrumented.getSubprogram(); + if (Subprog) { + LLVMContext &C = Instrumented.getParent()->getContext(); + Call->setDebugLoc(DILocation::get(C, 0, 0, Subprog)); + } +} + +bool CSIImpl::run() { + initializeCsi(); + + for (Function &F : M) + instrumentFunction(F); + + collectUnitFEDTables(); + finalizeCsi(); + return true; // We always insert the unit constructor. 
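//===--- Sketch (not part of the patch): scheduling CSI from a driver -----===//
// One way a driver might add the pass; the CSIOptions field names are the ones
// consumed by OverrideFromCL() above. Note that OverrideFromCL() ORs the
// cl::opt values (which default to true) into the caller's request, so a
// category disabled in CSIOptions stays disabled only if the matching
// -csi-instrument-* flag is also passed as false.
void addCSIPass(legacy::PassManagerBase &PM) {
  CSIOptions Options;                 // every hook category defaults to enabled
  Options.InstrumentBasicBlocks = false;  // honored only with the flag set false
  PM.add(createComprehensiveStaticInstrumentationPass(Options));
}
//===--------------------------------------------------------------------===//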
+} + +ForensicTable::ForensicTable(Module &M, StringRef BaseIdName) { + LLVMContext &C = M.getContext(); + IntegerType *Int64Ty = IntegerType::get(C, 64); + IdCounter = 0; + BaseId = new GlobalVariable(M, Int64Ty, false, GlobalValue::InternalLinkage, + ConstantInt::get(Int64Ty, 0), BaseIdName); + assert(BaseId); +} + +uint64_t ForensicTable::getId(const Value *V) { + if (!ValueToLocalIdMap.count(V)) + ValueToLocalIdMap[V] = IdCounter++; + assert(ValueToLocalIdMap.count(V) && "Value not in ID map."); + return ValueToLocalIdMap[V]; +} + +Value *ForensicTable::localToGlobalId(uint64_t LocalId, + IRBuilder<> &IRB) const { + assert(BaseId); + LLVMContext &C = IRB.getContext(); + LoadInst *Base = IRB.CreateLoad(BaseId); + MDNode *MD = llvm::MDNode::get(C, None); + Base->setMetadata(LLVMContext::MD_invariant_load, MD); + Value *Offset = IRB.getInt64(LocalId); + return IRB.CreateAdd(Base, Offset); +} + +uint64_t FrontEndDataTable::add(const Function &F) { + uint64_t ID = getId(&F); + add(ID, F.getSubprogram()); + return ID; +} + +uint64_t FrontEndDataTable::add(const BasicBlock &BB) { + uint64_t ID = getId(&BB); + add(ID, getFirstDebugLoc(BB)); + return ID; +} + +uint64_t FrontEndDataTable::add(const Instruction &I) { + uint64_t ID = getId(&I); + add(ID, I.getDebugLoc()); + return ID; +} + +PointerType *FrontEndDataTable::getPointerType(LLVMContext &C) { + return PointerType::get(getSourceLocStructType(C), 0); +} + +StructType *FrontEndDataTable::getSourceLocStructType(LLVMContext &C) { + return StructType::get( + /* Name */ PointerType::get(IntegerType::get(C, 8), 0), + /* Line */ IntegerType::get(C, 32), + /* Column */ IntegerType::get(C, 32), + /* File */ PointerType::get(IntegerType::get(C, 8), 0)); +} + +void FrontEndDataTable::add(uint64_t ID, const DILocation *Loc) { + if (Loc) { + // TODO: Add location information for inlining + const DISubprogram *Subprog = Loc->getScope()->getSubprogram(); + add(ID, (int32_t)Loc->getLine(), (int32_t)Loc->getColumn(), + Loc->getFilename(), Loc->getDirectory(), Subprog->getName()); + } else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, const DISubprogram *Subprog) { + if (Subprog) + add(ID, (int32_t)Subprog->getLine(), -1, Subprog->getFilename(), + Subprog->getDirectory(), Subprog->getName()); + else + add(ID); +} + +void FrontEndDataTable::add(uint64_t ID, int32_t Line, int32_t Column, + StringRef Filename, StringRef Directory, + StringRef Name) { + assert(LocalIdToSourceLocationMap.find(ID) == + LocalIdToSourceLocationMap.end() && + "Id already exists in FED table."); + LocalIdToSourceLocationMap[ID] = {Name, Line, Column, Filename, Directory}; +} + +Constant *FrontEndDataTable::insertIntoModule(Module &M) const { + LLVMContext &C = M.getContext(); + StructType *FedType = getSourceLocStructType(C); + IntegerType *Int32Ty = IntegerType::get(C, 32); + Constant *Zero = ConstantInt::get(Int32Ty, 0); + Value *GepArgs[] = {Zero, Zero}; + SmallVector FEDEntries; + + for (uint64_t LocalID = 0; LocalID < IdCounter; ++LocalID) { + const SourceLocation &E = LocalIdToSourceLocationMap.find(LocalID)->second; + Constant *Line = ConstantInt::get(Int32Ty, E.Line); + Constant *Column = ConstantInt::get(Int32Ty, E.Column); + Constant *File; + { + std::string Filename = E.Filename.str(); + if (!E.Directory.empty()) + Filename = E.Directory.str() + "/" + Filename; + Constant *FileStrConstant = ConstantDataArray::getString(C, Filename); + GlobalVariable *GV = + M.getGlobalVariable("__csi_unit_filename_" + Filename, true); + if (GV == NULL) { + GV = new 
GlobalVariable(M, FileStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + FileStrConstant, + "__csi_unit_filename_" + Filename, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + File = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + Constant *Name; + if (E.Name.empty()) + Name = ConstantPointerNull::get(PointerType::get( + IntegerType::get(C, 8), 0)); + else { + Constant *NameStrConstant = ConstantDataArray::getString(C, E.Name); + GlobalVariable *GV = + M.getGlobalVariable(("__csi_unit_function_name_" + E.Name).str(), true); + if (GV == NULL) { + GV = new GlobalVariable(M, NameStrConstant->getType(), + true, GlobalValue::PrivateLinkage, + NameStrConstant, + "__csi_unit_function_name_" + E.Name, + nullptr, + GlobalVariable::NotThreadLocal, 0); + GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); + } + assert(GV); + Name = + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); + } + // The order of arguments to ConstantStruct::get() must match the + // source_loc_t type in csi.h. + FEDEntries.push_back(ConstantStruct::get(FedType, Name, Line, Column, + File)); + } + + ArrayType *FedArrayType = ArrayType::get(FedType, FEDEntries.size()); + Constant *Table = ConstantArray::get(FedArrayType, FEDEntries); + GlobalVariable *GV = + new GlobalVariable(M, FedArrayType, false, GlobalValue::InternalLinkage, + Table, CsiUnitFedTableName); + return ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs); +} + +void CSIImpl::initializeFuncHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *FuncPropertyTy = CsiFuncProperty::getType(C); + CsiFuncEntry = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_entry", IRB.getVoidTy(), + IRB.getInt64Ty(), FuncPropertyTy)); + Type *FuncExitPropertyTy = CsiFuncExitProperty::getType(C); + CsiFuncExit = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_func_exit", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), + FuncExitPropertyTy)); +} + +void CSIImpl::initializeBasicBlockHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiBBProperty::getType(C); + CsiBBEntry = checkCsiInterfaceFunction(M.getOrInsertFunction( + "__csi_bb_entry", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); + CsiBBExit = checkCsiInterfaceFunction(M.getOrInsertFunction( + "__csi_bb_exit", IRB.getVoidTy(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeCallsiteHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *PropertyTy = CsiCallProperty::getType(C); + CsiBeforeCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); + CsiAfterCallsite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_call", IRB.getVoidTy(), + IRB.getInt64Ty(), IRB.getInt64Ty(), PropertyTy)); +} + +void CSIImpl::initializeLoadStoreHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + Type *LoadPropertyTy = CsiLoadStoreProperty::getType(C); + Type *StorePropertyTy = CsiLoadStoreProperty::getType(C); + Type *RetType = IRB.getVoidTy(); + Type *AddrType = IRB.getInt8PtrTy(); + Type *NumBytesType = IRB.getInt32Ty(); + + CsiBeforeRead = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + CsiAfterRead = checkCsiInterfaceFunction( + 
M.getOrInsertFunction("__csi_after_load", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, LoadPropertyTy)); + + CsiBeforeWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_before_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); + CsiAfterWrite = checkCsiInterfaceFunction( + M.getOrInsertFunction("__csi_after_store", RetType, IRB.getInt64Ty(), + AddrType, NumBytesType, StorePropertyTy)); +} + +void CSIImpl::initializeMemIntrinsicsHooks() { + LLVMContext &C = M.getContext(); + IRBuilder<> IRB(C); + + MemmoveFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemcpyFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy)); + MemsetFn = checkCsiInterfaceFunction( + M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt32Ty(), IntptrTy)); +} + +int CSIImpl::getNumBytesAccessed(Value *Addr, const DataLayout &DL) { + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast(OrigPtrTy)->getElementType(); + assert(OrigTy->isSized()); + uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy); + if (TypeSize % 8 != 0) { + // if (TypeSize != 8 && TypeSize != 16 && TypeSize != 32 && TypeSize != 64 && + // TypeSize != 128 && TypeSize != 256 && TypeSize != 512) { + return -1; + } + return TypeSize / 8; +} + +void CSIImpl::addLoadStoreInstrumentation( + Instruction *I, Function *BeforeFn, Function *AfterFn, Value *CsiId, + Type *AddrType, Value *Addr, int NumBytes, CsiLoadStoreProperty &Prop) { + IRBuilder<> IRB(I); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, BeforeFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); + + BasicBlock::iterator Iter(I); + Iter++; + IRB.SetInsertPoint(&*Iter); + insertConditionalHookCall(&*Iter, AfterFn, + {CsiId, IRB.CreatePointerCast(Addr, AddrType), + IRB.getInt32(NumBytes), PropVal}); +} + +void CSIImpl::instrumentLoadOrStore(Instruction *I, CsiLoadStoreProperty &Prop, + const DataLayout &DL) { + IRBuilder<> IRB(I); + bool IsWrite = isa(I); + Value *Addr = IsWrite ? cast(I)->getPointerOperand() + : cast(I)->getPointerOperand(); + int NumBytes = getNumBytesAccessed(Addr, DL); + Type *AddrType = IRB.getInt8PtrTy(); + + if (NumBytes == -1) + return; // size that we don't recognize + + if (IsWrite) { + uint64_t LocalId = StoreFED.add(*I); + Value *CsiId = StoreFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeWrite, CsiAfterWrite, CsiId, + AddrType, Addr, NumBytes, Prop); + } else { // is read + uint64_t LocalId = LoadFED.add(*I); + Value *CsiId = LoadFED.localToGlobalId(LocalId, IRB); + addLoadStoreInstrumentation(I, CsiBeforeRead, CsiAfterRead, CsiId, AddrType, + Addr, NumBytes, Prop); + } +} + +void CSIImpl::instrumentAtomic(Instruction *I, const DataLayout &DL) { + // For now, print a message that this code contains atomics. + dbgs() << "WARNING: Uninstrumented atomic operations in program-under-test!\n"; +} + +// If a memset intrinsic gets inlined by the code gen, we will miss races on it. +// So, we either need to ensure the intrinsic is not inlined, or instrument it. +// We do not instrument memset/memmove/memcpy intrinsics (too complicated), +// instead we simply replace them with regular function calls, which are then +// intercepted by the run-time. 
+// Since our pass runs after everyone else, the calls should not be +// replaced back with intrinsics. If that becomes wrong at some point, +// we will need to call e.g. __csi_memset to avoid the intrinsics. +bool CSIImpl::instrumentMemIntrinsic(Instruction *I) { + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + MemsetFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } else if (MemTransferInst *M = dyn_cast(I)) { + Instruction *Call = IRB.CreateCall( + isa(M) ? MemcpyFn : MemmoveFn, + {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)}); + setInstrumentationDebugLoc(I, Call); + I->eraseFromParent(); + return true; + } + return false; +} + +void CSIImpl::instrumentBasicBlock(BasicBlock &BB) { + IRBuilder<> IRB(&*BB.getFirstInsertionPt()); + //LLVMContext &C = IRB.getContext(); + uint64_t LocalId = BasicBlockFED.add(BB); + Value *CsiId = BasicBlockFED.localToGlobalId(LocalId, IRB); + CsiBBProperty Prop; + TerminatorInst *TI = BB.getTerminator(); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiBBEntry, + {CsiId, PropVal}); + insertConditionalHookCall(TI, CsiBBExit, + {CsiId, PropVal}); +} + +void CSIImpl::instrumentCallsite(Instruction *I) { + // Ignore calls to debug intrinsics + if (isa(I)) + return; + + bool IsInvoke = false; + Function *Called = NULL; + if (CallInst *CI = dyn_cast(I)) { + Called = CI->getCalledFunction(); + } else if (InvokeInst *II = dyn_cast(I)) { + Called = II->getCalledFunction(); + IsInvoke = true; + } + + // if (Called && Called->getName().startswith("llvm.dbg")) { + // return; + // } + + IRBuilder<> IRB(I); + uint64_t LocalId = CallsiteFED.add(*I); + Value *CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + Value *FuncId = NULL; + GlobalVariable *FuncIdGV = NULL; + if (Called) { + Module *M = I->getParent()->getParent()->getParent(); + std::string GVName = + CsiFuncIdVariablePrefix + Called->getName().str(); + FuncIdGV = dyn_cast(M->getOrInsertGlobal(GVName, + IRB.getInt64Ty())); + assert(FuncIdGV); + FuncIdGV->setConstant(false); + FuncIdGV->setLinkage(GlobalValue::WeakAnyLinkage); + FuncIdGV->setInitializer(IRB.getInt64(CsiCallsiteUnknownTargetId)); + FuncId = IRB.CreateLoad(FuncIdGV); + } else { + // Unknown targets (i.e. indirect calls) are always unknown. + FuncId = IRB.getInt64(CsiCallsiteUnknownTargetId); + } + assert(FuncId != NULL); + CsiCallProperty Prop; + Prop.setIsIndirect(!Called); + Value *PropVal = Prop.getValue(IRB); + insertConditionalHookCall(I, CsiBeforeCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock::iterator Iter(I); + if (IsInvoke) { + // There are two "after" positions for invokes: the normal block + // and the exception block. This also means we have to recompute + // the callsite and function IDs in each basic block so that we + // can use it for the after hook. + + // TODO: Do we want the "after" hook for this callsite to come + // before or after the BB entry hook? Currently it is inserted + // before BB entry because instrumentCallsite is called after + // instrumentBasicBlock. 
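//===--- Sketch (not part of the patch): effect of instrumentMemIntrinsic -===//
// CSIImpl::instrumentMemIntrinsic() above rewrites the intrinsic form into an
// ordinary libc call so a tool runtime can interpose on it, roughly:
//   before:  call void @llvm.memset.p0i8.i64(i8* %p, i8 0, i64 %n, i1 false)
//   after:   call i8* @memset(i8* %p, i32 0, i64 %n)
// (value operand widened to i32, length cast to the target's intptr type).
//===--------------------------------------------------------------------===//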
+ InvokeInst *II = dyn_cast(I); + BasicBlock *NormalBB = II->getNormalDest(); + IRB.SetInsertPoint(&*NormalBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + + BasicBlock *UnwindBB = II->getUnwindDest(); + IRB.SetInsertPoint(&*UnwindBB->getFirstInsertionPt()); + CallsiteId = CallsiteFED.localToGlobalId(LocalId, IRB); + if (FuncIdGV != NULL) FuncId = IRB.CreateLoad(FuncIdGV); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } else { + // Simple call instruction; there is only one "after" position. + Iter++; + IRB.SetInsertPoint(&*Iter); + PropVal = Prop.getValue(IRB); + insertConditionalHookCall(&*Iter, CsiAfterCallsite, + {CallsiteId, FuncId, PropVal}); + } +} + +void CSIImpl::insertConditionalHookCall(Instruction *I, Function *HookFunction, + ArrayRef HookArgs) { + IRBuilder<> IRB(I); + // Value *Cond = IRB.CreateICmpEQ(IRB.CreateLoad(DisableInstrGV), IRB.getInt1(false)); + // TerminatorInst *TI = SplitBlockAndInsertIfThen(Cond, I, false); + // IRB.SetInsertPoint(TI); + // IRB.CreateStore(IRB.getInt1(true), DisableInstrGV); + Instruction *Call = IRB.CreateCall(HookFunction, HookArgs); + setInstrumentationDebugLoc(I, Call); + // IRB.CreateStore(IRB.getInt1(false), DisableInstrGV); +} + + +void CSIImpl::initializeFEDTables() { + FunctionFED = FrontEndDataTable(M, CsiFunctionBaseIdName); + FunctionExitFED = FrontEndDataTable(M, CsiFunctionExitBaseIdName); + BasicBlockFED = FrontEndDataTable(M, CsiBasicBlockBaseIdName); + CallsiteFED = FrontEndDataTable(M, CsiCallsiteBaseIdName); + LoadFED = FrontEndDataTable(M, CsiLoadBaseIdName); + StoreFED = FrontEndDataTable(M, CsiStoreBaseIdName); +} + +uint64_t CSIImpl::getLocalFunctionID(Function &F) { + uint64_t LocalId = FunctionFED.add(F); + FuncOffsetMap[F.getName()] = LocalId; + return LocalId; +} + +void CSIImpl::generateInitCallsiteToFunction() { + LLVMContext &C = M.getContext(); + BasicBlock *EntryBB = BasicBlock::Create(C, "", InitCallsiteToFunction); + IRBuilder<> IRB(ReturnInst::Create(C, EntryBB)); + + GlobalVariable *Base = FunctionFED.baseId(); + LoadInst *LI = IRB.CreateLoad(Base); + // Traverse the map of function name -> function local id. Generate + // a store of each function's global ID to the corresponding weak + // global variable. 
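//===--- Sketch (not part of the patch): the weak function-ID globals -----===//
// Shape of the linkage this routine establishes for a defined function foo
// (the __csi_func_id_ prefix comes from CsiFuncIdVariablePrefix):
//   * every unit that calls foo references a weak i64 global
//       @__csi_func_id_foo = weak global i64 <CsiCallsiteUnknownTargetId>
//     and each callsite hook loads it at run time;
//   * the defining unit's init routine, generated by the loop below, performs
//     in effect
//       @__csi_func_id_foo = load(FunctionFED base) + foo's local ID
//     so once every unit has initialized, direct callsites observe foo's
//     global ID while indirect callsites keep the "unknown" value.
//===--------------------------------------------------------------------===//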
+ for (const auto &it : FuncOffsetMap) { + std::string GVName = CsiFuncIdVariablePrefix + it.first.str(); + GlobalVariable *GV = nullptr; + if ((GV = M.getGlobalVariable(GVName)) == nullptr) { + GV = new GlobalVariable(M, IRB.getInt64Ty(), false, + GlobalValue::WeakAnyLinkage, + IRB.getInt64(CsiCallsiteUnknownTargetId), GVName); + } + assert(GV); + IRB.CreateStore(IRB.CreateAdd(LI, IRB.getInt64(it.second)), GV); + } +} + +void CSIImpl::initializeCsi() { + IntptrTy = DL.getIntPtrType(M.getContext()); + + initializeFEDTables(); + if (Options.InstrumentFuncEntryExit) + initializeFuncHooks(); + if (Options.InstrumentMemoryAccesses) + initializeLoadStoreHooks(); + if (Options.InstrumentBasicBlocks) + initializeBasicBlockHooks(); + if (Options.InstrumentCalls) + initializeCallsiteHooks(); + if (Options.InstrumentMemIntrinsics) + initializeMemIntrinsicsHooks(); + + FunctionType *FnType = + FunctionType::get(Type::getVoidTy(M.getContext()), {}, false); + InitCallsiteToFunction = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiInitCallsiteToFunctionName, FnType)); + assert(InitCallsiteToFunction); + InitCallsiteToFunction->setLinkage(GlobalValue::InternalLinkage); + + /* + The runtime declares this as a __thread var --- need to change this decl generation + or the tool won't compile + DisableInstrGV = new GlobalVariable(M, IntegerType::get(M.getContext(), 1), false, + GlobalValue::ExternalLinkage, nullptr, + CsiDisableInstrumentationName, nullptr, + GlobalValue::GeneralDynamicTLSModel, 0, true); + */ +} + +// Create a struct type to match the unit_fed_entry_t type in csirt.c. +StructType *CSIImpl::getUnitFedTableType(LLVMContext &C, + PointerType *EntryPointerType) { + return StructType::get(IntegerType::get(C, 64), + Type::getInt8PtrTy(C, 0), + EntryPointerType); +} + +Constant *CSIImpl::fedTableToUnitFedTable(Module &M, + StructType *UnitFedTableType, + FrontEndDataTable &FedTable) { + Constant *NumEntries = + ConstantInt::get(IntegerType::get(M.getContext(), 64), FedTable.size()); + Constant *BaseIdPtr = + ConstantExpr::getPointerCast(FedTable.baseId(), + Type::getInt8PtrTy(M.getContext(), 0)); + Constant *InsertedTable = FedTable.insertIntoModule(M); + return ConstantStruct::get(UnitFedTableType, NumEntries, BaseIdPtr, + InsertedTable); +} + +void CSIImpl::collectUnitFEDTables() { + LLVMContext &C = M.getContext(); + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // The order of the FED tables here must match the enum in csirt.c and the + // instrumentation_counts_t in csi.h. 
+ UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, FunctionExitFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, BasicBlockFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, CallsiteFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, LoadFED)); + UnitFedTables.push_back( + fedTableToUnitFedTable(M, UnitFedTableType, StoreFED)); +} + +CallInst *CSIImpl::createRTUnitInitCall(IRBuilder<> &IRB) { + LLVMContext &C = M.getContext(); + + StructType *UnitFedTableType = + getUnitFedTableType(C, FrontEndDataTable::getPointerType(C)); + + // Lookup __csirt_unit_init + SmallVector InitArgTypes({IRB.getInt8PtrTy(), + PointerType::get(UnitFedTableType, 0), + InitCallsiteToFunction->getType()}); + FunctionType *InitFunctionTy = + FunctionType::get(IRB.getVoidTy(), InitArgTypes, false); + RTUnitInit = checkCsiInterfaceFunction( + M.getOrInsertFunction(CsiRtUnitInitName, InitFunctionTy)); + assert(RTUnitInit); + + ArrayType *UnitFedTableArrayType = + ArrayType::get(UnitFedTableType, UnitFedTables.size()); + Constant *Table = ConstantArray::get(UnitFedTableArrayType, UnitFedTables); + GlobalVariable *GV = new GlobalVariable(M, UnitFedTableArrayType, false, + GlobalValue::InternalLinkage, Table, + CsiUnitFedTableArrayName); + + Constant *Zero = ConstantInt::get(IRB.getInt32Ty(), 0); + Value *GepArgs[] = {Zero, Zero}; + + // Insert call to __csirt_unit_init + return IRB.CreateCall( + RTUnitInit, + {IRB.CreateGlobalStringPtr(M.getName()), + ConstantExpr::getGetElementPtr(GV->getValueType(), GV, GepArgs), + InitCallsiteToFunction}); +} + +void CSIImpl::finalizeCsi() { + LLVMContext &C = M.getContext(); + + // Add CSI global constructor, which calls unit init. + Function *Ctor = + Function::Create(FunctionType::get(Type::getVoidTy(C), false), + GlobalValue::InternalLinkage, CsiRtUnitCtorName, &M); + BasicBlock *CtorBB = BasicBlock::Create(C, "", Ctor); + IRBuilder<> IRB(ReturnInst::Create(C, CtorBB)); + + // Insert __csi_func_id_ weak symbols for all defined functions and + // generate the runtime code that stores to all of them. + generateInitCallsiteToFunction(); + + CallInst *Call = createRTUnitInitCall(IRB); + + // Add the constructor to the global list + appendToGlobalCtors(M, Ctor, CsiUnitCtorPriority); + + CallGraphNode *CNCtor = CG->getOrInsertFunction(Ctor); + CallGraphNode *CNFunc = CG->getOrInsertFunction(RTUnitInit); + CNCtor->addCalledFunction(Call, CNFunc); +} + +bool CSIImpl::shouldNotInstrumentFunction(Function &F) { + Module &M = *F.getParent(); + // Never instrument the CSI ctor. + if (F.hasName() && F.getName() == CsiRtUnitCtorName) + return true; + + // Don't instrument functions that will run before or + // simultaneously with CSI ctors. + GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors"); + if (GV == nullptr) + return false; + ConstantArray *CA = cast(GV->getInitializer()); + for (Use &OP : CA->operands()) { + if (isa(OP)) + continue; + ConstantStruct *CS = cast(OP); + + if (Function *CF = dyn_cast(CS->getOperand(1))) { + uint64_t Priority = + dyn_cast(CS->getOperand(0))->getLimitedValue(); + if (Priority <= CsiUnitCtorPriority && CF->getName() == F.getName()) { + // Do not instrument F. + return true; + } + } + } + // false means do instrument it. 
+ return false; +} + +bool CSIImpl::isVtableAccess(Instruction *I) { + if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa)) + return Tag->isTBAAVtableAccess(); + return false; +} + +bool CSIImpl::addrPointsToConstantData(Value *Addr) { + // If this is a GEP, just analyze its pointer operand. + if (GetElementPtrInst *GEP = dyn_cast(Addr)) + Addr = GEP->getPointerOperand(); + + if (GlobalVariable *GV = dyn_cast(Addr)) { + if (GV->isConstant()) { + return true; + } + } else if (LoadInst *L = dyn_cast(Addr)) { + if (isVtableAccess(L)) { + return true; + } + } + return false; +} + +bool CSIImpl::isAtomic(Instruction *I) { + if (LoadInst *LI = dyn_cast(I)) + return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread; + if (StoreInst *SI = dyn_cast(I)) + return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread; + if (isa(I)) + return true; + if (isa(I)) + return true; + if (isa(I)) + return true; + return false; +} + +void CSIImpl::computeLoadAndStoreProperties( + SmallVectorImpl> &LoadAndStoreProperties, + SmallVectorImpl &BBLoadsAndStores, + const DataLayout &DL) { + SmallSet WriteTargets; + + for (SmallVectorImpl::reverse_iterator + It = BBLoadsAndStores.rbegin(), + E = BBLoadsAndStores.rend(); + It != E; ++It) { + Instruction *I = *It; + unsigned Alignment; + if (StoreInst *Store = dyn_cast(I)) { + Value *Addr = Store->getPointerOperand(); + WriteTargets.insert(Addr); + CsiLoadStoreProperty Prop; + // Update alignment property data + Alignment = Store->getAlignment(); + Prop.setAlignment(Alignment); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Store)); + // Set constant-data-access property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = GetUnderlyingObject(Addr, DL); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } else { + LoadInst *Load = cast(I); + Value *Addr = Load->getPointerOperand(); + CsiLoadStoreProperty Prop; + // Update alignment property data + Alignment = Load->getAlignment(); + Prop.setAlignment(Alignment); + // Set vtable-access property + Prop.setIsVtableAccess(isVtableAccess(Load)); + // Set constant-data-access-property + Prop.setIsConstant(addrPointsToConstantData(Addr)); + Value *Obj = GetUnderlyingObject(Addr, DL); + // Set is-on-stack property + Prop.setIsOnStack(isa(Obj)); + // Set may-be-captured property + Prop.setMayBeCaptured(isa(Obj) || + PointerMayBeCaptured(Addr, true, true)); + // Set load-read-before-write-in-bb property + bool HasBeenSeen = WriteTargets.count(Addr) > 0; + Prop.setLoadReadBeforeWriteInBB(HasBeenSeen); + LoadAndStoreProperties.push_back(std::make_pair(I, Prop)); + } + } + BBLoadsAndStores.clear(); +} + +void CSIImpl::instrumentFunction(Function &F) { + // This is required to prevent instrumenting the call to + // __csi_module_init from within the module constructor. + if (F.empty() || shouldNotInstrumentFunction(F)) { + return; + } + + SmallVector, 8> + LoadAndStoreProperties; + SmallVector ReturnInstructions; + SmallVector MemIntrinsics; + SmallVector Callsites; + SmallVector BasicBlocks; + SmallVector AtomicAccesses; + + // Compile lists of all instrumentation points before anything is modified. 
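//===--- Sketch (not part of the patch): load-read-before-write property --===//
// Worked example for computeLoadAndStoreProperties() above: the block's
// accesses are walked in reverse while recording store targets, so a load is
// flagged exactly when a later store in the same block writes through the same
// pointer value:
//   %v = load i32, i32* %p     ; flagged: %p is stored to below in this block
//   store i32 %w, i32* %p
// A load with no later same-pointer store in the block leaves the property
// unset.
//===--------------------------------------------------------------------===//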
+ for (BasicBlock &BB : F) { + SmallVector BBLoadsAndStores; + for (Instruction &I : BB) { + if (isAtomic(&I)) + AtomicAccesses.push_back(&I); + else if (isa(I) || isa(I)) { + BBLoadsAndStores.push_back(&I); + } else if (isa(I)) { + ReturnInstructions.push_back(&I); + } else if (isa(I) || isa(I)) { + if (isa(I)) { + MemIntrinsics.push_back(&I); + } else { + Callsites.push_back(&I); + } + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores, + DL); + } + } + computeLoadAndStoreProperties(LoadAndStoreProperties, BBLoadsAndStores, DL); + BasicBlocks.push_back(&BB); + } + + uint64_t LocalId = getLocalFunctionID(F); + + // Instrument basic blocks. Note that we do this before other instrumentation + // so that we put this at the beginning of the basic block, and then the + // function entry call goes before the call to basic block entry. + if (Options.InstrumentBasicBlocks) + for (BasicBlock *BB : BasicBlocks) + instrumentBasicBlock(*BB); + + // Do this work in a separate loop after copying the iterators so that we + // aren't modifying the list as we're iterating. + if (Options.InstrumentMemoryAccesses) + for (std::pair p : + LoadAndStoreProperties) + instrumentLoadOrStore(p.first, p.second, DL); + + // Instrument atomic memory accesses in any case (they can be used to + // implement synchronization). + if (Options.InstrumentAtomics) + for (Instruction *I : AtomicAccesses) + instrumentAtomic(I, DL); + + if (Options.InstrumentMemIntrinsics) + for (Instruction *I : MemIntrinsics) + instrumentMemIntrinsic(I); + + if (Options.InstrumentCalls) + for (Instruction *I : Callsites) + instrumentCallsite(I); + + // Instrument function entry/exit points. + if (Options.InstrumentFuncEntryExit) { + IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt()); + CsiFuncProperty FuncEntryProp; + CsiFuncExitProperty FuncExitProp; + Value *FuncId = FunctionFED.localToGlobalId(LocalId, IRB); + Value *PropVal = FuncEntryProp.getValue(IRB); + insertConditionalHookCall(&*IRB.GetInsertPoint(), CsiFuncEntry, + {FuncId, PropVal}); + + for (Instruction *I : ReturnInstructions) { + IRBuilder<> IRBRet(I); + // uint64_t ExitLocalId = FunctionExitFED.add(F); + uint64_t ExitLocalId = FunctionExitFED.add(*I); + Value *ExitCsiId = FunctionExitFED.localToGlobalId(ExitLocalId, IRBRet); + PropVal = FuncExitProp.getValue(IRBRet); + insertConditionalHookCall(I, CsiFuncExit, + {ExitCsiId, FuncId, PropVal}); + } + } +} + +void ComprehensiveStaticInstrumentation::getAnalysisUsage( + AnalysisUsage &AU) const { + AU.addRequired(); +} + +bool ComprehensiveStaticInstrumentation::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + CallGraph *CG = &getAnalysis().getCallGraph(); + + return CSIImpl(M, CG, Options).run(); +} diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp index c3e323613c7079..f9ba37987a61e9 100644 --- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -105,6 +105,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeAddressSanitizerModulePass(Registry); initializeBoundsCheckingLegacyPassPass(Registry); initializeControlHeightReductionLegacyPassPass(Registry); + initializeCilkSanitizerPass(Registry); initializeGCOVProfilerLegacyPassPass(Registry); initializePGOInstrumentationGenLegacyPassPass(Registry); initializePGOInstrumentationUseLegacyPassPass(Registry); @@ -117,6 +118,7 @@ void 
llvm::initializeInstrumentation(PassRegistry &Registry) { initializeSanitizerCoverageModulePass(Registry); initializeDataFlowSanitizerPass(Registry); initializeEfficiencySanitizerPass(Registry); + initializeComprehensiveStaticInstrumentationPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/llvm/lib/Transforms/LLVMBuild.txt b/llvm/lib/Transforms/LLVMBuild.txt index f061c6d9285e3e..ae57c40a946255 100644 --- a/llvm/lib/Transforms/LLVMBuild.txt +++ b/llvm/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Utils Vectorize ObjCARC +subdirectories = AggressiveInstCombine Coroutines IPO InstCombine Instrumentation Scalar Tapir Utils Vectorize ObjCARC [component_0] type = Group diff --git a/llvm/lib/Transforms/Scalar/CMakeLists.txt b/llvm/lib/Transforms/Scalar/CMakeLists.txt index e3548ce5cd0afd..688365dfae4676 100644 --- a/llvm/lib/Transforms/Scalar/CMakeLists.txt +++ b/llvm/lib/Transforms/Scalar/CMakeLists.txt @@ -28,6 +28,7 @@ add_llvm_library(LLVMScalarOpts LoopDeletion.cpp LoopDataPrefetch.cpp LoopDistribute.cpp + LoopFuse.cpp LoopIdiomRecognize.cpp LoopInstSimplify.cpp LoopInterchange.cpp diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp index 9861948c8297a9..fcc11e0716f9b5 100644 --- a/llvm/lib/Transforms/Scalar/GVN.cpp +++ b/llvm/lib/Transforms/Scalar/GVN.cpp @@ -1123,8 +1123,12 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks, 0)) { continue; } + if (isa(Pred->getTerminator())) { + continue; + } - if (Pred->getTerminator()->getNumSuccessors() != 1) { + if (Pred->getTerminator()->getNumSuccessors() != 1 && + !isa(Pred->getTerminator())) { if (isa(Pred->getTerminator())) { LLVM_DEBUG( dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '" @@ -1327,6 +1331,20 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return false; } + // If we depend on a detach instruction, reject. + for (unsigned i = 0, e = NumDeps; i != e; ++i) { + MemDepResult DepInfo = Deps[i].getResult(); + if (!(DepInfo.getInst())) + continue; + if (isa(DepInfo.getInst())|| + isa(DepInfo.getInst())) { + DEBUG(dbgs() << "GVN: Cannot process" << *LI << + " due to dependency on" << + *(DepInfo.getInst()) << "\n"); + return false; + } + } + // If this load follows a GEP, see if we can PRE the indices before analyzing. if (GetElementPtrInst *GEP = dyn_cast(LI->getOperand(0))) { for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(), @@ -2184,6 +2202,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { unsigned NumWithout = 0; BasicBlock *PREPred = nullptr; BasicBlock *CurrentBlock = CurInst->getParent(); + BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr; + Value *DetachV = nullptr, *ReattachV = nullptr; // Update the RPO numbers for this function. if (InvalidBlockRPONumbers) @@ -2212,18 +2232,36 @@ bool GVN::performScalarPRE(Instruction *CurInst) { break; } + // Ignore reattach predecessors for determining whether to perform + // PRE. These predecessors have the same available values as + // their corresponding detach predecessors. 
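//===--- Sketch (not part of the patch): detach/reattach predecessors -----===//
// The continuation block of a Tapir detach has two kinds of predecessors:
//
//   entry:  detach ... label %task, label %cont
//   task:   ...        reattach ... label %cont
//   cont:   ; preds = %entry (detach), %task (reattach)
//
// Under the serial semantics the reattach edge makes available exactly the
// values the detach edge does, so a reattach predecessor that is missing the
// value is not counted as needing an insertion (its detach counterpart covers
// it), and if the values available on the two edges ever disagree
// (ReattachV != DetachV below) the candidate is conservatively rejected.
//===--------------------------------------------------------------------===//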
+ if (isa(P->getTerminator())) + ReattachPred = P; + uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this); Value *predV = findLeader(P, TValNo); + + if (isa(P->getTerminator())) { + assert(nullptr == DetachPred && "Multiple detach predecessors found!"); + DetachPred = P; + } + if (!predV) { - predMap.push_back(std::make_pair(static_cast(nullptr), P)); - PREPred = P; - ++NumWithout; + if (!isa(P->getTerminator())) { + predMap.push_back(std::make_pair(static_cast(nullptr), P)); + PREPred = P; + ++NumWithout; + } } else if (predV == CurInst) { /* CurInst dominates this predecessor. */ NumWithout = 2; break; } else { predMap.push_back(std::make_pair(predV, P)); + if (isa(P->getTerminator())) + DetachV = predV; + if (isa(P->getTerminator())) + ReattachV = predV; ++NumWith; } } @@ -2233,6 +2271,15 @@ bool GVN::performScalarPRE(Instruction *CurInst) { if (NumWithout > 1 || NumWith == 0) return false; + // If the reattach predecessor has a value that does not match the + // detach predecessor's value, assume that this is not a redundant + // instruction. + if (ReattachV && ReattachV != DetachV) + return false; + + assert((!ReattachPred || DetachPred) && + "Reattach predecessor found with no detach predecessor"); + // We may have a case where all predecessors have the instruction, // and we just need to insert a phi node. Otherwise, perform // insertion. @@ -2256,7 +2303,8 @@ bool GVN::performScalarPRE(Instruction *CurInst) { // the edge to be split and perform the PRE the next time we iterate // on the function. unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock); - if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) { + if (isCriticalEdge(PREPred->getTerminator(), SuccNum) && + !isa(PREPred->getTerminator())) { toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum)); return false; } @@ -2267,6 +2315,9 @@ bool GVN::performScalarPRE(Instruction *CurInst) { LLVM_DEBUG(verifyRemoved(PREInstr)); PREInstr->deleteValue(); return false; + } else if (DetachPred == PREPred && ReattachPred) { + assert(nullptr == DetachV && "Detach predecessor already had a value"); + predMap.push_back(std::make_pair(PREInstr, ReattachPred)); } } diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 48de56a02834d5..bf2865332ce880 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -987,8 +987,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) { // because now the condition in this block can be threaded through // predecessors of our predecessor block. if (BasicBlock *SinglePred = BB->getSinglePredecessor()) { - const Instruction *TI = SinglePred->getTerminator(); - if (!TI->isExceptionalTerminator() && TI->getNumSuccessors() == 1 && + const TerminatorInst *TI = SinglePred->getTerminator(); + if (!TI->isExceptional() && + !isa(SinglePred->getTerminator()) && // Can't remove syncs + TI->getNumSuccessors() == 1 && SinglePred != BB && !hasAddressTakenAndUsed(BB)) { // If SinglePred was a loop header, BB becomes one. if (LoopHeaders.erase(SinglePred)) @@ -1373,7 +1375,8 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { } } - if (!PredAvailable) { + if (!PredAvailable || + isa(PredBB->getTerminator())) { OneUnavailablePred = PredBB; continue; } @@ -1416,6 +1419,9 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // unconditional branch, we know that it isn't a critical edge. 
if (PredsScanned.size() == AvailablePreds.size()+1 && OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) { + // If the predecessor is a reattach, we can't split the edge + if (isa(OneUnavailablePred->getTerminator())) + return false; UnavailablePred = OneUnavailablePred; } else if (PredsScanned.size() != AvailablePreds.size()) { // Otherwise, we had multiple unavailable predecessors or we had a critical @@ -1428,8 +1434,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) { // Add all the unavailable predecessors to the PredsToSplit list. for (BasicBlock *P : predecessors(LoadBB)) { - // If the predecessor is an indirect goto, we can't split the edge. - if (isa(P->getTerminator())) + // If the predecessor is an indirect goto or a reattach, we + // can't split the edge. + if (isa(P->getTerminator()) || + isa(P->getTerminator())) return false; if (!AvailablePredSet.count(P)) diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index d204654c39157d..d598ec917d8932 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -72,6 +72,7 @@ #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/SSAUpdater.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include #include using namespace llvm; @@ -1775,6 +1776,18 @@ bool llvm::promoteLoopAccessesToScalars( bool DereferenceableInPH = false; bool SafeToInsertStore = false; + // We cannot speculate loads to values that are stored in a detached + // context within the loop. Precompute whether or not there is a + // detach within this loop. + bool DetachWithinLoop = + isa(CurLoop->getHeader()->getTerminator()); + if (!DetachWithinLoop) + for (BasicBlock *BB : CurLoop->getBlocks()) + if (isa(BB->getTerminator())) { + DetachWithinLoop = true; + break; + } + SmallVector LoopUses; // We start with an alignment of one and try to find instructions that allow @@ -1838,6 +1851,23 @@ bool llvm::promoteLoopAccessesToScalars( if (!Store->isUnordered()) return false; + // We conservatively avoid promoting stores that are detached + // within the loop. Technically it can be legal to move these + // stores -- the program already contains a determinacy race + // -- but to preserve the serial execution, we have to avoid + // moving stores that are loaded. For now, we simply avoid + // moving these stores. + // + // TODO: The call to GetDetachedCtx can potentially be + // expensive. Optimize this analysis in the future. + if (DetachWithinLoop && + CurLoop->contains(GetDetachedCtx(Store->getParent()))) + return false; + + // Note that we only check GuaranteedToExecute inside the store case + // so that we do not introduce stores where they did not exist before + // (which would break the LLVM concurrency model). + SawUnorderedAtomic |= Store->isAtomic(); SawNotAtomic |= !Store->isAtomic(); diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp new file mode 100644 index 00000000000000..4c90ace351c603 --- /dev/null +++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp @@ -0,0 +1,561 @@ +//===------------- LoopFuse.cpp - Loop Fusion Pass ------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// Fuse two adjacent loops to improve cache locality. 
Loops are multi-versioned +/// and unconditionally fused along one version to check for dependence +/// legality. Legality decides whether to keep the original version or the fused +/// version or both versions with runtime checks. LoopAccessLegacyAnalysis is used to +/// check dependence legality. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Scalar/LoopFuse.h" + +#define DEBUG_TYPE "loop-fuse" + +using namespace llvm; + +static cl::opt + LFuseVerify("loop-fuse-verify", cl::Hidden, + cl::desc("Turn on DominatorTree and LoopInfo verification " + "after Loop Fusion"), + cl::init(false)); + +STATISTIC(NumLoopsFused, "Number of loops fused"); + +// Replace IncomingBlocks in PHI nodes of @Br successors from Br's parent to +// @To. +void LoopFuse::RewritePHI(BranchInst *Br, BasicBlock *To) { + assert((Br && To)); + for (auto *S : Br->successors()) { + auto I = S->begin(); + while (PHINode *P = dyn_cast(&*I)) { + P->setIncomingBlock(P->getBasicBlockIndex(Br->getParent()), To); + ++I; + } + } +} + +//===----------------------------------------------------------------------===// +// Loop Fusion Implementation. +// The idea to check fusion legality is by first fusing the loops and then look +// for fusion preventing dependences. This is done by versioning the loops +// first. The check is done on versioned loops and one of the version is +// discarded based on legality's success. +//===----------------------------------------------------------------------===// + +/* Fuse loops @L1 and @L2. Remove ConnectingBlock (CB) and connect L1Latch to + L2Header. Loop from L2Latch to L1Header. Make L1's indvar as indvar for the + fused loop. Update LI by moving L2Blocks into L1 and call L1 as FusedLoop. + Return FusedLoop. + L1 + | L1Blocks + CB --> | \ + | L2Blocks/ + L2 |/ +*/ +Loop *LoopFuse::FuseLoops(Loop &L1, Loop &L2) { + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + + BranchInst *Br1 = dyn_cast(L1.getLoopLatch()->getTerminator()); + BranchInst *Br2 = dyn_cast(L2.getLoopLatch()->getTerminator()); + + // Make Br2 to branch to L1 header based on Br1's condition. + unsigned LoopBack = 0; + if (Br2->getSuccessor(1) == L2.getHeader()) + LoopBack = 1; + assert((Br2->getSuccessor(LoopBack) == L2.getHeader())); + Br2->setSuccessor(LoopBack, L1.getHeader()); + Br2->setCondition(Br1->getCondition()); + RewritePHI(Br1, Br2->getParent()); + + // Zap L2 preheader and unconditionally branch from L1 latch to L2 header. + // L2 preheader is a connecting block and it is known to contain only an + // unconditional branch to L2 header. + BasicBlock *L2PH = L2.getLoopPreheader(), *L2Header = L2.getHeader(); + BranchInst *L2PHBr = dyn_cast(L2PH->getTerminator()); + RewritePHI(L2PHBr, Br1->getParent()); + DT->changeImmediateDominator(L2Header, L1.getLoopLatch()); + + BranchInst::Create(L2Header, Br1); + Br1->eraseFromParent(); + L2PH->dropAllReferences(); + L2PHBr->eraseFromParent(); + L2PH->eraseFromParent(); + DT->eraseNode(L2PH); + LI->removeBlock(L2PH); + + P2->replaceAllUsesWith(P1); + P2->eraseFromParent(); + + // Update LI. + // Move all blocks from L2 to L1. + SmallVector L2BBs; + for (auto bb = L2.block_begin(), bbe = L2.block_end(); bb != bbe; ++bb) + L2BBs.push_back(*bb); + for (auto *bb : L2BBs) { + LI->removeBlock(bb); + L1.addBasicBlockToLoop(bb, *LI); + } + // Remove L2. 
+ SE->forgetLoop(&L2); + LI->markAsRemoved(&L2); + + // Update DT: DT changed only at L2PH zap and was updated during zapping. + + return &L1; +} + +/* Version the given loops along a parallel path and fuse the cloned loops. + Check the dependence legality of the fused loop. + + L1PH BooleanBB BooleanBB + | /\ /\ + L1 L1PH L1PH.clone L1PH FusedPH + | version | | Fuse along | | + CB (L1Exit/L2PH) ----> L1 L1.clone --------> L1 L1Blocks + | | | versioned | | \ + L2 CB CB.clone path CB L2Blocks | + | | | | | |/ + L2Exit L2 L2.clone L2 | + \ / \ / + L2Exit CommonExit + CB is ConnectingBlock. +*/ +bool LoopFuse::DependenceLegal(Loop &L1, Loop &L2) { + + // Version to fuse. LoopVersioning is not used here because: + // a. Runtime checks are inserted later. + // b. Intermediate VMap updates are required. + // Moreover it is convenient for now to just clone and remap. + BasicBlock *BooleanBB = L1.getLoopPreheader(); + BasicBlock *L1PH = SplitEdge(BooleanBB, L1.getHeader(), DT, LI); + + ValueToValueMapTy VMap1; + SmallVector ClonedBBs1; + Loop *ClonedLoop1 = + cloneLoopWithPreheader(L1.getExitBlock(), BooleanBB, &L1, VMap1, + Twine(".L1clone"), LI, DT, ClonedBBs1); + + ValueToValueMapTy VMap2; + SmallVector ClonedBBs2; + Loop *ClonedLoop2 = + cloneLoopWithPreheader(L2.getExitBlock(), L1.getExitBlock(), &L2, VMap2, + Twine(".L2clone"), LI, DT, ClonedBBs2); + remapInstructionsInBlocks(ClonedBBs2, VMap2); + VMap1[L1.getExitBlock()] = ClonedLoop2->getLoopPreheader(); + remapInstructionsInBlocks(ClonedBBs1, VMap1); + + // Build the custom VMap by concatenating VMap1 and VMap2. + for (auto V : VMap1) + VMap[V->first] = V->second; + for (auto V : VMap2) + VMap[V->first] = V->second; + + // VMap.size() != VMap1.size() + VMap2.size() because of redundants and + // L1Exit update in VMap1 above. + + // Branch to either of the versions - using a boolean flag. + Instruction *Term = BooleanBB->getTerminator(); + FusionSwitcher = + BranchInst::Create(L1PH, ClonedLoop1->getLoopPreheader(), + ConstantInt::getTrue(L1PH->getContext()), Term); + Term->eraseFromParent(); + + // The two versions join back at L2 exit. Update DT. + if (DT->dominates(L2.getLoopLatch(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), BooleanBB); + + DEBUG(dbgs() << "ClonedLoop1: " << *ClonedLoop1 << "\n"); + DEBUG(dbgs() << "ClonedLoop2: " << *ClonedLoop2 << "\n"); + + FusedLoop = FuseLoops(*ClonedLoop1, *ClonedLoop2); + DEBUG(dbgs() << "FusedLoop: " << *FusedLoop << "\n"); + + // Check dependences. + DEBUG(dbgs() << "Loop fused on versioned path. Checking dependences...\n"); + LAI = &LAA->getInfo(FusedLoop); + DEBUG(LAI->print(dbgs())); + + auto Dependences = LAI->getDepChecker().getDependences(); + // TODO@jiahao: Investigate. + // if (!Dependences || Dependences->empty()) { + // DEBUG(dbgs() << "Failed to get dependences to check fusion legality!" + // << " Skipping...\n"); + // return false; + // } + + // Fusion is illegal if there is a backward dependence between memory accesses + // whose source was in L1 and sink was in L2. ClonedBBs1 and ClonedBBs2 + // contain cloned BBs from L1 and L2 respectively. They are used to check the + // containment of srouce and sink. 
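//===--- Sketch (not part of the patch): a fusion-preventing dependence ---===//
// Example of the backward dependence the loop below rejects: with
//   L1:  A[i]   = ...;
//   L2:  ...    = A[i+1];
// the original pair is fine (all of L1 runs before L2), but in the fused body
// iteration i of the L2 statement would read A[i+1] before iteration i+1 of
// the L1 statement writes it: a backward dependence whose source lies in the
// cloned L1 blocks and whose sink lies in the cloned L2 blocks.
//===--------------------------------------------------------------------===//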
+ for (auto &Dep : *Dependences) { + if (Dep.isPossiblyBackward()) { + Instruction *Source = Dep.getSource(*LAI); + Instruction *Sink = Dep.getDestination(*LAI); + if (std::find(ClonedBBs1.begin(), ClonedBBs1.end(), + Source->getParent()) == ClonedBBs1.end()) + continue; + if (std::find(ClonedBBs2.begin(), ClonedBBs2.end(), Sink->getParent()) == + ClonedBBs2.end()) + continue; + DEBUG(dbgs() << "Loop carried backward dependence prevents fusion!\n"); + return false; + } + } + DEBUG(dbgs() << "Loops are dependence legal to fuse!\n"); + return true; +} + +// Return true if any of the defs made in @L1 is used inside @L2. +bool LoopFuse::DefsUsedAcrossLoops(Loop &L1, Loop &L2) { + auto DefsUsedOutsideL1 = findDefsUsedOutsideOfLoop(&L1); + for (auto *D : DefsUsedOutsideL1) { + for (auto *U : D->users()) { + if (L2.contains(dyn_cast(U)->getParent())) + return true; + } + } + return false; +} + +bool LoopFuse::IsLegalAndProfitable(Loop &L1, Loop &L2) { + // Basic legality. + if (!L1.empty() || !L2.empty()) { + // TODO: Update cloneLoopWithPreheader() to update LoopInfo for subloops + // too and LoopFusion can be done for loops at any depth. + DEBUG(dbgs() << "Not innermost loops! Skipping...\n"); + return false; + } + + if (L1.getLoopDepth() != L2.getLoopDepth()) { + DEBUG(dbgs() << "Loops not at same depth! Skipping...\n"); + return false; + } + + if (!L1.getLoopPreheader() || !L2.getLoopPreheader()) { + DEBUG(dbgs() << "No preheader! Skipping...\n"); + return false; + } + + if (!L1.getExitBlock() || !L2.getExitBlock()) { + DEBUG(dbgs() << "Single exit block not found! Skipping...\n"); + return false; + } + + // Can fuse only bottom-tested loops and loops with latch being the single + // exiting block. + if ((L1.getExitingBlock() != L1.getLoopLatch()) || + (L2.getExitingBlock() != L2.getLoopLatch())) { + DEBUG(dbgs() << "Not a bottom-tested loop! Skipping...\n"); + return false; + } + + // Can fuse only adjacent loops. Adjacency is defined by: + // a. L1Exit has single entry only from L1Latch. + // b. L1Exit and L2Preheader are same i.e the block forms the ConnectingBlock. + // c. ConnectingBlock just branches unconditionally to L2Header. + auto *Br = dyn_cast(L1.getExitBlock()->begin()); + if ((L1.getExitBlock()->getSinglePredecessor() != L1.getLoopLatch()) || + (L1.getExitBlock() != L2.getLoopPreheader()) || + (!Br || Br->isConditional())) { + DEBUG(dbgs() << "Loops not adjacent! Skipping...\n"); + return false; + } + + // Indvars of both loops is known and canonicalized. + PHINode *P1 = L1.getCanonicalInductionVariable(); + PHINode *P2 = L2.getCanonicalInductionVariable(); + if (!P1 || !P2) { + DEBUG(dbgs() << "Unknown induction variables! Skipping...\n"); + return false; + } + + // P1 and P2 are canonical indvars. Backedge taken count check is enough to + // ascertain both loops have same iteration space. + if (SE->getBackedgeTakenCount(&L1) != SE->getBackedgeTakenCount(&L2)) + return false; + + // Cannot fuse if there are uses of L1 defs in L2. + if (DefsUsedAcrossLoops(L1, L2)) + return false; + + // Dependene based legality. + if (!DependenceLegal(L1, L2)) + return false; + + // TODO: Add profitability measures. + + return true; +} + +// Remove Loop @L completely by deleting the BBs and also from @LI, @DT and @SE +// including preheader. Finally connect the single predecessor (the BooleanBB +// that contains FusionSwitcher) of preheader to loop exit. 
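+// It is used both to roll back the versioned fused loop when legality fails
+// and to delete the original loops once pure fusion succeeds.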
+void LoopFuse::RemoveLoopCompletelyWithPreheader(Loop &L) { + DEBUG(dbgs() << "Removing loop: " << L << "\n"); + BasicBlock *PH = L.getLoopPreheader(); + BasicBlock *Exit = L.getExitBlock(); + assert(Exit && "Expected Exit bb and single pred to preheader!"); + + // No need to RewritePHIs of Exit block given the Loop is deleted because the + // uses remain same if FusedLoop is removed OR uses are already replaced if + // original loops are deleted. + + // Branch to Exit block from FusionSwitcher. + unsigned SuccNum = 0; + if (FusionSwitcher->getSuccessor(1) == PH) + SuccNum = 1; + assert((FusionSwitcher->getSuccessor(SuccNum) == PH)); + FusionSwitcher->setSuccessor(SuccNum, Exit); + if (DT->dominates(L.getLoopLatch(), Exit)) // L1 removal case. + // Exit blocks iDom is FusionSwitcher's block due to versioning. + DT->changeImmediateDominator(Exit, FusionSwitcher->getParent()); + + // Erase each of the loop blocks. Update SE, DT and LI. + SE->forgetLoop(&L); + PH->dropAllReferences(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + DT->changeImmediateDominator(*bb, PH); + (*bb)->dropAllReferences(); + } + + PH->eraseFromParent(); + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) { + // Now nuke bb and its DT. + (*bb)->eraseFromParent(); + DT->eraseNode(*bb); + } + DT->eraseNode(PH); + + SmallVector LBBs; + for (auto bb = L.block_begin(), bbe = L.block_end(); bb != bbe; ++bb) + LBBs.push_back(*bb); + for (auto *bb : LBBs) + LI->removeBlock(bb); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); + + LI->markAsRemoved(&L); +} + +// Remove FusionSwitcher and branch directly to given loop @L's header. This +// removes loop's preheader and make FusionSwitcher's block as preheader. +void LoopFuse::RemoveFusionSwitcher(Loop &L) { + assert(FusionSwitcher->isConditional()); + DEBUG(dbgs() << "Removing FusionSwitcher: " << *FusionSwitcher << "\n"); + + BasicBlock *PH = L.getLoopPreheader(); + assert((PH->size() == 1)); + + BranchInst *PHBr = dyn_cast(PH->getTerminator()); + assert(PHBr->isUnconditional()); + + RewritePHI(PHBr, FusionSwitcher->getParent()); + + PHBr->removeFromParent(); + PHBr->insertBefore(FusionSwitcher); + DT->changeImmediateDominator(L.getHeader(), FusionSwitcher->getParent()); + + FusionSwitcher->eraseFromParent(); + PH->eraseFromParent(); + DT->eraseNode(PH); + if (LI->getLoopFor(PH)) + LI->removeBlock(PH); +} + +// Update the uses of defs that reach outside original loop with the defs made +// made in fused loop. +void LoopFuse::UpdateUsesOutsideLoop(Loop &L) { + for (auto *D : findDefsUsedOutsideOfLoop(&L)) { + auto VI = VMap.find(D); + if (VI == VMap.end()) + continue; + + for (auto *U : D->users()) { + if (!L.contains(dyn_cast(U)->getParent())) { + if (auto *P = dyn_cast(U)) { + // Replace U in PHI with + for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) { + if (P->getIncomingValue(i) == U) { + P->removeIncomingValue(i); + P->addIncoming(VI->second, FusedLoop->getLoopLatch()); + } + } + } else + U->replaceUsesOfWith(D, VI->second); + } + } + } +} + +// Add/update phi for defs that reach uses outside the loop from original loop +// @L and from fused loop. Insert the phis into fused loop's exit block, which +// is also the exit block of original L2 loop. @OrigIncomingBlock refers to the +// block from where a def is reached outside of loop - L2 latch. +// TODO: This routine is similar to LoopVersioning's addPHINodes(), but +// rewritten here as access to internal data structures differ. 
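+// Only the versioned-fusion path needs these phis: both the original loops and
+// the fused loop stay live, so their exit values must merge at the common exit.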
+void LoopFuse::AddPHIsOutsideLoop(Loop &L, BasicBlock *OrigIncomingBlock) { + BasicBlock *PHIBlock = FusedLoop->getExitBlock(); + assert(PHIBlock && "Unable to find FusedLoop's ExitBlock!"); + + for (auto *Inst : findDefsUsedOutsideOfLoop(&L)) { + PHINode *PN = nullptr; + auto FusedInst = VMap.find(Inst); + assert((FusedInst != VMap.end()) && + "Expected an equivalent instruction in fused loop!"); + // Update/add phi node for this Inst. + bool FoundInst = false; + for (auto I = PHIBlock->begin(); !FoundInst && (PN = dyn_cast(I)); + ++I) { + for (unsigned i = 0, e = PN->getNumIncomingValues(); !FoundInst && i != e; + ++i) + if (PN->getIncomingValue(i) == Inst) + FoundInst = true; + } + if (!PN) { + PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lfuse", + &PHIBlock->front()); + + for (auto *U : Inst->users()) + if (!L.contains(dyn_cast(U)->getParent())) + U->replaceUsesOfWith(Inst, PN); + + PN->addIncoming(Inst, OrigIncomingBlock); + } + // Add incoming value from fused loop. + PN->addIncoming(FusedInst->second, FusedLoop->getLoopLatch()); + } +} + +bool LoopFuse::run(Loop &L1, Loop &L2) { + assert((LI && LAA && DT && SE)); + DEBUG(dbgs() << "\nTrying to fuse:\n" << L1 << "AND\n" << L2 << "\n"); + + FusionSwitcher = nullptr; + FusedLoop = nullptr; + VMap.clear(); + bool Changed = false; + if (IsLegalAndProfitable(L1, L2)) { + assert((FusedLoop && FusionSwitcher)); + auto *RuntimePtrChecks = LAI->getRuntimePointerChecking(); + if (RuntimePtrChecks->Need) { + // Add runtime checks and add/update phis in exit block for the defs + // reaching from two versions. + Instruction *FirstCheck, *LastCheck; + std::tie(FirstCheck, LastCheck) = LAI->addRuntimeChecks(FusionSwitcher); + // TODO: Add SCEVRuntime checks? + FusionSwitcher->setCondition(LastCheck); + + AddPHIsOutsideLoop(L1, L2.getLoopLatch()); + AddPHIsOutsideLoop(L2, L2.getLoopLatch()); + FusionKind = VERSIONED_FUSION; + + } else { + // Remove original loops and retain FusedLoop. Also update the uses of + // defs from original loops with the defs from fused loop. + UpdateUsesOutsideLoop(L1); + UpdateUsesOutsideLoop(L2); + RemoveLoopCompletelyWithPreheader(L1); + RemoveLoopCompletelyWithPreheader(L2); + + // Remove FusionSwitcher and directly point to FusedLoop header. + if (DT->dominates(FusionSwitcher->getParent(), FusedLoop->getExitBlock())) + DT->changeImmediateDominator(FusedLoop->getExitBlock(), + FusedLoop->getLoopLatch()); + RemoveFusionSwitcher(*FusedLoop); + FusionKind = PURE_FUSION; + } + ++NumLoopsFused; + Changed = true; + + } else { + if (FusedLoop) { + // Loops were versioned to check legality. Rollback to original state. + RemoveLoopCompletelyWithPreheader(*FusedLoop); + + // Remove FusionSwitcher and directly point to L1 header. + if (DT->dominates(FusionSwitcher->getParent(), L2.getExitBlock())) + DT->changeImmediateDominator(L2.getExitBlock(), L2.getLoopLatch()); + RemoveFusionSwitcher(L1); + FusionKind = REVERTED_FUSION; + } + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + } + + return Changed; +} + +void PopulateInnermostLoopsOf(Loop &L, SmallVectorImpl &Loops) { + if (L.empty()) + Loops.push_back(&L); + for (auto I = L.begin(), E = L.end(); I != E; ++I) + PopulateInnermostLoopsOf(**I, Loops); +} + +bool LoopFuse::runOnFunction(Function &F) { + LI = &getAnalysis().getLoopInfo(); + LAA = &getAnalysis(); + DT = &getAnalysis().getDomTree(); + SE = &getAnalysis().getSE(); + + // Populate innermost loops and try a n^2 combination of loop fusion. 
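+  // Whenever a pair is fused, that pair is dropped from the candidate list,
+  // the fused loop is appended, and the scan restarts from the beginning.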
+ bool Changed = false; + SmallVector Loops; + for (auto L = LI->begin(), Le = LI->end(); L != Le; ++L) + PopulateInnermostLoopsOf(**L, Loops); + + auto L1 = Loops.begin(), L1e = Loops.end(); + while (L1 != L1e) { + auto L2 = Loops.begin(), L2e = Loops.end(); + while (L2 != L2e) { + if (L1 == L2) { + ++L2; + continue; + } + if (run(**L1, **L2)) { + // Remove L1 and L2 from Loops and add FusedLoop. + Loops.erase(L1); + Loops.erase(L2); + Loops.push_back(FusedLoop); + L1 = L2 = Loops.begin(); + L1e = L2e = Loops.end(); + Changed = true; + } else + ++L2; + } + ++L1; + } + + if (LFuseVerify) { + LI->verify(*DT); + DT->verifyDomTree(); + assert((!verifyFunction(F, &dbgs())) && "Function verification failed!"); + } + + return Changed; +} + +char LoopFuse::ID; + +INITIALIZE_PASS_BEGIN(LoopFuse, "loop-fuse", "Loop Fusion", false, false) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(LoopFuse, "loop-fuse", "Loop Fusion", false, false) + +namespace llvm { +FunctionPass *createLoopFusePass() { return new LoopFuse(); } +} diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp index fd22128f7fe6b8..34773d906e0481 100644 --- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp +++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp @@ -32,6 +32,603 @@ static cl::opt DefaultRotationThreshold( "rotation-max-header-size", cl::init(16), cl::Hidden, cl::desc("The default maximum header size for automatic loop rotation")); +STATISTIC(NumRotated, "Number of loops rotated"); + +namespace { +/// A simple loop rotation transformation. +class LoopRotate { + const unsigned MaxHeaderSize; + LoopInfo *LI; + const TargetTransformInfo *TTI; + AssumptionCache *AC; + DominatorTree *DT; + ScalarEvolution *SE; + const SimplifyQuery &SQ; + +public: + LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI, + const TargetTransformInfo *TTI, AssumptionCache *AC, + DominatorTree *DT, ScalarEvolution *SE, const SimplifyQuery &SQ) + : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), + SQ(SQ) {} + bool processLoop(Loop *L); + +private: + bool rotateLoop(Loop *L, bool SimplifiedLatch); + bool simplifyLoopLatch(Loop *L); +}; +} // end anonymous namespace + +/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the +/// old header into the preheader. If there were uses of the values produced by +/// these instruction that were outside of the loop, we have to insert PHI nodes +/// to merge the two values. Do this now. +static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader, + BasicBlock *OrigPreheader, + ValueToValueMapTy &ValueMap, + SmallVectorImpl *InsertedPHIs) { + // Remove PHI node entries that are no longer live. + BasicBlock::iterator I, E = OrigHeader->end(); + for (I = OrigHeader->begin(); PHINode *PN = dyn_cast(I); ++I) + PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader)); + + // Now fix up users of the instructions in OrigHeader, inserting PHI nodes + // as necessary. + SSAUpdater SSA(InsertedPHIs); + for (I = OrigHeader->begin(); I != E; ++I) { + Value *OrigHeaderVal = &*I; + + // If there are no uses of the value (e.g. because it returns void), there + // is nothing to rewrite. 
+ if (OrigHeaderVal->use_empty()) + continue; + + Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal); + + // The value now exits in two versions: the initial value in the preheader + // and the loop "next" value in the original header. + SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName()); + SSA.AddAvailableValue(OrigHeader, OrigHeaderVal); + SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal); + + // Visit each use of the OrigHeader instruction. + for (Value::use_iterator UI = OrigHeaderVal->use_begin(), + UE = OrigHeaderVal->use_end(); + UI != UE;) { + // Grab the use before incrementing the iterator. + Use &U = *UI; + + // Increment the iterator before removing the use from the list. + ++UI; + + // SSAUpdater can't handle a non-PHI use in the same block as an + // earlier def. We can easily handle those cases manually. + Instruction *UserInst = cast(U.getUser()); + if (!isa(UserInst)) { + BasicBlock *UserBB = UserInst->getParent(); + + // The original users in the OrigHeader are already using the + // original definitions. + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped. + if (UserBB == OrigPreheader) { + U = OrigPreHeaderVal; + continue; + } + } + + // Anything else can be handled by SSAUpdater. + SSA.RewriteUse(U); + } + + // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug + // intrinsics. + SmallVector DbgValues; + llvm::findDbgValues(DbgValues, OrigHeaderVal); + for (auto &DbgValue : DbgValues) { + // The original users in the OrigHeader are already using the original + // definitions. + BasicBlock *UserBB = DbgValue->getParent(); + if (UserBB == OrigHeader) + continue; + + // Users in the OrigPreHeader need to use the value to which the + // original definitions are mapped and anything else can be handled by + // the SSAUpdater. To avoid adding PHINodes, check if the value is + // available in UserBB, if not substitute undef. + Value *NewVal; + if (UserBB == OrigPreheader) + NewVal = OrigPreHeaderVal; + else if (SSA.HasValueForBlock(UserBB)) + NewVal = SSA.GetValueInMiddleOfBlock(UserBB); + else + NewVal = UndefValue::get(OrigHeaderVal->getType()); + DbgValue->setOperand(0, + MetadataAsValue::get(OrigHeaderVal->getContext(), + ValueAsMetadata::get(NewVal))); + } + } +} + +/// Propagate dbg.value intrinsics through the newly inserted Phis. +static void insertDebugValues(BasicBlock *OrigHeader, + SmallVectorImpl &InsertedPHIs) { + ValueToValueMapTy DbgValueMap; + + // Map existing PHI nodes to their dbg.values. + for (auto &I : *OrigHeader) { + if (auto DbgII = dyn_cast(&I)) { + if (auto *Loc = dyn_cast_or_null(DbgII->getVariableLocation())) + DbgValueMap.insert({Loc, DbgII}); + } + } + + // Then iterate through the new PHIs and look to see if they use one of the + // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will + // propagate the info through the new PHI. + LLVMContext &C = OrigHeader->getContext(); + for (auto PHI : InsertedPHIs) { + for (auto VI : PHI->operand_values()) { + auto V = DbgValueMap.find(VI); + if (V != DbgValueMap.end()) { + auto *DbgII = cast(V->second); + Instruction *NewDbgII = DbgII->clone(); + auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI)); + NewDbgII->setOperand(0, PhiMAV); + BasicBlock *Parent = PHI->getParent(); + NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime()); + } + } + } +} + +/// Rotate loop LP. Return true if the loop is rotated. 
+/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. +bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { + // If the loop has only one block then there is not much to rotate. + if (L->getBlocks().size() == 1) + return false; + + BasicBlock *OrigHeader = L->getHeader(); + BasicBlock *OrigLatch = L->getLoopLatch(); + + BranchInst *BI = dyn_cast(OrigHeader->getTerminator()); + if (!BI || BI->isUnconditional()) + return false; + + // If the loop header is not one of the loop exiting blocks then + // either this loop is already rotated or it is not + // suitable for loop rotation transformations. + if (!L->isLoopExiting(OrigHeader)) + return false; + + // If the loop latch already contains a branch that leaves the loop then the + // loop is already rotated. + if (!OrigLatch) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) + return false; + + // Check size of original header and reject loop if it is very big or we can't + // duplicate blocks inside it. + { + SmallPtrSet EphValues; + CodeMetrics::collectEphemeralValues(L, AC, EphValues); + + CodeMetrics Metrics; + Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues); + if (Metrics.notDuplicatable) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable" + << " instructions: "; + L->dump()); + return false; + } + if (Metrics.convergent) { + DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent " + "instructions: "; + L->dump()); + return false; + } + if (Metrics.NumInsts > MaxHeaderSize) + return false; + } + + // Now, this loop is suitable for rotation. + BasicBlock *OrigPreheader = L->getLoopPreheader(); + + // If the loop could not be converted to canonical form, it must have an + // indirectbr in it, just give up. + if (!OrigPreheader) + return false; + + if (isa(OrigPreheader->getTerminator())) { + DEBUG(dbgs() << "LoopRotation: Splitting header due to sync terminator.\n"); + BasicBlock *NewPreheader = SplitEdge(OrigPreheader, OrigHeader, DT, LI); + // SyncInst::Create(NewPreheader, OrigPreheader->getTerminator()); + // OrigPreheader->getTerminator()->eraseFromParent(); + OrigPreheader = NewPreheader; + } + + // Anything ScalarEvolution may know about this loop or the PHI nodes + // in its header will soon be invalidated. + if (SE) + SE->forgetLoop(L); + + DEBUG(dbgs() << "LoopRotation: rotating "; L->dump()); + + // Find new Loop header. NewHeader is a Header's one and only successor + // that is inside loop. Header's other successor is outside the + // loop. Otherwise loop is not suitable for rotation. 
+ BasicBlock *Exit = BI->getSuccessor(0); + BasicBlock *NewHeader = BI->getSuccessor(1); + if (L->contains(Exit)) + std::swap(Exit, NewHeader); + assert(NewHeader && "Unable to determine new loop header"); + assert(L->contains(NewHeader) && !L->contains(Exit) && + "Unable to determine loop header and exit blocks"); + + // This code assumes that the new header has exactly one predecessor. + // Remove any single-entry PHI nodes in it. + assert(NewHeader->getSinglePredecessor() && + "New header doesn't have one pred!"); + FoldSingleEntryPHINodes(NewHeader); + + // Begin by walking OrigHeader and populating ValueMap with an entry for + // each Instruction. + BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end(); + ValueToValueMapTy ValueMap; + + // For PHI nodes, the value available in OldPreHeader is just the + // incoming value from OldPreHeader. + for (; PHINode *PN = dyn_cast(I); ++I) + ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader); + + // For the rest of the instructions, either hoist to the OrigPreheader if + // possible or create a clone in the OldPreHeader if not. + TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator(); + + // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication. + using DbgIntrinsicHash = + std::pair, DIExpression *>; + auto makeHash = [](DbgInfoIntrinsic *D) -> DbgIntrinsicHash { + return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()}; + }; + SmallDenseSet DbgIntrinsics; + for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend(); + I != E; ++I) { + if (auto *DII = dyn_cast(&*I)) + DbgIntrinsics.insert(makeHash(DII)); + else + break; + } + + while (I != E) { + Instruction *Inst = &*I++; + + // If the instruction's operands are invariant and it doesn't read or write + // memory, then it is safe to hoist. Doing this doesn't change the order of + // execution in the preheader, but does prevent the instruction from + // executing in each iteration of the loop. This means it is safe to hoist + // something that might trap, but isn't safe to hoist something that reads + // memory (without proving that the loop doesn't write). + if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() && + !Inst->mayWriteToMemory() && !isa(Inst) && + !isa(Inst) && !isa(Inst)) { + Inst->moveBefore(LoopEntryBranch); + continue; + } + + // Otherwise, create a duplicate of the instruction. + Instruction *C = Inst->clone(); + + // Eagerly remap the operands of the instruction. + RemapInstruction(C, ValueMap, + RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); + + // Avoid inserting the same intrinsic twice. + if (auto *DII = dyn_cast(C)) + if (DbgIntrinsics.count(makeHash(DII))) { + C->deleteValue(); + continue; + } + + // With the operands remapped, see if the instruction constant folds or is + // otherwise simplifyable. This commonly occurs because the entry from PHI + // nodes allows icmps and other instructions to fold. + Value *V = SimplifyInstruction(C, SQ); + if (V && LI->replacementPreservesLCSSAForm(C, V)) { + // If so, then delete the temporary instruction and stick the folded value + // in the map. + ValueMap[Inst] = V; + if (!C->mayHaveSideEffects()) { + C->deleteValue(); + C = nullptr; + } + } else { + ValueMap[Inst] = C; + } + if (C) { + // Otherwise, stick the new instruction into the new block! 
+ C->setName(Inst->getName()); + C->insertBefore(LoopEntryBranch); + + if (auto *II = dyn_cast(C)) + if (II->getIntrinsicID() == Intrinsic::assume) + AC->registerAssumption(II); + } + } + + // Along with all the other instructions, we just cloned OrigHeader's + // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's + // successors by duplicating their incoming values for OrigHeader. + TerminatorInst *TI = OrigHeader->getTerminator(); + for (BasicBlock *SuccBB : TI->successors()) + for (BasicBlock::iterator BI = SuccBB->begin(); + PHINode *PN = dyn_cast(BI); ++BI) + PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader); + + // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove + // OrigPreHeader's old terminator (the original branch into the loop), and + // remove the corresponding incoming values from the PHI nodes in OrigHeader. + LoopEntryBranch->eraseFromParent(); + + + SmallVector InsertedPHIs; + // If there were any uses of instructions in the duplicated block outside the + // loop, update them, inserting PHI nodes as required + RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap, + &InsertedPHIs); + + // Attach dbg.value intrinsics to the new phis if that phi uses a value that + // previously had debug metadata attached. This keeps the debug info + // up-to-date in the loop body. + if (!InsertedPHIs.empty()) + insertDebugValues(OrigHeader, InsertedPHIs); + + // NewHeader is now the header of the loop. + L->moveToHeader(NewHeader); + assert(L->getHeader() == NewHeader && "Latch block is our new header"); + + // Inform DT about changes to the CFG. + if (DT) { + // The OrigPreheader branches to the NewHeader and Exit now. Then, inform + // the DT about the removed edge to the OrigHeader (that got removed). + SmallVector Updates; + Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit}); + Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader}); + Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader}); + DT->applyUpdates(Updates); + } + + // At this point, we've finished our major CFG changes. As part of cloning + // the loop into the preheader we've simplified instructions and the + // duplicated conditional branch may now be branching on a constant. If it is + // branching on a constant and if that constant means that we enter the loop, + // then we fold away the cond branch to an uncond branch. This simplifies the + // loop in cases important for nested loops, and it also means we don't have + // to split as many edges. + BranchInst *PHBI = cast(OrigPreheader->getTerminator()); + assert(PHBI->isConditional() && "Should be clone of BI condbr!"); + if (!isa(PHBI->getCondition()) || + PHBI->getSuccessor(cast(PHBI->getCondition())->isZero()) != + NewHeader) { + // The conditional branch can't be folded, handle the general case. + // Split edges as necessary to preserve LoopSimplify form. + + // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and + // thus is not a preheader anymore. + // Split the edge to form a real preheader. + BasicBlock *NewPH = SplitCriticalEdge( + OrigPreheader, NewHeader, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); + NewPH->setName(NewHeader->getName() + ".lr.ph"); + + // Preserve canonical loop form, which means that 'Exit' should have only + // one predecessor. Note that Exit could be an exit block for multiple + // nested loops, causing both of the edges to now be critical and need to + // be split. 
+ SmallVector ExitPreds(pred_begin(Exit), pred_end(Exit)); + bool SplitLatchEdge = false; + for (BasicBlock *ExitPred : ExitPreds) { + // We only need to split loop exit edges. + Loop *PredLoop = LI->getLoopFor(ExitPred); + if (!PredLoop || PredLoop->contains(Exit)) + continue; + if (isa(ExitPred->getTerminator())) + continue; + SplitLatchEdge |= L->getLoopLatch() == ExitPred; + BasicBlock *ExitSplit = SplitCriticalEdge( + ExitPred, Exit, + CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA()); + ExitSplit->moveBefore(Exit); + } + assert(SplitLatchEdge && + "Despite splitting all preds, failed to split latch exit?"); + } else { + // We can fold the conditional branch in the preheader, this makes things + // simpler. The first step is to remove the extra edge to the Exit block. + Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/); + BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI); + NewBI->setDebugLoc(PHBI->getDebugLoc()); + PHBI->eraseFromParent(); + + // With our CFG finalized, update DomTree if it is available. + if (DT) DT->deleteEdge(OrigPreheader, Exit); + } + + assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation"); + assert(L->getLoopLatch() && "Invalid loop latch after loop rotation"); + + // Now that the CFG and DomTree are in a consistent state again, try to merge + // the OrigHeader block into OrigLatch. This will succeed if they are + // connected by an unconditional branch. This is just a cleanup so the + // emitted code isn't too gross in this common case. + MergeBlockIntoPredecessor(OrigHeader, DT, LI); + + DEBUG(dbgs() << "LoopRotation: into "; L->dump()); + + ++NumRotated; + return true; +} + +/// Determine whether the instructions in this range may be safely and cheaply +/// speculated. This is not an important enough situation to develop complex +/// heuristics. We handle a single arithmetic instruction along with any type +/// conversions. +static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, + BasicBlock::iterator End, Loop *L) { + bool seenIncrement = false; + bool MultiExitLoop = false; + + if (!L->getExitingBlock()) + MultiExitLoop = true; + + for (BasicBlock::iterator I = Begin; I != End; ++I) { + + if (!isSafeToSpeculativelyExecute(&*I)) + return false; + + if (isa(I)) + continue; + + switch (I->getOpcode()) { + default: + return false; + case Instruction::GetElementPtr: + // GEPs are cheap if all indices are constant. + if (!cast(I)->hasAllConstantIndices()) + return false; + // fall-thru to increment case + LLVM_FALLTHROUGH; + case Instruction::Add: + case Instruction::Sub: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: { + Value *IVOpnd = + !isa(I->getOperand(0)) + ? I->getOperand(0) + : !isa(I->getOperand(1)) ? I->getOperand(1) : nullptr; + if (!IVOpnd) + return false; + + // If increment operand is used outside of the loop, this speculation + // could cause extra live range interference. + if (MultiExitLoop) { + for (User *UseI : IVOpnd->users()) { + auto *UserInst = cast(UseI); + if (!L->contains(UserInst)) + return false; + } + } + + if (seenIncrement) + return false; + seenIncrement = true; + break; + } + case Instruction::Trunc: + case Instruction::ZExt: + case Instruction::SExt: + // ignore type conversions + break; + } + } + return true; +} + +/// Fold the loop tail into the loop exit by speculating the loop tail +/// instructions. Typically, this is a single post-increment. 
In the case of a +/// simple 2-block loop, hoisting the increment can be much better than +/// duplicating the entire loop header. In the case of loops with early exits, +/// rotation will not work anyway, but simplifyLoopLatch will put the loop in +/// canonical form so downstream passes can handle it. +/// +/// I don't believe this invalidates SCEV. +bool LoopRotate::simplifyLoopLatch(Loop *L) { + BasicBlock *Latch = L->getLoopLatch(); + if (!Latch || Latch->hasAddressTaken()) + return false; + + BranchInst *Jmp = dyn_cast(Latch->getTerminator()); + if (!Jmp || !Jmp->isUnconditional()) + return false; + + BasicBlock *LastExit = Latch->getSinglePredecessor(); + if (!LastExit || !L->isLoopExiting(LastExit)) + return false; + + BranchInst *BI = dyn_cast(LastExit->getTerminator()); + if (!BI) + return false; + + if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L)) + return false; + + DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " + << LastExit->getName() << "\n"); + + // Hoist the instructions from Latch into LastExit. + LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(), + Latch->begin(), Jmp->getIterator()); + + unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1; + BasicBlock *Header = Jmp->getSuccessor(0); + assert(Header == L->getHeader() && "expected a backward branch"); + + // Remove Latch from the CFG so that LastExit becomes the new Latch. + BI->setSuccessor(FallThruPath, Header); + Latch->replaceSuccessorsPhiUsesWith(LastExit); + Jmp->eraseFromParent(); + + // Nuke the Latch block. + assert(Latch->empty() && "unable to evacuate Latch"); + LI->removeBlock(Latch); + if (DT) + DT->eraseNode(Latch); + Latch->eraseFromParent(); + return true; +} + +/// Rotate \c L, and return true if any modification was made. +bool LoopRotate::processLoop(Loop *L) { + // Save the loop metadata. + MDNode *LoopMD = L->getLoopID(); + + // Simplify the loop latch before attempting to rotate the header + // upward. Rotation may not be needed if the loop tail can be folded into the + // loop exit. + bool SimplifiedLatch = simplifyLoopLatch(L); + + bool MadeChange = rotateLoop(L, SimplifiedLatch); + assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) && + "Loop latch should be exiting after loop-rotate."); + + // Restore the loop metadata. + // NB! We presume LoopRotation DOESN'T ADD its own metadata. + if ((MadeChange || SimplifiedLatch) && LoopMD) + L->setLoopID(LoopMD); + + return MadeChange || SimplifiedLatch; +} + LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication) : EnableHeaderDuplication(EnableHeaderDuplication) {} diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp index 2f6ed05c023b1e..c4dccc91b54056 100644 --- a/llvm/lib/Transforms/Scalar/SCCP.cpp +++ b/llvm/lib/Transforms/Scalar/SCCP.cpp @@ -611,6 +611,14 @@ class SCCPSolver : public InstVisitor { void visitReturnInst(ReturnInst &I); void visitTerminator(Instruction &TI); + void visitReattachInst(ReattachInst &I) { + markOverdefined(&I); + visitTerminator(I); + } + void visitSyncInst(SyncInst &I) { + markOverdefined(&I); + visitTerminator(I); + } void visitCastInst(CastInst &I); void visitSelectInst(SelectInst &I); @@ -734,6 +742,13 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI, return; } + if (isa(&TI) || + isa(&TI) || + isa(&TI)) { + // All destinations are executable. 
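+    // (A detach has two successors -- the detached block and the
+    // continuation -- while reattach and sync each have exactly one.)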
+ Succs.assign(TI.getNumSuccessors(), true); + return; + } LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n'); llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } @@ -745,6 +760,66 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) { // be more aggressive and try to consider edges which haven't been marked // yet, but there isn't any need.) return KnownFeasibleEdges.count(Edge(From, To)); + assert(BBExecutable.count(To) && "Dest should always be alive!"); + + // Make sure the source basic block is executable!! + if (!BBExecutable.count(From)) return false; + + // Check to make sure this edge itself is actually feasible now. + TerminatorInst *TI = From->getTerminator(); + if (auto *BI = dyn_cast(TI)) { + if (BI->isUnconditional()) + return true; + + LatticeVal BCValue = getValueState(BI->getCondition()); + + // Overdefined condition variables mean the branch could go either way, + // undef conditions mean that neither edge is feasible yet. + ConstantInt *CI = BCValue.getConstantInt(); + if (!CI) + return !BCValue.isUnknown(); + + // Constant condition variables mean the branch can only go a single way. + return BI->getSuccessor(CI->isZero()) == To; + } + + // Unwinding instructions successors are always executable. + if (TI->isExceptional()) + return true; + + if (auto *SI = dyn_cast(TI)) { + if (SI->getNumCases() < 1) + return true; + + LatticeVal SCValue = getValueState(SI->getCondition()); + ConstantInt *CI = SCValue.getConstantInt(); + + if (!CI) + return !SCValue.isUnknown(); + + return SI->findCaseValue(CI)->getCaseSuccessor() == To; + } + + // In case of indirect branch and its address is a blockaddress, we mark + // the target as executable. + if (auto *IBR = dyn_cast(TI)) { + LatticeVal IBRValue = getValueState(IBR->getAddress()); + BlockAddress *Addr = IBRValue.getBlockAddress(); + + if (!Addr) + return !IBRValue.isUnknown(); + + // At this point, the indirectbr is branching on a blockaddress. + return Addr->getBasicBlock() == To; + } + + if (isa(TI) || + isa(TI) || + isa(TI)) + return true; + + LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n'); + llvm_unreachable("SCCP: Don't know how to handle this terminator!"); } // visit Implementations - Something changed in this instruction, either an diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 68ca6c47c8f1a4..cef9cac89db330 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -3867,6 +3867,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { // a direct store) as needing to be resplit because it is no longer // promotable. 
if (AllocaInst *OtherAI = dyn_cast(StoreBasePtr)) { + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(OtherAI, *DT)) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -3983,6 +3986,9 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) { if (!SplitLoads) { if (AllocaInst *OtherAI = dyn_cast(LoadBasePtr)) { assert(OtherAI != &AI && "We can't re-split our own alloca!"); + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(OtherAI, *DT)) && + "Alloca must be promotable"); ResplitPromotableAllocas.insert(OtherAI); Worklist.insert(OtherAI); } else if (AllocaInst *OtherAI = dyn_cast( @@ -4152,9 +4158,16 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, break; } + // Check if any detaches block promotion. + Promotable &= (!FunctionContainsDetach || + isAllocaParallelPromotable(NewAI, *DT)); + if (Promotable) { if (PHIUsers.empty() && SelectUsers.empty()) { // Promote the alloca. + assert((!FunctionContainsDetach || + isAllocaParallelPromotable(NewAI, *DT)) && + "Alloca must be promotable"); PromotableAllocas.push_back(NewAI); } else { // If we have either PHIs or Selects to speculate, add them to those @@ -4496,11 +4509,28 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT, DT = &RunDT; AC = &RunAC; - BasicBlock &EntryBB = F.getEntryBlock(); - for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); - I != E; ++I) { - if (AllocaInst *AI = dyn_cast(I)) - Worklist.insert(AI); + // BasicBlock &EntryBB = F.getEntryBlock(); + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. 
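+  // A block is the entry of a detached sub-CFG when its unique predecessor is
+  // terminated by a detach whose detached successor is that block.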
+ SmallVector EntryBlocks; + FunctionContainsDetach = false; + EntryBlocks.push_back(&F.getEntryBlock()); + for (BasicBlock &BB : F) + if (BasicBlock *Pred = BB.getUniquePredecessor()) + if (DetachInst *DI = dyn_cast(Pred->getTerminator())) { + FunctionContainsDetach = true; + if (DI->getDetached() == &BB) + EntryBlocks.push_back(&BB); + } + + for (BasicBlock *BB : EntryBlocks) { + BasicBlock &EntryBB = *BB; + for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end()); + I != E; ++I) { + if (AllocaInst *AI = dyn_cast(I)) + Worklist.insert(AI); + } } bool Changed = false; diff --git a/llvm/lib/Transforms/Scalar/Scalar.cpp b/llvm/lib/Transforms/Scalar/Scalar.cpp index 976daf4c78c2fd..67571aeeaf12c6 100644 --- a/llvm/lib/Transforms/Scalar/Scalar.cpp +++ b/llvm/lib/Transforms/Scalar/Scalar.cpp @@ -65,6 +65,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeLegacyLoopSinkPassPass(Registry); initializeLoopDataPrefetchLegacyPassPass(Registry); initializeLoopDeletionLegacyPassPass(Registry); + initializeLoopFusePass(Registry); initializeLoopAccessLegacyAnalysisPass(Registry); initializeLoopInstSimplifyLegacyPassPass(Registry); initializeLoopInterchangePass(Registry); diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp index b7b1db76b49237..f60e856a4d4285 100644 --- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -38,6 +38,8 @@ #include "llvm/IR/Module.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Scalar/SimplifyCFG.h" #include @@ -143,6 +145,71 @@ static bool mergeEmptyReturnBlocks(Function &F) { return Changed; } +static bool removeUselessSyncs(Function &F) { + bool Changed = false; + // Scan all the blocks in the function + check: + for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; ) { + BasicBlock *BB = &*BBI++; + if (SyncInst *Sync = dyn_cast(BB->getTerminator())) { + // Walk the CFG backwards to try to find a reaching detach instruction. + bool ReachingDetach = false; + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(BB); + while (!WorkList.empty()) { + BasicBlock *PBB = WorkList.pop_back_val(); + if (!Visited.insert(PBB).second) + continue; + + for (pred_iterator PI = pred_begin(PBB), PE = pred_end(PBB); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + TerminatorInst *PT = Pred->getTerminator(); + // Stop the traversal at the entry block of a detached CFG. + if (DetachInst *DI = dyn_cast(PT)) { + if (DI->getDetached() == PBB) + continue; + else // DI->getContinue() == PBB + // This detach reaches the sync through the continuation edge. + ReachingDetach = true; + } + if (ReachingDetach) + break; + + // Ignore predecessors via a reattach, which belong to child detached + // contexts. + if (isa(PT)) + continue; + + // For a predecessor terminated by a sync instruction, check the sync + // region it belongs to. If the sync belongs to a different sync + // region, add the block that starts that region. Otherwise, ignore + // the predecessor. 
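+          // (A sync in the same region already waits on every detach that
+          // reaches it, so those detaches cannot make this sync necessary.)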
+ if (SyncInst *SI = dyn_cast(PT)) { + if (SI->getSyncRegion() != Sync->getSyncRegion()) + for (User *U : SI->getSyncRegion()->users()) + if (isa(U)) + WorkList.push_back(cast(U)->getParent()); + continue; + } + + WorkList.push_back(Pred); + } + } + + // If no detach reaches this sync, then this sync can be removed. + if (!ReachingDetach) { + BasicBlock* Succ = Sync->getSuccessor(0); + ReplaceInstWithInst(Sync, BranchInst::Create(Succ)); + Changed = true; + if (MergeBlockIntoPredecessor(Succ)) goto check; + } + } + } + return Changed; +} + /// Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, @@ -176,6 +243,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, bool EverChanged = removeUnreachableBlocks(F); EverChanged |= mergeEmptyReturnBlocks(F); EverChanged |= iterativelySimplifyCFG(F, TTI, Options); + EverChanged |= removeUselessSyncs(F); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -191,6 +259,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI, do { EverChanged = iterativelySimplifyCFG(F, TTI, Options); EverChanged |= removeUnreachableBlocks(F); + EverChanged |= removeUselessSyncs(F); } while (EverChanged); return true; diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 0f6db21f73b60e..7a24ab744b4b75 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -684,6 +684,38 @@ static bool eliminateRecursiveTailCall( return true; } +static void getReturnBlocksToSync( + BasicBlock *Entry, SyncInst *Sync, + SmallVectorImpl &ReturnBlocksToSync) { + // Walk the CFG from the entry block, stopping traversal at any sync within + // the same region. Record all blocks found that are terminated by a return + // instruction. + Value *SyncRegion = Sync->getSyncRegion(); + SmallVector WorkList; + SmallPtrSet Visited; + WorkList.push_back(Entry); + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Skip paths that are synced within the same region. + if (SyncInst *SI = dyn_cast(BB->getTerminator())) + if (SI->getSyncRegion() == SyncRegion) + continue; + + // If we find a return, we must add a sync before it if we eliminate a + // recursive tail call. + if (isa(BB->getTerminator())) + ReturnBlocksToSync.push_back(BB); + + // Queue up successors to search. + for (BasicBlock *Succ : successors(BB)) + if (Succ != Sync->getParent()) + WorkList.push_back(Succ); + } +} + static bool foldReturnAndProcessPred( BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail, SmallVectorImpl &ArgumentPHIs, @@ -700,13 +732,17 @@ static bool foldReturnAndProcessPred( // predecessors and perform TRE there. Look for predecessors that end // in unconditional branch and recursive call(s). 
SmallVector UncondBranchPreds; + SmallVector SyncPreds; for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *Pred = *PI; Instruction *PTI = Pred->getTerminator(); if (BranchInst *BI = dyn_cast(PTI)) if (BI->isUnconditional()) UncondBranchPreds.push_back(BI); + if (SyncInst *SI = dyn_cast(PTI)) + SyncPreds.push_back(SI); } + BasicBlock *OldEntryBlock = &BB->getParent()->getEntryBlock(); while (!UncondBranchPreds.empty()) { BranchInst *BI = UncondBranchPreds.pop_back_val(); @@ -730,6 +766,68 @@ static bool foldReturnAndProcessPred( } } + // If this loop runs, then the previous one could not have erased BB, because + // BB has a predecessor that is not an unconditional branch. + while (!SyncPreds.empty()) { + SyncInst *SI = SyncPreds.pop_back_val(); + BasicBlock *Pred = SI->getParent(); + if (CallInst *CI = + findTRECandidate(SI, CannotTailCallElimCallsMarkedTail, TTI)) { + // Check that all instructions between the candidate tail call and the + // sync can be moved above the call. In particular, we disallow + // accumulator recursion elimination for tail calls before a sync. + BasicBlock::iterator BBI(CI); + for (++BBI; &*BBI != SI; ++BBI) + if (!canMoveAboveCall(&*BBI, CI, AA)) + break; + if (&*BBI != SI) + continue; + + // Get the sync region for this sync. + Value *SyncRegion = SI->getSyncRegion(); + + // Check that the sync region begins in the entry block of the function. + if (cast(SyncRegion)->getParent() != OldEntryBlock) { + DEBUG(dbgs() << "Cannot eliminate tail call " << *CI << + ": sync region does not start in entry block."); + continue; + } + + // Get returns reachable from newly created loop. + SmallVector ReturnBlocksToSync; + getReturnBlocksToSync(OldEntryBlock, SI, ReturnBlocksToSync); + + // Remove the sync. + ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred); + + // Cleanup: if all predecessors of BB have been eliminated by + // FoldReturnIntoUncondBranch, delete it. It is important to empty it, + // because the ret instruction in there is still using a value which + // eliminateRecursiveTailCall will attempt to remove. + if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB)) + BB->eraseFromParent(); + + bool EliminatedTail = + eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail, + ArgumentPHIs, AA); + + // If a recursive tail was eliminated, fix up the syncs and sync region in + // the CFG. + if (EliminatedTail) { + // Move the sync region start to the new entry block. + BasicBlock *NewEntry = &OldEntry->getParent()->getEntryBlock(); + cast(SyncRegion)->moveBefore(&*(NewEntry->begin())); + // Insert syncs before relevant return blocks. 
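+        // Each such return block is split so that the sync becomes its new
+        // terminator and the original return moves to the split-off block.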
+ for (BasicBlock *RetBlock : ReturnBlocksToSync) { + BasicBlock *NewRetBlock = SplitBlock(RetBlock, + RetBlock->getTerminator()); + ReplaceInstWithInst(RetBlock->getTerminator(), + SyncInst::Create(NewRetBlock, SyncRegion)); + } + Change = true; + } + } + } return Change; } diff --git a/llvm/lib/Transforms/Tapir/CMakeLists.txt b/llvm/lib/Transforms/Tapir/CMakeLists.txt new file mode 100644 index 00000000000000..568558d64e84ae --- /dev/null +++ b/llvm/lib/Transforms/Tapir/CMakeLists.txt @@ -0,0 +1,18 @@ +add_llvm_library(LLVMTapirOpts + CilkABI.cpp + SmallBlock.cpp + RedundantSpawn.cpp + SpawnRestructure.cpp + SpawnUnswitch.cpp + SyncElimination.cpp + LowerToCilk.cpp + LoopSpawning.cpp + Outline.cpp + Tapir.cpp + + ADDITIONAL_HEADER_DIRS + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms + ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms/Tapir + ) + +add_dependencies(LLVMTapirOpts intrinsics_gen) diff --git a/llvm/lib/Transforms/Tapir/CilkABI.cpp b/llvm/lib/Transforms/Tapir/CilkABI.cpp new file mode 100644 index 00000000000000..bf679d2e0c5377 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/CilkABI.cpp @@ -0,0 +1,1344 @@ +//===- CilkABI.cpp - Lower Tapir into Cilk runtime system calls -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the CilkABI interface, which is used to convert Tapir +// instructions -- detach, reattach, and sync -- to calls into the Cilk +// runtime system. This interface does the low-level dirty work of passes +// such as LowerToCilk. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/EscapeEnumerator.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "cilkabi" + +/// Helper typedefs for cilk struct TypeBuilders. +typedef llvm::TypeBuilder<__cilkrts_stack_frame, false> StackFrameBuilder; +typedef llvm::TypeBuilder<__cilkrts_worker, false> WorkerBuilder; +typedef llvm::TypeBuilder<__cilkrts_pedigree, false> PedigreeBuilder; + +/// Helper methods for storing to and loading from struct fields. +static Value *GEP(IRBuilder<> &B, Value *Base, int field) { + // return B.CreateStructGEP(cast(Base->getType()), + // Base, field); + return B.CreateConstInBoundsGEP2_32(nullptr, Base, 0, field); +} + +static void StoreField(IRBuilder<> &B, Value *Val, Value *Dst, int field, + bool isVolatile = false) { + B.CreateStore(Val, GEP(B, Dst, field), isVolatile); +} + +static Value *LoadField(IRBuilder<> &B, Value *Src, int field, + bool isVolatile = false) { + return B.CreateLoad(GEP(B, Src, field), isVolatile); +} + +/// \brief Emit inline assembly code to save the floating point +/// state, for x86 Only. 
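+/// The two values are stored into the frame's mxcsr and fpcsr fields;
+/// EmitCilkSetJmp calls this before saving the jump buffer in the same frame.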
+static void EmitSaveFloatingPointState(IRBuilder<> &B, Value *SF) { + typedef void (AsmPrototype)(uint32_t*, uint16_t*); + llvm::FunctionType *FTy = + TypeBuilder::get(B.getContext()); + + Value *Asm = InlineAsm::get(FTy, + "stmxcsr $0\n\t" "fnstcw $1", + "*m,*m,~{dirflag},~{fpsr},~{flags}", + /*sideeffects*/ true); + + Value * args[2] = { + GEP(B, SF, StackFrameBuilder::mxcsr), + GEP(B, SF, StackFrameBuilder::fpcsr) + }; + + B.CreateCall(Asm, args); +} + +/// \brief Helper to find a function with the given name, creating it if it +/// doesn't already exist. If the function needed to be created then return +/// false, signifying that the caller needs to add the function body. +template +static bool GetOrCreateFunction(const char *FnName, Module& M, + Function *&Fn, + Function::LinkageTypes Linkage = + Function::InternalLinkage, + bool DoesNotThrow = true) { + LLVMContext &Ctx = M.getContext(); + + Fn = M.getFunction(FnName); + + // if the function already exists then let the + // caller know that it is complete + if (Fn) + return true; + + // Otherwise we have to create it + FunctionType *FTy = TypeBuilder::get(Ctx); + Fn = Function::Create(FTy, Linkage, FnName, &M); + + // Set nounwind if it does not throw. + if (DoesNotThrow) + Fn->setDoesNotThrow(); + + // and let the caller know that the function is incomplete + // and the body still needs to be added + return false; +} + +/// \brief Emit a call to the CILK_SETJMP function. +static CallInst *EmitCilkSetJmp(IRBuilder<> &B, Value *SF, Module& M) { + LLVMContext &Ctx = M.getContext(); + + // We always want to save the floating point state too + EmitSaveFloatingPointState(B, SF); + + Type *Int32Ty = Type::getInt32Ty(Ctx); + Type *Int8PtrTy = Type::getInt8PtrTy(Ctx); + + // Get the buffer to store program state + // Buffer is a void**. + Value *Buf = GEP(B, SF, StackFrameBuilder::ctx); + + // Store the frame pointer in the 0th slot + Value *FrameAddr = + B.CreateCall(Intrinsic::getDeclaration(&M, Intrinsic::frameaddress), + ConstantInt::get(Int32Ty, 0)); + + Value *FrameSaveSlot = GEP(B, Buf, 0); + B.CreateStore(FrameAddr, FrameSaveSlot, /*isVolatile=*/true); + + // Store stack pointer in the 2nd slot + Value *StackAddr = B.CreateCall( + Intrinsic::getDeclaration(&M, Intrinsic::stacksave)); + + Value *StackSaveSlot = GEP(B, Buf, 2); + B.CreateStore(StackAddr, StackSaveSlot, /*isVolatile=*/true); + + Buf = B.CreateBitCast(Buf, Int8PtrTy); + + // Call LLVM's EH setjmp, which is lightweight. + Value* F = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); + + CallInst *SetjmpCall = B.CreateCall(F, Buf); + SetjmpCall->setCanReturnTwice(); + + return SetjmpCall; +} + +/// \brief Get or create a LLVM function for __cilkrts_pop_frame. 
+/// It is equivalent to the following C code +/// +/// __cilkrts_pop_frame(__cilkrts_stack_frame *sf) { +/// sf->worker->current_stack_frame = sf->call_parent; +/// sf->call_parent = 0; +/// } +static Function *Get__cilkrts_pop_frame(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_pop_frame", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // sf->worker->current_stack_frame = sf.call_parent; + StoreField(B, + LoadField(B, SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true), + LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true), + WorkerBuilder::current_stack_frame, + /*isVolatile=*/true); + + // sf->call_parent = 0; + StoreField(B, + Constant::getNullValue( + TypeBuilder<__cilkrts_stack_frame*, false>::get(Ctx)), + SF, StackFrameBuilder::call_parent, /*isVolatile=*/true); + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_detach. +/// It is equivalent to the following C code +/// +/// void __cilkrts_detach(struct __cilkrts_stack_frame *sf) { +/// struct __cilkrts_worker *w = sf->worker; +/// struct __cilkrts_stack_frame *volatile *tail = w->tail; +/// +/// sf->spawn_helper_pedigree = w->pedigree; +/// sf->call_parent->parent_pedigree = w->pedigree; +/// +/// w->pedigree.rank = 0; +/// w->pedigree.next = &sf->spawn_helper_pedigree; +/// +/// *tail++ = sf->call_parent; +/// w->tail = tail; +/// +/// sf->flags |= CILK_FRAME_DETACHED; +/// } +static Function *Get__cilkrts_detach(Module &M) { + Function *Fn = 0; + + if (GetOrCreateFunction("__cilkrts_detach", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + IRBuilder<> B(Entry); + + // struct __cilkrts_worker *w = sf->worker; + Value *W = LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true); + + // __cilkrts_stack_frame *volatile *tail = w->tail; + Value *Tail = LoadField(B, W, WorkerBuilder::tail, + /*isVolatile=*/true); + + // sf->spawn_helper_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + SF, StackFrameBuilder::parent_pedigree); + + // sf->call_parent->parent_pedigree = w->pedigree; + StoreField(B, + LoadField(B, W, WorkerBuilder::pedigree), + LoadField(B, SF, StackFrameBuilder::call_parent), + StackFrameBuilder::parent_pedigree); + + // w->pedigree.rank = 0; + { + StructType *STy = PedigreeBuilder::get(Ctx); + llvm::Type *Ty = STy->getElementType(PedigreeBuilder::rank); + StoreField(B, + ConstantInt::get(Ty, 0), + GEP(B, W, WorkerBuilder::pedigree), + PedigreeBuilder::rank); + } + + // w->pedigree.next = &sf->spawn_helper_pedigree; + StoreField(B, + GEP(B, SF, StackFrameBuilder::parent_pedigree), + GEP(B, W, WorkerBuilder::pedigree), + PedigreeBuilder::next); + + // *tail++ = sf->call_parent; + B.CreateStore(LoadField(B, SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true), + Tail, /*isVolatile=*/true); + Tail = B.CreateConstGEP1_32(Tail, 1); + + // w->tail = tail; + StoreField(B, Tail, W, WorkerBuilder::tail, /*isVolatile=*/true); + + // sf->flags |= CILK_FRAME_DETACHED; + { + Value *F = LoadField(B, SF, 
StackFrameBuilder::flags, /*isVolatile=*/true); + F = B.CreateOr(F, ConstantInt::get(F->getType(), CILK_FRAME_DETACHED)); + StoreField(B, F, SF, StackFrameBuilder::flags, /*isVolatile=*/true); + } + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilk_sync. +/// Calls to this function is always inlined, as it saves +/// the current stack/frame pointer values. This function must be marked +/// as returns_twice to allow it to be inlined, since the call to setjmp +/// is marked returns_twice. +/// +/// It is equivalent to the following C code +/// +/// void __cilk_sync(struct __cilkrts_stack_frame *sf) { +/// if (sf->flags & CILK_FRAME_UNSYNCHED) { +/// sf->parent_pedigree = sf->worker->pedigree; +/// SAVE_FLOAT_STATE(*sf); +/// if (!CILK_SETJMP(sf->ctx)) +/// __cilkrts_sync(sf); +/// else if (sf->flags & CILK_FRAME_EXCEPTING) +/// __cilkrts_rethrow(sf); +/// } +/// ++sf->worker->pedigree.rank; +/// } +/// +/// With exceptions disabled in the compiler, the function +/// does not call __cilkrts_rethrow() +static Function *GetCilkSyncFn(Module &M, bool instrument = false) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilk_sync", M, Fn, + Function::InternalLinkage, + /*doesNotThrow*/false)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "cilk.sync.test", Fn); + BasicBlock *SaveState = BasicBlock::Create(Ctx, "cilk.sync.savestate", Fn); + BasicBlock *SyncCall = BasicBlock::Create(Ctx, "cilk.sync.runtimecall", Fn); + BasicBlock *Excepting = BasicBlock::Create(Ctx, "cilk.sync.excepting", Fn); + // TODO: Detect whether exceptions are needed. 
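The blocks being built here implement the bit tests from the C sketch above: __cilk_sync does real work only when CILK_FRAME_UNSYNCHED is set, and rethrows only when CILK_FRAME_EXCEPTING is set. The actual bit values come from the Cilk runtime definitions this file includes; the constants in the following standalone C++ sketch are placeholders chosen only to make the two predicates concrete.

    #include <cassert>
    #include <cstdint>

    // Placeholder bit assignments for illustration; the real values are
    // defined by the Cilk runtime ABI, not by this sketch.
    constexpr uint32_t kFrameUnsynched = 0x02;
    constexpr uint32_t kFrameDetached  = 0x04;
    constexpr uint32_t kFrameExcepting = 0x10;

    // __cilk_sync only calls into the runtime when children may be outstanding.
    bool needsRuntimeSync(uint32_t flags) { return (flags & kFrameUnsynched) != 0; }

    // After returning from the runtime via setjmp, rethrow a pending exception.
    bool needsRethrow(uint32_t flags) { return (flags & kFrameExcepting) != 0; }

    int main() {
      uint32_t flags = kFrameUnsynched | kFrameDetached;
      assert(needsRuntimeSync(flags) && !needsRethrow(flags));
      return 0;
    }

In the IR generated below, the same tests appear as an 'and' with the flag constant followed by a compare against zero, with the zero case branching straight to the exit block.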
+ BasicBlock *Rethrow = BasicBlock::Create(Ctx, "cilk.sync.rethrow", Fn); + BasicBlock *Exit = BasicBlock::Create(Ctx, "cilk.sync.end", Fn); + + // Entry + { + IRBuilder<> B(Entry); + + if (instrument) + // cilk_sync_begin + B.CreateCall(CILK_CSI_FUNC(sync_begin, M), SF); + + // if (sf->flags & CILK_FRAME_UNSYNCHED) + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = B.CreateAnd(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_UNSYNCHED)); + Value *Zero = ConstantInt::get(Flags->getType(), 0); + Value *Unsynced = B.CreateICmpEQ(Flags, Zero); + B.CreateCondBr(Unsynced, Exit, SaveState); + } + + // SaveState + { + IRBuilder<> B(SaveState); + + // sf.parent_pedigree = sf.worker->pedigree; + StoreField(B, + LoadField(B, LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true), + WorkerBuilder::pedigree), + SF, StackFrameBuilder::parent_pedigree); + + // if (!CILK_SETJMP(sf.ctx)) + Value *C = EmitCilkSetJmp(B, SF, M); + C = B.CreateICmpEQ(C, ConstantInt::get(C->getType(), 0)); + B.CreateCondBr(C, SyncCall, Excepting); + } + + // SyncCall + { + IRBuilder<> B(SyncCall); + + // __cilkrts_sync(&sf); + B.CreateCall(CILKRTS_FUNC(sync, M), SF); + B.CreateBr(Exit); + } + + // Excepting + { + IRBuilder<> B(Excepting); + if (Rethrow) { + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = B.CreateAnd(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_EXCEPTING)); + Value *Zero = ConstantInt::get(Flags->getType(), 0); + Value *CanExcept = B.CreateICmpEQ(Flags, Zero); + B.CreateCondBr(CanExcept, Exit, Rethrow); + } else { + B.CreateBr(Exit); + } + } + + // Rethrow + if (Rethrow) { + IRBuilder<> B(Rethrow); + B.CreateCall(CILKRTS_FUNC(rethrow, M), SF)->setDoesNotReturn(); + B.CreateUnreachable(); + } + + // Exit + { + IRBuilder<> B(Exit); + + // ++sf.worker->pedigree.rank; + Value *Rank = LoadField(B, SF, StackFrameBuilder::worker, + /*isVolatile=*/true); + Rank = GEP(B, Rank, WorkerBuilder::pedigree); + Rank = GEP(B, Rank, PedigreeBuilder::rank); + B.CreateStore(B.CreateAdd( + B.CreateLoad(Rank), + ConstantInt::get(Rank->getType()->getPointerElementType(), + 1)), + Rank); + if (instrument) + // cilk_sync_end + B.CreateCall(CILK_CSI_FUNC(sync_end, M), SF); + + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::AlwaysInline); + Fn->addFnAttr(Attribute::ReturnsTwice); + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_enter_frame. 
+/// It is equivalent to the following C code +/// +/// void __cilkrts_enter_frame_1(struct __cilkrts_stack_frame *sf) +/// { +/// struct __cilkrts_worker *w = __cilkrts_get_tls_worker(); +/// if (w == 0) { /* slow path, rare */ +/// w = __cilkrts_bind_thread_1(); +/// sf->flags = CILK_FRAME_LAST | CILK_FRAME_VERSION; +/// } else { +/// sf->flags = CILK_FRAME_VERSION; +/// } +/// sf->call_parent = w->current_stack_frame; +/// sf->worker = w; +/// /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */ +/// w->current_stack_frame = sf; +/// } +static Function *Get__cilkrts_enter_frame_1(Module &M) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilkrts_enter_frame_1", M, Fn)) + return Fn; + + LLVMContext &Ctx = M.getContext(); + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + BasicBlock *SlowPath = BasicBlock::Create(Ctx, "slowpath", Fn); + BasicBlock *FastPath = BasicBlock::Create(Ctx, "fastpath", Fn); + BasicBlock *Cont = BasicBlock::Create(Ctx, "cont", Fn); + + llvm::PointerType *WorkerPtrTy = + TypeBuilder<__cilkrts_worker*, false>::get(Ctx); + StructType *SFTy = StackFrameBuilder::get(Ctx); + + // Block (Entry) + CallInst *W = nullptr; + { + IRBuilder<> B(Entry); + if (fastCilk) + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M)); + else + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M)); + + Value *Cond = B.CreateICmpEQ(W, ConstantPointerNull::get(WorkerPtrTy)); + B.CreateCondBr(Cond, SlowPath, FastPath); + } + // Block (SlowPath) + CallInst *Wslow = nullptr; + { + IRBuilder<> B(SlowPath); + Wslow = B.CreateCall(CILKRTS_FUNC(bind_thread_1, M)); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_LAST | CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + B.CreateBr(Cont); + } + // Block (FastPath) + { + IRBuilder<> B(FastPath); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + B.CreateBr(Cont); + } + // Block (Cont) + { + IRBuilder<> B(Cont); + Value *Wfast = W; + PHINode *W = B.CreatePHI(WorkerPtrTy, 2); + W->addIncoming(Wslow, SlowPath); + W->addIncoming(Wfast, FastPath); + + StoreField(B, + LoadField(B, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true), + SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true); + + StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true); + StoreField(B, SF, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true); + + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +/// \brief Get or create a LLVM function for __cilkrts_enter_frame_fast. 
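Get__cilkrts_enter_frame_1 above emits the rare slow path that binds a worker to the current thread the first time Cilk code runs on it, and the common fast path that simply reuses the worker returned by the TLS lookup. The following standalone C++ analogy uses a thread_local cache to show the same fast/slow split; Worker, getWorker and bindThreadSlow are invented names that stand in for the runtime's __cilkrts_get_tls_worker and __cilkrts_bind_thread_1, which this pass only declares and calls.

    #include <cstdio>

    struct Worker { int id; };

    Worker *bindThreadSlow() {              // stands in for __cilkrts_bind_thread_1
      static thread_local Worker w{0};
      std::puts("slow path: binding a worker to this thread");
      return &w;
    }

    Worker *getWorker() {                   // stands in for the TLS worker lookup
      static thread_local Worker *cached = nullptr;
      if (!cached)                          // slow path, taken once per thread
        cached = bindThreadSlow();
      return cached;                        // fast path on every later call
    }

    int main() {
      Worker *a = getWorker();              // binds
      Worker *b = getWorker();              // reuses the bound worker
      return a == b ? 0 : 1;
    }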
+/// It is equivalent to the following C code +/// +/// void __cilkrts_enter_frame_fast_1(struct __cilkrts_stack_frame *sf) +/// { +/// struct __cilkrts_worker *w = __cilkrts_get_tls_worker(); +/// sf->flags = CILK_FRAME_VERSION; +/// sf->call_parent = w->current_stack_frame; +/// sf->worker = w; +/// /* sf->except_data is only valid when CILK_FRAME_EXCEPTING is set */ +/// w->current_stack_frame = sf; +/// } +static Function *Get__cilkrts_enter_frame_fast_1(Module &M) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilkrts_enter_frame_fast_1", M, Fn)) + return Fn; + + LLVMContext &Ctx = M.getContext(); + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); + + IRBuilder<> B(Entry); + Value *W; + + if (fastCilk) + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker_fast, M)); + else + W = B.CreateCall(CILKRTS_FUNC(get_tls_worker, M)); + + StructType *SFTy = StackFrameBuilder::get(Ctx); + llvm::Type *Ty = SFTy->getElementType(StackFrameBuilder::flags); + + StoreField(B, + ConstantInt::get(Ty, CILK_FRAME_VERSION), + SF, StackFrameBuilder::flags, /*isVolatile=*/true); + StoreField(B, + LoadField(B, W, WorkerBuilder::current_stack_frame, + /*isVolatile=*/true), + SF, StackFrameBuilder::call_parent, + /*isVolatile=*/true); + StoreField(B, W, SF, StackFrameBuilder::worker, /*isVolatile=*/true); + StoreField(B, SF, W, WorkerBuilder::current_stack_frame, /*isVolatile=*/true); + + B.CreateRetVoid(); + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +// /// \brief Get or create a LLVM function for __cilk_parent_prologue. +// /// It is equivalent to the following C code +// /// +// /// void __cilk_parent_prologue(__cilkrts_stack_frame *sf) { +// /// __cilkrts_enter_frame_1(sf); +// /// } +// static Function *GetCilkParentPrologue(Module &M) { +// Function *Fn = 0; + +// if (GetOrCreateFunction("__cilk_parent_prologue", M, Fn)) +// return Fn; + +// // If we get here we need to add the function body +// LLVMContext &Ctx = M.getContext(); + +// Function::arg_iterator args = Fn->arg_begin(); +// Value *SF = &*args; + +// BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn); +// IRBuilder<> B(Entry); + +// // __cilkrts_enter_frame_1(sf) +// B.CreateCall(CILKRTS_FUNC(enter_frame_1, M), SF); + +// B.CreateRetVoid(); + +// Fn->addFnAttr(Attribute::InlineHint); + +// return Fn; +// } + +/// \brief Get or create a LLVM function for __cilk_parent_epilogue. 
+/// It is equivalent to the following C code +/// +/// void __cilk_parent_epilogue(__cilkrts_stack_frame *sf) { +/// __cilkrts_pop_frame(sf); +/// if (sf->flags != CILK_FRAME_VERSION) +/// __cilkrts_leave_frame(sf); +/// } +static Function *GetCilkParentEpilogue(Module &M, bool instrument = false) { + Function *Fn = nullptr; + + if (GetOrCreateFunction("__cilk_parent_epilogue", M, Fn)) + return Fn; + + // If we get here we need to add the function body + LLVMContext &Ctx = M.getContext(); + + Function::arg_iterator args = Fn->arg_begin(); + Value *SF = &*args; + + BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", Fn), + *B1 = BasicBlock::Create(Ctx, "body", Fn), + *Exit = BasicBlock::Create(Ctx, "exit", Fn); + + // Entry + { + IRBuilder<> B(Entry); + + if (instrument) + // cilk_leave_begin + B.CreateCall(CILK_CSI_FUNC(leave_begin, M), SF); + + // __cilkrts_pop_frame(sf) + B.CreateCall(CILKRTS_FUNC(pop_frame, M), SF); + + // if (sf->flags != CILK_FRAME_VERSION) + Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Value *Cond = B.CreateICmpNE(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_VERSION)); + B.CreateCondBr(Cond, B1, Exit); + } + + // B1 + { + IRBuilder<> B(B1); + + // __cilkrts_leave_frame(sf); + B.CreateCall(CILKRTS_FUNC(leave_frame, M), SF); + B.CreateBr(Exit); + } + + // Exit + { + IRBuilder<> B(Exit); + if (instrument) + // cilk_leave_end + B.CreateCall(CILK_CSI_FUNC(leave_end, M)); + B.CreateRetVoid(); + } + + Fn->addFnAttr(Attribute::InlineHint); + + return Fn; +} + +static const StringRef stack_frame_name = "__cilkrts_sf"; +static const StringRef worker8_name = "__cilkrts_wc8"; + +// static llvm::Value *LookupStackFrame(Function &F) { +// return F.getValueSymbolTable()->lookup(stack_frame_name); +// } + +/// \brief Create the __cilkrts_stack_frame for the spawning function. 
+static AllocaInst *CreateStackFrame(Function &F) { + // assert(!LookupStackFrame(F) && "already created the stack frame"); + + LLVMContext &Ctx = F.getContext(); + const DataLayout &DL = F.getParent()->getDataLayout(); + Type *SFTy = StackFrameBuilder::get(Ctx); + + Instruction *I = F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime(); + + AllocaInst *SF = new AllocaInst(SFTy, DL.getAllocaAddrSpace(), + /*size*/nullptr, 8, + /*name*/stack_frame_name, /*insert before*/I); + if (!I) + F.getEntryBlock().getInstList().push_back(SF); + + return SF; +} + +Value* GetOrInitCilkStackFrame(Function& F, + ValueToValueMapTy &DetachCtxToStackFrame, + bool Helper = true, bool instrument = false) { + // Value* V = LookupStackFrame(F); + Value *V = DetachCtxToStackFrame[&F]; + if (V) return V; + + AllocaInst* alloc = CreateStackFrame(F); + DetachCtxToStackFrame[&F] = alloc; + BasicBlock::iterator II = F.getEntryBlock().getFirstInsertionPt(); + AllocaInst* curinst; + do { + curinst = dyn_cast(II); + II++; + } while (curinst != alloc); + Value *StackSave; + IRBuilder<> IRB(&(F.getEntryBlock()), II); + + if (instrument) { + Type *Int8PtrTy = IRB.getInt8PtrTy(); + Value *ThisFn = ConstantExpr::getBitCast(&F, Int8PtrTy); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(), + Intrinsic::returnaddress), + IRB.getInt32(0)); + StackSave = + IRB.CreateCall(Intrinsic::getDeclaration(F.getParent(), + Intrinsic::stacksave)); + if (Helper) { + Value *begin_args[3] = { alloc, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *F.getParent()), + begin_args); + } else { + Value *begin_args[4] = { IRB.getInt32(0), alloc, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *F.getParent()), begin_args); + } + } + Value *args[1] = { alloc }; + if (Helper) + IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *F.getParent()), args); + else + IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *F.getParent()), args); + /* inst->insertAfter(alloc); */ + + if (instrument) { + Value* end_args[2] = { alloc, StackSave }; + IRB.CreateCall(CILK_CSI_FUNC(enter_end, *F.getParent()), end_args); + } + + EscapeEnumerator EE(F, "cilkabi_epilogue", false); + while (IRBuilder<> *AtExit = EE.Next()) { + if (isa(AtExit->GetInsertPoint())) + AtExit->CreateCall(GetCilkParentEpilogue(*F.getParent(), instrument), + args, ""); + } + + // // The function exits are unified before lowering. + // ReturnInst *retInst = nullptr; + // for (BasicBlock &BB : F) { + // TerminatorInst* TI = BB.getTerminator(); + // if (!TI) continue; + // if (ReturnInst* RI = llvm::dyn_cast(TI)) { + // assert(!retInst && "Multiple returns found."); + // retInst = RI; + // } + // } + + // assert(retInst && "No returns found."); + // CallInst::Create(GetCilkParentEpilogue(*F.getParent(), instrument), args, "", + // retInst); + return alloc; +} + +static inline +bool makeFunctionDetachable(Function &extracted, + ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument = false) { + Module *M = extracted.getParent(); + // LLVMContext& Context = extracted.getContext(); + // const DataLayout& DL = M->getDataLayout(); + /* + __cilkrts_stack_frame sf; + __cilkrts_enter_frame_fast_1(&sf); + __cilkrts_detach(); + *x = f(y); + */ + + Value *SF = CreateStackFrame(extracted); + DetachCtxToStackFrame[&extracted] = SF; + assert(SF); + Value *args[1] = { SF }; + + // Scan function to see if it detaches. 
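GetOrInitCilkStackFrame above gives a spawning function the standard Cilk prologue and epilogue: allocate one __cilkrts_stack_frame in the entry block, enter it once, and leave it on every return found by EscapeEnumerator. At the source level the intended shape is roughly the following; this is a sketch of the effect, presupposing the Cilk runtime declarations and the __cilk_parent_epilogue helper defined above, not code this pass emits verbatim.

    void parent(void) {
      __cilkrts_stack_frame sf;          // CreateStackFrame: one alloca in the entry block
      __cilkrts_enter_frame_1(&sf);      // enter_frame_fast_1 instead when Helper is true
      /* ... original body, including spawns and syncs ... */
      __cilk_parent_epilogue(&sf);       // added in front of every return
    }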
+ bool SimpleHelper = true; + for (BasicBlock &BB : extracted) { + if (isa(BB.getTerminator())) { + SimpleHelper = false; + break; + } + } + if (!SimpleHelper) + DEBUG(dbgs() << "Detachable helper function itself detaches.\n"); + + BasicBlock::iterator II = extracted.getEntryBlock().getFirstInsertionPt(); + AllocaInst* curinst; + do { + curinst = dyn_cast(II); + II++; + } while (curinst != SF); + Value *StackSave; + IRBuilder<> IRB(&(extracted.getEntryBlock()), II); + + if (instrument) { + Type *Int8PtrTy = IRB.getInt8PtrTy(); + Value *ThisFn = ConstantExpr::getBitCast(&extracted, Int8PtrTy); + Value *ReturnAddress = + IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::returnaddress), + IRB.getInt32(0)); + StackSave = + IRB.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave)); + if (SimpleHelper) { + Value *begin_args[3] = { SF, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_helper_begin, *M), begin_args); + } else { + Value *begin_args[4] = { IRB.getInt32(0), SF, ThisFn, ReturnAddress }; + IRB.CreateCall(CILK_CSI_FUNC(enter_begin, *M), begin_args); + } + } + + if (SimpleHelper) + IRB.CreateCall(CILKRTS_FUNC(enter_frame_fast_1, *M), args); + else + IRB.CreateCall(CILKRTS_FUNC(enter_frame_1, *M), args); + + if (instrument) { + Value *end_args[2] = { SF, StackSave }; + IRB.CreateCall(CILK_CSI_FUNC(enter_end, *M), end_args); + } + + // Call __cilkrts_detach + { + if (instrument) + IRB.CreateCall(CILK_CSI_FUNC(detach_begin, *M), args); + + IRB.CreateCall(CILKRTS_FUNC(detach, *M), args); + + if (instrument) + IRB.CreateCall(CILK_CSI_FUNC(detach_end, *M)); + } + + EscapeEnumerator EE(extracted, "cilkabi_epilogue", false); + while (IRBuilder<> *AtExit = EE.Next()) { + if (isa(AtExit->GetInsertPoint())) + AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, ""); + else if (ResumeInst *RI = dyn_cast(AtExit->GetInsertPoint())) { + /* + sf.flags = sf.flags | CILK_FRAME_EXCEPTING; + sf.except_data = Exn; + */ + IRBuilder<> B(RI); + Value *Exn = AtExit->CreateExtractValue(RI->getValue(), + ArrayRef(0)); + Value *Flags = LoadField(*AtExit, SF, StackFrameBuilder::flags, + /*isVolatile=*/true); + Flags = AtExit->CreateOr(Flags, + ConstantInt::get(Flags->getType(), + CILK_FRAME_EXCEPTING)); + StoreField(*AtExit, Exn, SF, StackFrameBuilder::except_data); + /* + __cilkrts_pop_frame(&sf); + if (sf->flags) + __cilkrts_leave_frame(&sf); + */ + AtExit->CreateCall(GetCilkParentEpilogue(*M, instrument), args, ""); + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI); + } + } + + // // Handle returns + // ReturnInst* Ret = nullptr; + // for (BasicBlock &BB : extracted) { + // TerminatorInst* TI = BB.getTerminator(); + // if (!TI) continue; + // if (ReturnInst* RI = dyn_cast(TI)) { + // assert(Ret == nullptr && "Multiple return"); + // Ret = RI; + // } + // } + // assert(Ret && "No return from extract function"); + + // /* + // __cilkrts_pop_frame(&sf); + // if (sf->flags) + // __cilkrts_leave_frame(&sf); + // */ + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", Ret); + + // // Handle resumes + // for (BasicBlock &BB : extracted) { + // if (!isa(BB.getTerminator())) + // continue; + // ResumeInst *RI = cast(BB.getTerminator()); + // /* + // sf.flags = sf.flags | CILK_FRAME_EXCEPTING; + // sf.except_data = Exn; + // */ + // IRBuilder<> B(RI); + // Value *Exn = B.CreateExtractValue(RI->getValue(), ArrayRef(0)); + // Value *Flags = LoadField(B, SF, StackFrameBuilder::flags, + // /*isVolatile=*/true); + // Flags = B.CreateOr(Flags, + // 
ConstantInt::get(Flags->getType(), + // CILK_FRAME_EXCEPTING)); + // StoreField(B, Exn, SF, StackFrameBuilder::except_data); + // /* + // __cilkrts_pop_frame(&sf); + // if (sf->flags) + // __cilkrts_leave_frame(&sf); + // */ + // CallInst::Create(GetCilkParentEpilogue(*M, instrument), args, "", RI); + // } + + return true; +} + +//############################################################################## + +/// \brief Get/Create the worker count for the spawning function. +Value* llvm::cilk::GetOrCreateWorker8(Function &F) { + // Value* W8 = F.getValueSymbolTable()->lookup(worker8_name); + // if (W8) return W8; + IRBuilder<> B(F.getEntryBlock().getFirstNonPHIOrDbgOrLifetime()); + Value *P0 = B.CreateCall(CILKRTS_FUNC(get_nworkers, *F.getParent())); + Value *P8 = B.CreateMul(P0, ConstantInt::get(P0->getType(), 8), worker8_name); + return P8; +} + +void llvm::cilk::createSync(SyncInst &SI, ValueToValueMapTy &DetachCtxToStackFrame, + bool instrument) { + Function &Fn = *(SI.getParent()->getParent()); + Module &M = *(Fn.getParent()); + + Value *SF = GetOrInitCilkStackFrame(Fn, DetachCtxToStackFrame, + /*isFast*/false, instrument); + Value *args[] = { SF }; + assert( args[0] && "sync used in function without frame!" ); + CallInst *CI = CallInst::Create(GetCilkSyncFn(M, instrument), args, "", + /*insert before*/&SI); + CI->setDebugLoc(SI.getDebugLoc()); + BasicBlock *Succ = SI.getSuccessor(0); + SI.eraseFromParent(); + BranchInst::Create(Succ, CI->getParent()); +} + +bool llvm::cilk::verifyDetachedCFG(const DetachInst &Detach, DominatorTree &DT, + bool error) { + BasicBlock *Spawned = Detach.getDetached(); + BasicBlock *Continue = Detach.getContinue(); + BasicBlockEdge DetachEdge(Detach.getParent(), Spawned); + + SmallVector Todo; + SmallPtrSet functionPieces; + SmallVector WorkListEH; + Todo.push_back(Spawned); + + while (!Todo.empty()) { + BasicBlock *BB = Todo.pop_back_val(); + + if (!functionPieces.insert(BB).second) + continue; + + TerminatorInst* Term = BB->getTerminator(); + if (Term == nullptr) return false; + if (ReattachInst* Inst = dyn_cast(Term)) { + //only analyze reattaches going to the same continuation + if (Inst->getSuccessor(0) != Continue) continue; + continue; + } else if (DetachInst* Inst = dyn_cast(Term)) { + assert(Inst != &Detach && "Found recursive Detach!"); + Todo.push_back(Inst->getSuccessor(0)); + Todo.push_back(Inst->getSuccessor(1)); + continue; + } else if (SyncInst* Inst = dyn_cast(Term)) { + //only sync inner elements, consider as branch + Todo.push_back(Inst->getSuccessor(0)); + continue; + } else if (isa(Term) || isa(Term) || + isa(Term)) { + for (BasicBlock *Succ : successors(BB)) { + if (!DT.dominates(DetachEdge, Succ)) + // We assume that this block is an exception-handling block and save + // it for later processing. + WorkListEH.push_back(Succ); + else + Todo.push_back(Succ); + } + continue; + } else if (isa(Term) || isa(Term)) { + continue; + } else { + DEBUG(Term->dump()); + DEBUG(Term->getParent()->getParent()->dump()); + assert(!error && "Detached block did not absolutely terminate in reattach"); + return false; + } + } + { + SmallPtrSet Visited; + while (!WorkListEH.empty()) { + BasicBlock *BB = WorkListEH.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Make sure that the control flow through these exception-handling blocks + // cannot re-enter the blocks being outlined. 
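createSync and verifyDetachedCFG above rely on the Tapir invariants for a spawned region: it is entered only along the detach edge, every normal path through it ends in a reattach to the detach's continuation, and exceptional paths may leave the region but must not re-enter it or return. For a single spawn the corresponding source is the usual Cilk pattern; the snippet below assumes the Cilk headers and a Tapir-enabled compiler such as the one this patch targets, and f and g are arbitrary user functions.

    #include <cilk/cilk.h>

    int f(int);
    void g(void);
    int x;

    void example(int y) {
      x = cilk_spawn f(y);   // detach: the store to x lives in the detached block,
                             // which ends with a reattach to the continuation
      g();                   // continuation: may run in parallel with f(y)
      cilk_sync;             // sync: lowered by createSync to a __cilk_sync(&sf) call
    }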
+ assert(!functionPieces.count(BB) && + "EH blocks for a detached region reenter that region."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't perform an ordinary return. + assert(!isa(BB->getTerminator()) && + "EH block terminated by return."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't reattach to the detached CFG's continuation. + if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + assert(RI->getSuccessor(0) != Continue && + "Exit block reaches a reattach to the continuation."); + + for (BasicBlock *Succ : successors(BB)) + WorkListEH.push_back(Succ); + } + } + return true; +} + +bool llvm::cilk::populateDetachedCFG( + const DetachInst &Detach, DominatorTree &DT, + SmallPtrSetImpl &functionPieces, + SmallVectorImpl &reattachB, + SmallPtrSetImpl &ExitBlocks, + bool replace, bool error) { + SmallVector Todo; + SmallVector WorkListEH; + + BasicBlock *Spawned = Detach.getDetached(); + BasicBlock *Continue = Detach.getContinue(); + BasicBlockEdge DetachEdge(Detach.getParent(), Spawned); + Todo.push_back(Spawned); + + while (!Todo.empty()) { + BasicBlock *BB = Todo.pop_back_val(); + + if (!functionPieces.insert(BB).second) + continue; + + TerminatorInst *Term = BB->getTerminator(); + if (Term == nullptr) return false; + if (isa(Term)) { + // only analyze reattaches going to the same continuation + if (Term->getSuccessor(0) != Continue) continue; + if (replace) { + BranchInst* toReplace = BranchInst::Create(Continue); + ReplaceInstWithInst(Term, toReplace); + reattachB.push_back(BB); + } + continue; + } else if (isa(Term)) { + assert(Term != &Detach && "Found recursive detach!"); + Todo.push_back(Term->getSuccessor(0)); + Todo.push_back(Term->getSuccessor(1)); + continue; + } else if (isa(Term)) { + //only sync inner elements, consider as branch + Todo.push_back(Term->getSuccessor(0)); + continue; + } else if (isa(Term) || isa(Term) || + isa(Term)) { + for (BasicBlock *Succ : successors(BB)) { + if (!DT.dominates(DetachEdge, Succ)) { + // We assume that this block is an exception-handling block and save + // it for later processing. + ExitBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } else { + Todo.push_back(Succ); + } + } + // We don't bother cloning unreachable exits from the detached CFG at this + // point. We're cloning the entire detached CFG anyway when we outline + // the function. + continue; + } else if (isa(Term) || isa(Term)) { + continue; + } else { + DEBUG(Term->dump()); + DEBUG(Term->getParent()->getParent()->dump()); + assert(!error && "Detached block did not absolutely terminate in reattach"); + return false; + } + } + + // Find the exit-handling blocks. + { + SmallPtrSet Visited; + while (!WorkListEH.empty()) { + BasicBlock *BB = WorkListEH.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Make sure that the control flow through these exception-handling blocks + // cannot re-enter the blocks being outlined. + assert(!functionPieces.count(BB) && + "EH blocks for a detached region reenter that region."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't perform an ordinary return. + assert(!isa(BB->getTerminator()) && + "EH block terminated by return."); + + // Make sure that the control flow through these exception-handling blocks + // doesn't reattach to the detached CFG's continuation. 
+ if (ReattachInst *RI = dyn_cast(BB->getTerminator())) + assert(RI->getSuccessor(0) != Continue && + "Exit block reaches a reattach to the continuation."); + + // if (isa(BB-getTerminator())) + // ResumeBlocks.push_back(BB); + + for (BasicBlock *Succ : successors(BB)) { + ExitBlocks.insert(Succ); + WorkListEH.push_back(Succ); + } + } + + // Visited now contains exception-handling blocks that we want to clone as + // part of outlining. + for (BasicBlock *EHBlock : Visited) + functionPieces.insert(EHBlock); + } + + return true; +} + +//Returns true if success +Function *llvm::cilk::extractDetachBodyToFunction(DetachInst &detach, + DominatorTree &DT, + AssumptionCache &AC, + CallInst **call) { + BasicBlock *Detacher = detach.getParent(); + Function &F = *(Detacher->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + SmallPtrSet functionPieces; + SmallVector reattachB; + SmallPtrSet ExitBlocks; + + // if (!Spawned->getUniquePredecessor()) + // dbgs() << *Spawned; + assert(Spawned->getUniquePredecessor() && + "Entry block of detached CFG has multiple predecessors."); + assert(Spawned->getUniquePredecessor() == Detacher && + "Broken CFG."); + + // if (getNumPred(Spawned) > 1) { + // dbgs() << "Found multiple predecessors to a detached-CFG entry block " + // << Spawned->getName() << ".\n"; + // BasicBlock* ts = BasicBlock::Create(Spawned->getContext(), Spawned->getName()+".fx", &F, Detacher); + // IRBuilder<> b(ts); + // b.CreateBr(Spawned); + // detach.setSuccessor(0,ts); + // llvm::BasicBlock::iterator i = Spawned->begin(); + // while (auto phi = llvm::dyn_cast(i)) { + // int idx = phi->getBasicBlockIndex(detach.getParent()); + // phi->setIncomingBlock(idx, ts); + // ++i; + // } + // Spawned = ts; + // } + + if (!populateDetachedCFG(detach, DT, functionPieces, reattachB, + ExitBlocks, true)) + return nullptr; + + // functionPieces.erase(Spawned); + // std::vector blocks(functionPieces.begin(), functionPieces.end()); + // blocks.insert(blocks.begin(), Spawned); + // functionPieces.insert(Spawned); + + // Check the spawned block's predecessors. + for (BasicBlock *BB : functionPieces) { + int detached_count = 0; + if (ExitBlocks.count(BB)) + continue; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { + BasicBlock *Pred = *PI; + if (detached_count == 0 && BB == Spawned && Pred == detach.getParent()) { + detached_count = 1; + continue; + } + assert(functionPieces.count(Pred) && + "Block inside of detached context branched into from outside branch context"); + } + } + + // Get the inputs and outputs for the detached CFG. + SetVector Inputs, Outputs; + findInputsOutputs(functionPieces, Inputs, Outputs, &ExitBlocks); + // extractor.findInputsOutputs(Inputs, Outputs); + assert(Outputs.empty() && + "All results from detached CFG should be passed by memory already."); + + // Clone the detached CFG into a helper function. + ValueToValueMapTy VMap; + Function *extracted; + { + SmallVector Returns; // Ignore returns cloned. + std::vector blocks(functionPieces.begin(), functionPieces.end()); + + extracted = CreateHelper(Inputs, Outputs, blocks, + Spawned, Detacher, Continue, + VMap, F.getParent(), + F.getSubprogram() != nullptr, Returns, ".cilk", + &ExitBlocks, nullptr, nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning detached CFG."); + + // Use a fast calling convention for the helper. 
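populateDetachedCFG and extractDetachBodyToFunction above clone the detached blocks into a separate helper whose parameters are exactly the values flowing into the region (findInputsOutputs); the assertion that Outputs is empty reflects that results already leave the region through memory. For the running example used in this file's comments, *x = f(y), the outlined helper is conceptually the following; the name example_body_cilk is hypothetical (CreateHelper derives the real name with the ".cilk" suffix).

    int f(int);

    static void example_body_cilk(int *x, int y) {  // inputs become parameters
      *x = f(y);                                    // cloned detached body
    }                                               // results escape only via *x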
+ extracted->setCallingConv(CallingConv::Fast); + // extracted->setCallingConv(F.getCallingConv()); + + extracted->addFnAttr(Attribute::NoInline); + } + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(&F, Inputs, VMap, &detach, &AC, &DT); + + // Add call to new helper function in original function. + CallInst *TopCall; + { + // Create call instruction. + IRBuilder<> Builder(&detach); + TopCall = Builder.CreateCall(extracted, Inputs.getArrayRef()); + // Use a fast calling convention for the helper. + TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(extracted->getCallingConv()); + TopCall->setDebugLoc(detach.getDebugLoc()); + } + if (call) + *call = TopCall; + + // Move allocas in the newly cloned detached CFG to the entry block of the + // helper. + { + // Collect reattach instructions. + SmallVector ReattachPoints; + for (pred_iterator PI = pred_begin(Continue), PE = pred_end(Continue); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (!isa(Pred->getTerminator())) continue; + if (functionPieces.count(Pred)) + ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); + } + + // Move allocas in cloned detached block to entry of helper function. + BasicBlock *ClonedDetachedBlock = cast(VMap[Spawned]); + MoveStaticAllocasInBlock(&extracted->getEntryBlock(), ClonedDetachedBlock, + ReattachPoints); + + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because calling and returning from the helper will + // automatically manage the stack. + } + + return extracted; +} + +Function *llvm::cilk::createDetach(DetachInst &detach, + ValueToValueMapTy &DetachCtxToStackFrame, + DominatorTree &DT, AssumptionCache &AC, + bool instrument) { + BasicBlock *detB = detach.getParent(); + Function &F = *(detB->getParent()); + + BasicBlock *Spawned = detach.getDetached(); + BasicBlock *Continue = detach.getContinue(); + + Module *M = F.getParent(); + //replace with branch to succesor + //entry / cilk.spawn.savestate + Value *SF = GetOrInitCilkStackFrame(F, DetachCtxToStackFrame, + /*isFast=*/false, instrument); + // assert(SF && "null stack frame unexpected"); + + // dbgs() << *detB << *Spawned << *Continue; + + // if (!Spawned->getUniquePredecessor()) + // SplitEdge(detB, Spawned, &DT, nullptr); + + // dbgs() << *detB << *(detach.getDetached()); + + CallInst *cal = nullptr; + Function *extracted = extractDetachBodyToFunction(detach, DT, AC, &cal); + assert(extracted && "could not extract detach body to function"); + + // Unlink the detached CFG in the original function. The heavy lifting of + // removing the outlined detached-CFG is left to subsequent DCE. + BranchInst *ContinueBr; + { + // Replace the detach with a branch to the continuation. + ContinueBr = BranchInst::Create(Continue); + ReplaceInstWithInst(&detach, ContinueBr); + + // Rewrite phis in the detached block. 
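createDetach then replaces the detach in the parent with a setjmp-guarded call to that helper, so the parent can resume at the continuation if the child is stolen. Ignoring instrumentation, the rewritten call site has roughly the following shape; CILK_SETJMP, the stack frame sf, and example_body_cilk are the names used in this file's comments and in the sketch above, and the fragment presupposes their declarations rather than being compilable on its own.

    /* in the parent, where the detach used to be */
    if (!CILK_SETJMP(sf.ctx)) {        // EmitCilkSetJmp: saves FP state, frame and stack pointers
      example_body_cilk(&x, y);        // call the outlined helper (fast calling convention)
    }
    /* the continuation runs here whether or not the child was stolen;
       a later 'sync' becomes a call to __cilk_sync(&sf) */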
+ BasicBlock::iterator BI = Spawned->begin(); + while (PHINode *P = dyn_cast(BI)) { + // int j = P->getBasicBlockIndex(detB); + // assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->removeIncomingValue(detB); + ++BI; + } + } + + Value *SetJmpRes; + { + IRBuilder<> B(cal); + + if (instrument) + // cilk_spawn_prepare + B.CreateCall(CILK_CSI_FUNC(spawn_prepare, *M), SF); + + // Need to save state before spawning + SetJmpRes = EmitCilkSetJmp(B, SF, *M); + + if (instrument) + // cilk_spawn_or_continue + B.CreateCall(CILK_CSI_FUNC(spawn_or_continue, *M), SetJmpRes); + } + + // Conditionally call the new helper function based on the result of the + // setjmp. + { + BasicBlock *CallBlock = SplitBlock(detB, cal, &DT); + BasicBlock *CallCont = SplitBlock(CallBlock, + CallBlock->getTerminator(), &DT); + IRBuilder<> B(detB->getTerminator()); + SetJmpRes = B.CreateICmpEQ(SetJmpRes, + ConstantInt::get(SetJmpRes->getType(), 0)); + B.CreateCondBr(SetJmpRes, CallBlock, CallCont); + detB->getTerminator()->eraseFromParent(); + } + + makeFunctionDetachable(*extracted, DetachCtxToStackFrame, instrument); + + return extracted; +} diff --git a/llvm/lib/Transforms/Tapir/LLVMBuild.txt b/llvm/lib/Transforms/Tapir/LLVMBuild.txt new file mode 100644 index 00000000000000..9b7ec2935c92fc --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Transforms/Tapir/LLVMBuild.txt ---------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = TapirOpts +parent = Transforms +required_libraries = Analysis Core Scalar Support TransformUtils diff --git a/llvm/lib/Transforms/Tapir/LoopSpawning.cpp b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp new file mode 100644 index 00000000000000..a62e445eecf277 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LoopSpawning.cpp @@ -0,0 +1,2413 @@ +//===- LoopSpawning.cpp - Spawn loop iterations efficiently ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Modify Tapir loops to spawn their iterations efficiently. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/LoopSpawning.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/CFG.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/ValueMap.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Scalar/IndVarSimplify.h" +#include "llvm/Transforms/Scalar/SimplifyCFG.h" +#include "llvm/Transforms/Scalar/LoopDeletion.h" +#include "llvm/Transforms/Tapir.h" +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/Transforms/Utils/ValueMapper.h" +#include + +using std::make_pair; + +using namespace llvm; + +#define LS_NAME "loop-spawning" +#define DEBUG_TYPE LS_NAME + +STATISTIC(LoopsAnalyzed, "Number of Tapir loops analyzed"); +STATISTIC(LoopsConvertedToDAC, + "Number of Tapir loops converted to divide-and-conquer iteration spawning"); +STATISTIC(LoopsConvertedToCilkABI, + "Number of Tapir loops converted to use the Cilk ABI for loops"); + +namespace { +// Forward declarations. +class LoopSpawningHints; + +// /// \brief This modifies LoopAccessReport to initialize message with +// /// tapir-loop-specific part. +// class LoopSpawningReport : public LoopAccessReport { +// public: +// LoopSpawningReport(Instruction *I = nullptr) +// : LoopAccessReport("loop-spawning: ", I) {} + +// /// \brief This allows promotion of the loop-access analysis report into the +// /// loop-spawning report. It modifies the message to add the +// /// loop-spawning-specific part of the message. +// explicit LoopSpawningReport(const LoopAccessReport &R) +// : LoopAccessReport(Twine("loop-spawning: ") + R.str(), +// R.getInstr()) {} +// }; + + +/// Utility class for getting and setting loop spawning hints in the form +/// of loop metadata. +/// This class keeps a number of loop annotations locally (as member variables) +/// and can, upon request, write them back as metadata on the loop. It will +/// initially scan the loop for existing metadata, and will update the local +/// values based on information in the loop. +class LoopSpawningHints { + enum HintKind { HK_STRATEGY }; + + /// Hint - associates name and validation with the hint value. + struct Hint { + const char *Name; + unsigned Value; // This may have to change for non-numeric values. 
+ HintKind Kind; + + Hint(const char *Name, unsigned Value, HintKind Kind) + : Name(Name), Value(Value), Kind(Kind) {} + + bool validate(unsigned Val) { + switch (Kind) { + case HK_STRATEGY: + return (Val < ST_END); + } + return false; + } + }; + + /// Spawning strategy + Hint Strategy; + + /// Return the loop metadata prefix. + static StringRef Prefix() { return "tapir.loop."; } + +public: + enum SpawningStrategy { + ST_SEQ, + ST_DAC, + ST_END, + }; + + static std::string printStrategy(enum SpawningStrategy Strat) { + switch(Strat) { + case LoopSpawningHints::ST_SEQ: + return "Spawn iterations sequentially"; + case LoopSpawningHints::ST_DAC: + return "Use divide-and-conquer"; + case LoopSpawningHints::ST_END: + default: + return "Unknown"; + } + } + + LoopSpawningHints(const Loop *L, OptimizationRemarkEmitter &ORE) + : Strategy("spawn.strategy", ST_SEQ, HK_STRATEGY), + TheLoop(L), ORE(ORE) { + // Populate values with existing loop metadata. + getHintsFromMetadata(); + } + + // /// Dumps all the hint information. + // std::string emitRemark() const { + // LoopSpawningReport R; + // R << "Strategy = " << printStrategy(getStrategy()); + + // return R.str(); + // } + + enum SpawningStrategy getStrategy() const { + return (SpawningStrategy)Strategy.Value; + } + +private: + /// Find hints specified in the loop metadata and update local values. + void getHintsFromMetadata() { + MDNode *LoopID = TheLoop->getLoopID(); + if (!LoopID) + return; + + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + const MDString *S = nullptr; + SmallVector Args; + + // The expected hint is either a MDString or a MDNode with the first + // operand a MDString. + if (const MDNode *MD = dyn_cast(LoopID->getOperand(i))) { + if (!MD || MD->getNumOperands() == 0) + continue; + S = dyn_cast(MD->getOperand(0)); + for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i) + Args.push_back(MD->getOperand(i)); + } else { + S = dyn_cast(LoopID->getOperand(i)); + assert(Args.size() == 0 && "too many arguments for MDString"); + } + + if (!S) + continue; + + // Check if the hint starts with the loop metadata prefix. + StringRef Name = S->getString(); + if (Args.size() == 1) + setHint(Name, Args[0]); + } + } + + /// Checks string hint with one operand and set value if valid. + void setHint(StringRef Name, Metadata *Arg) { + if (!Name.startswith(Prefix())) + return; + Name = Name.substr(Prefix().size(), StringRef::npos); + + const ConstantInt *C = mdconst::dyn_extract(Arg); + if (!C) + return; + unsigned Val = C->getZExtValue(); + + Hint *Hints[] = {&Strategy}; + for (auto H : Hints) { + if (Name == H->Name) { + if (H->validate(Val)) + H->Value = Val; + else + DEBUG(dbgs() << LS_NAME << " ignoring invalid hint '" << + Name << "'\n"); + break; + } + } + } + + /// Create a new hint from name / value pair. + MDNode *createHintMetadata(StringRef Name, unsigned V) const { + LLVMContext &Context = TheLoop->getHeader()->getContext(); + Metadata *MDs[] = {MDString::get(Context, Name), + ConstantAsMetadata::get( + ConstantInt::get(Type::getInt32Ty(Context), V))}; + return MDNode::get(Context, MDs); + } + + /// Matches metadata with hint name. 
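getHintsFromMetadata and createHintMetadata in this class read and write loop hints whose names carry the "tapir.loop." prefix, with the strategy stored as an i32 operand (ST_SEQ == 0, ST_DAC == 1). On a loop that requests divide-and-conquer spawning, the metadata attached to the loop's latch branch looks roughly like this textual IR; the node numbers are arbitrary and only the string and the i32 payload matter.

    br i1 %cond, label %header, label %exit, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}                         ; operand 0 refers to the loop id itself
    !1 = !{!"tapir.loop.spawn.strategy", i32 1}     ; 1 == ST_DAC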
+ bool matchesHintMetadataName(MDNode *Node, ArrayRef HintTypes) { + MDString *Name = dyn_cast(Node->getOperand(0)); + if (!Name) + return false; + + for (auto H : HintTypes) + if (Name->getString().endswith(H.Name)) + return true; + return false; + } + + /// Sets current hints into loop metadata, keeping other values intact. + void writeHintsToMetadata(ArrayRef HintTypes) { + if (HintTypes.size() == 0) + return; + + // Reserve the first element to LoopID (see below). + SmallVector MDs(1); + // If the loop already has metadata, then ignore the existing operands. + MDNode *LoopID = TheLoop->getLoopID(); + if (LoopID) { + for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) { + MDNode *Node = cast(LoopID->getOperand(i)); + // If node in update list, ignore old value. + if (!matchesHintMetadataName(Node, HintTypes)) + MDs.push_back(Node); + } + } + + // Now, add the missing hints. + for (auto H : HintTypes) + MDs.push_back(createHintMetadata(Twine(Prefix(), H.Name).str(), H.Value)); + + // Replace current metadata node with new one. + LLVMContext &Context = TheLoop->getHeader()->getContext(); + MDNode *NewLoopID = MDNode::get(Context, MDs); + // Set operand 0 to refer to the loop id itself. + NewLoopID->replaceOperandWith(0, NewLoopID); + + TheLoop->setLoopID(NewLoopID); + } + + /// The loop these hints belong to. + const Loop *TheLoop; + + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; +}; + +// static void emitAnalysisDiag(const Loop *TheLoop, +// OptimizationRemarkEmitter &ORE, +// const LoopAccessReport &Message) { +// const char *Name = LS_NAME; +// LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE); +// } + +static void emitMissedWarning(Function *F, Loop *L, + const LoopSpawningHints &LH, + OptimizationRemarkEmitter *ORE) { + // ORE->emit(OptimizationRemarkMissed( + // LS_NAME, "LSHint", L->getStartLoc(), L->getHeader()) + // << "Strategy = " + // << LoopSpawningHints::printStrategy(LH.getStrategy())); + switch (LH.getStrategy()) { + case LoopSpawningHints::ST_DAC: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "failed to use divide-and-conquer loop spawning"); + break; + case LoopSpawningHints::ST_SEQ: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "SpawningDisabled", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "loop-spawning transformation disabled"); + break; + case LoopSpawningHints::ST_END: + ORE->emit(DiagnosticInfoOptimizationFailure( + DEBUG_TYPE, "FailedRequestedSpawning", + L->getStartLoc(), L->getHeader()) + << "Tapir loop not transformed: " + << "unknown loop-spawning strategy"); + break; + } +} + +/// LoopOutline serves as a base class for different variants of LoopSpawning. +/// LoopOutline implements common parts of LoopSpawning transformations, namely, +/// lifting a Tapir loop into a separate helper function. +class LoopOutline { +public: + + LoopOutline(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), AC(AC), ORE(ORE), + ExitBlock(nullptr) + { + // Use the loop latch to determine the canonical exit block for this loop. 
+ TerminatorInst *TI = OrigLoop->getLoopLatch()->getTerminator(); + if (2 != TI->getNumSuccessors()) + return; + ExitBlock = TI->getSuccessor(0); + if (ExitBlock == OrigLoop->getHeader()) + ExitBlock = TI->getSuccessor(1); + } + + virtual bool processLoop() = 0; + + virtual ~LoopOutline() {} + +protected: + PHINode* canonicalizeIVs(Type *Ty); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + void unlinkLoop(); + + /// The original loop. + Loop *OrigLoop; + + /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies + /// dynamic knowledge to simplify SCEV expressions and converts them to a + /// more usable form. + // PredicatedScalarEvolution &PSE; + ScalarEvolution &SE; + /// Loop info. + LoopInfo *LI; + /// Dominator tree. + DominatorTree *DT; + /// Assumption cache. + AssumptionCache *AC; + /// Interface to emit optimization remarks. + OptimizationRemarkEmitter &ORE; + + /// The exit block of this loop. We compute our own exit block, based on the + /// latch, and handle other exit blocks (i.e., for exception handling) in a + /// special manner. + BasicBlock *ExitBlock; + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// DACLoopSpawning implements the transformation to spawn the iterations of a +/// Tapir loop in a recursive divide-and-conquer fashion. +class DACLoopSpawning : public LoopOutline { +public: + // DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + // LoopInfo *LI, DominatorTree *DT, + // const TargetLibraryInfo *TLI, + // const TargetTransformInfo *TTI, + // OptimizationRemarkEmitter *ORE) + // : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), + // TLI(TLI), TTI(TTI), ORE(ORE) + // {} + + DACLoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~DACLoopSpawning() {} + +protected: + Value* computeGrainsize(Value *Limit); + void implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW = false, + bool CanonicalIVFlagNSW = false); + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +/// CilkABILoopSpawning uses the Cilk Plus ABI to handle Tapir loops. 
+class CilkABILoopSpawning : public LoopOutline { +public: + CilkABILoopSpawning(Loop *OrigLoop, ScalarEvolution &SE, + LoopInfo *LI, DominatorTree *DT, + AssumptionCache *AC, + OptimizationRemarkEmitter &ORE) + : LoopOutline(OrigLoop, SE, LI, DT, AC, ORE) + {} + + bool processLoop(); + + virtual ~CilkABILoopSpawning() {} + +protected: + // PHINode* canonicalizeIVs(Type *Ty); + Value* canonicalizeLoopLatch(PHINode *IV, Value *Limit); + +// private: +// /// Report an analysis message to assist the user in diagnosing loops that are +// /// not transformed. These are handled as LoopAccessReport rather than +// /// VectorizationReport because the << operator of LoopSpawningReport returns +// /// LoopAccessReport. +// void emitAnalysis(const LoopAccessReport &Message) const { +// emitAnalysisDiag(OrigLoop, *ORE, Message); +// } +}; + +struct LoopSpawningImpl { + // LoopSpawningImpl(Function &F, LoopInfo &LI, ScalarEvolution &SE, + // DominatorTree &DT, + // const TargetTransformInfo &TTI, + // const TargetLibraryInfo *TLI, + // AliasAnalysis &AA, AssumptionCache &AC, + // OptimizationRemarkEmitter &ORE) + // : F(&F), LI(&LI), SE(&SE), DT(&DT), TTI(&TTI), TLI(TLI), + // AA(&AA), AC(&AC), ORE(&ORE) {} + // LoopSpawningImpl(Function &F, + // function_ref GetLI, + // function_ref GetSE, + // function_ref GetDT, + // OptimizationRemarkEmitter &ORE) + // : F(F), GetLI(GetLI), LI(nullptr), GetSE(GetSE), GetDT(GetDT), + // ORE(ORE) + // {} + LoopSpawningImpl(Function &F, + LoopInfo &LI, + ScalarEvolution &SE, + DominatorTree &DT, + AssumptionCache &AC, + OptimizationRemarkEmitter &ORE) + : F(F), LI(LI), SE(SE), DT(DT), AC(AC), ORE(ORE) {} + + bool run(); + +private: + void addTapirLoop(Loop *L, SmallVectorImpl &V); + bool isTapirLoop(const Loop *L); + bool processLoop(Loop *L); + + Function &F; + // function_ref GetLI; + LoopInfo &LI; + // function_ref GetSE; + // function_ref GetDT; + ScalarEvolution &SE; + DominatorTree &DT; + // const TargetTransformInfo *TTI; + // const TargetLibraryInfo *TLI; + // AliasAnalysis *AA; + AssumptionCache &AC; + OptimizationRemarkEmitter &ORE; +}; +} // end anonymous namespace + +/// Canonicalize the induction variables in the loop. Return the canonical +/// induction variable created or inserted by the scalar evolution expander. +PHINode* LoopOutline::canonicalizeIVs(Type *Ty) { + Loop *L = OrigLoop; + + BasicBlock* Header = L->getHeader(); + Module* M = Header->getParent()->getParent(); + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + PHINode *CanonicalIV = Exp.getOrInsertCanonicalInductionVariable(L, Ty); + DEBUG(dbgs() << "LS Canonical induction variable " << *CanonicalIV << "\n"); + + SmallVector DeadInsts; + Exp.replaceCongruentIVs(L, DT, DeadInsts); + for (WeakTrackingVH V : DeadInsts) { + DEBUG(dbgs() << "LS erasing dead inst " << *V << "\n"); + Instruction *I = cast(V); + I->eraseFromParent(); + } + + return CanonicalIV; +} + +/// \brief Replace the latch of the loop to check that IV is always less than or +/// equal to the limit. +/// +/// This method assumes that the loop has a single loop latch. +Value* LoopOutline::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { + Loop *L = OrigLoop; + + Value *NewCondition; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "No single loop latch found for loop."); + + IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); + + // This process assumes that IV's increment is in Latch. + + // Create comparison between IV and Limit at top of Latch. 
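canonicalizeLoopLatch, together with canonicalizeIVs above, puts the loop into the shape the outliner expects: a canonical induction variable counting up from zero by one, and a latch that takes the back-edge exactly while that variable is unsigned-less-than the limit. In C terms this is roughly the same do/while form used by the divide-and-conquer doc comment later in this file; a small sketch, with body standing in for the loop body:

    void body(unsigned long i);

    void canonical_shape(unsigned long limit) {
      unsigned long iv = 0;
      do {
        body(iv);
      } while (iv++ < limit);   // back-edge taken while iv <u limit
    }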
+ NewCondition = Builder.CreateICmpULT(IV, Limit); + + // Replace the conditional branch at the end of Latch. + BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); + assert(LatchBr && LatchBr->isConditional() && + "Latch does not terminate with a conditional branch."); + Builder.SetInsertPoint(Latch->getTerminator()); + Builder.CreateCondBr(NewCondition, Header, ExitBlock); + + // Erase the old conditional branch. + Value *OldCond = LatchBr->getCondition(); + LatchBr->eraseFromParent(); + if (!OldCond->hasNUsesOrMore(1)) + if (Instruction *OldCondInst = dyn_cast(OldCond)) + OldCondInst->eraseFromParent(); + + return NewCondition; +} + +/// Unlink the specified loop, and update analysis accordingly. The heavy +/// lifting of deleting the loop is carried out by a run of LoopDeletion after +/// this pass. +void LoopOutline::unlinkLoop() { + Loop *L = OrigLoop; + + // Get components of the old loop. + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "Loop does not have a unique preheader."); + BasicBlock *Latch = L->getLoopLatch(); + + // Invalidate the analysis of the old loop. + SE.forgetLoop(L); + + // Redirect the preheader to branch directly to loop exit. + assert(1 == Preheader->getTerminator()->getNumSuccessors() && + "Preheader does not have a unique successor."); + Preheader->getTerminator()->replaceUsesOfWith(L->getHeader(), + ExitBlock); + + // Rewrite phis in the exit block to get their inputs from + // the preheader instead of the exiting block. + BasicBlock::iterator BI = ExitBlock->begin(); + while (PHINode *P = dyn_cast(BI)) { + int j = P->getBasicBlockIndex(Latch); + assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); + P->setIncomingBlock(j, Preheader); + P->removeIncomingValue(Latch); + ++BI; + } + + // Rewrite phis in the header block to not receive an input from + // the preheader. + BI = L->getHeader()->begin(); + while (PHINode *P = dyn_cast(BI)) { + P->removeIncomingValue(Preheader); + ++BI; + } +} + +/// \brief Compute the grainsize of the loop, based on the limit. +/// +/// The grainsize is computed by the following equation: +/// +/// Grainsize = min(2048, ceil(Limit / (8 * workers))) +/// +/// This computation is inserted into the preheader of the loop. +/// +/// TODO: This method is the only method that depends on the CilkABI. +/// Generalize this method for other grainsize calculations and to query TLI. +Value* DACLoopSpawning::computeGrainsize(Value *Limit) { + Loop *L = OrigLoop; + + Value *Grainsize; + BasicBlock *Preheader = L->getLoopPreheader(); + assert(Preheader && "No Preheader found for loop."); + + IRBuilder<> Builder(Preheader->getTerminator()); + + // Get 8 * workers + Value *Workers8 = Builder.CreateIntCast(cilk::GetOrCreateWorker8(*Preheader->getParent()), + Limit->getType(), false); + // Compute ceil(limit / 8 * workers) = (limit + 8 * workers - 1) / (8 * workers) + Value *SmallLoopVal = + Builder.CreateUDiv(Builder.CreateSub(Builder.CreateAdd(Limit, Workers8), + ConstantInt::get(Limit->getType(), 1)), + Workers8); + // Compute min + Value *LargeLoopVal = ConstantInt::get(Limit->getType(), 2048); + Value *Cmp = Builder.CreateICmpULT(LargeLoopVal, SmallLoopVal); + Grainsize = Builder.CreateSelect(Cmp, LargeLoopVal, SmallLoopVal); + + return Grainsize; +} + +/// \brief Method to help convertLoopToDACIterSpawn convert the Tapir +/// loop cloned into function Helper to spawn its iterations in a +/// parallel divide-and-conquer fashion. 
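computeGrainsize above emits IR for the formula in its comment, min(2048, ceil(limit / (8 * workers))), using the add-then-divide trick for the ceiling and a select for the min; the 8 * workers term comes from GetOrCreateWorker8 and __cilkrts_get_nworkers. A minimal standalone check of the same arithmetic:

    #include <cassert>
    #include <cstdint>

    // Mirrors the computation built by computeGrainsize, for illustration.
    uint64_t grainsize(uint64_t limit, uint64_t workers) {
      uint64_t w8 = 8 * workers;
      uint64_t small = (limit + w8 - 1) / w8;   // ceil(limit / (8 * workers))
      uint64_t large = 2048;                    // hard cap used by the pass
      return large < small ? large : small;     // select implementing the min
    }

    int main() {
      assert(grainsize(100, 4) == 4);           // ceil(100 / 32) == 4
      assert(grainsize(1u << 20, 4) == 2048);   // large loops are capped at 2048
      return 0;
    }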
+/// +/// Example: Suppose that Helper contains the following Tapir loop: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// spawn { ... loop body ... }; +/// } while (i++ < end); +/// sync; +/// } +/// +/// Then this method transforms Helper into the following form: +/// +/// Helper(iter_t start, iter_t end, iter_t grain, ...) { +/// recur: +/// iter_t itercount = end - start; +/// if (itercount > grain) { +/// // Invariant: itercount >= 2 +/// count_t miditer = start + itercount / 2; +/// spawn Helper(start, miditer, grain, ...); +/// start = miditer + 1; +/// goto recur; +/// } +/// +/// iter_t i = start; +/// ... Other loop setup ... +/// do { +/// ... Loop Body ... +/// } while (i++ < end); +/// sync; +/// } +/// +void DACLoopSpawning::implementDACIterSpawnOnHelper(Function *Helper, + BasicBlock *Preheader, + BasicBlock *Header, + PHINode *CanonicalIV, + Argument *Limit, + Argument *Grainsize, + Instruction *SyncRegion, + DominatorTree *DT, + LoopInfo *LI, + bool CanonicalIVFlagNUW, + bool CanonicalIVFlagNSW) { + // Serialize the cloned copy of the loop. + assert(Preheader->getParent() == Helper && + "Preheader does not belong to helper function."); + assert(Header->getParent() == Helper && + "Header does not belong to helper function."); + assert(CanonicalIV->getParent() == Header && + "CanonicalIV does not belong to header"); + assert(isa(Header->getTerminator()) && + "Cloned header is not terminated by a detach."); + DetachInst *DI = dyn_cast(Header->getTerminator()); + SerializeDetachedCFG(DI, DT); + + // Convert the cloned loop into the strip-mined loop body. + + BasicBlock *DACHead = Preheader; + if (&(Helper->getEntryBlock()) == Preheader) + // Split the entry block. We'll want to create a backedge into + // the split block later. + DACHead = SplitBlock(Preheader, Preheader->getTerminator(), DT, LI); + + BasicBlock *RecurHead, *RecurDet, *RecurCont; + Value *IterCount; + Value *CanonicalIVInput; + PHINode *CanonicalIVStart; + { + Instruction *PreheaderOrigFront = &(DACHead->front()); + IRBuilder<> Builder(PreheaderOrigFront); + // Create branch based on grainsize. + DEBUG(dbgs() << "LS CanonicalIV: " << *CanonicalIV << "\n"); + CanonicalIVInput = CanonicalIV->getIncomingValueForBlock(DACHead); + CanonicalIVStart = Builder.CreatePHI(CanonicalIV->getType(), 2, + CanonicalIV->getName()+".dac"); + CanonicalIVInput->replaceAllUsesWith(CanonicalIVStart); + IterCount = Builder.CreateSub(Limit, CanonicalIVStart, + "itercount"); + Value *IterCountCmp = Builder.CreateICmpUGT(IterCount, Grainsize); + TerminatorInst *RecurTerm = + SplitBlockAndInsertIfThen(IterCountCmp, PreheaderOrigFront, + /*Unreachable=*/false, + /*BranchWeights=*/nullptr, + DT); + RecurHead = RecurTerm->getParent(); + // Create skeleton of divide-and-conquer recursion: + // DACHead -> RecurHead -> RecurDet -> RecurCont -> DACHead + RecurDet = SplitBlock(RecurHead, RecurHead->getTerminator(), + DT, LI); + RecurCont = SplitBlock(RecurDet, RecurDet->getTerminator(), + DT, LI); + RecurCont->getTerminator()->replaceUsesOfWith(RecurTerm->getSuccessor(0), + DACHead); + } + + // Compute mid iteration in RecurHead. + Value *MidIter, *MidIterPlusOne; + { + IRBuilder<> Builder(&(RecurHead->front())); + MidIter = Builder.CreateAdd(CanonicalIVStart, + Builder.CreateLShr(IterCount, 1, + "halfcount"), + "miditer", + CanonicalIVFlagNUW, CanonicalIVFlagNSW); + } + + // Create recursive call in RecurDet. + { + // Create input array for recursive call. 
+ IRBuilder<> Builder(&(RecurDet->front())); + SetVector RecurInputs; + Function::arg_iterator AI = Helper->arg_begin(); + assert(cast(CanonicalIVInput) == &*AI && + "First argument does not match original input to canonical IV."); + RecurInputs.insert(CanonicalIVStart); + ++AI; + assert(Limit == &*AI && + "Second argument does not match original input to the loop limit."); + RecurInputs.insert(MidIter); + ++AI; + for (Function::arg_iterator AE = Helper->arg_end(); + AI != AE; ++AI) + RecurInputs.insert(&*AI); + // RecurInputs.insert(CanonicalIVStart); + // // for (PHINode *IV : IVs) + // // RecurInputs.insert(DACStart[IV]); + // RecurInputs.insert(Limit); + // RecurInputs.insert(Grainsize); + // for (Value *V : BodyInputs) + // RecurInputs.insert(VMap[V]); + DEBUG({ + dbgs() << "RecurInputs: "; + for (Value *Input : RecurInputs) + dbgs() << *Input << ", "; + dbgs() << "\n"; + }); + + // Create call instruction. + CallInst *RecurCall = Builder.CreateCall(Helper, RecurInputs.getArrayRef()); + RecurCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // Use a fast calling convention for the helper. + RecurCall->setCallingConv(CallingConv::Fast); + // RecurCall->setCallingConv(Helper->getCallingConv()); + // // Update CG graph with the recursive call we just added. + // CG[Helper]->addCalledFunction(RecurCall, CG[Helper]); + } + + // Set up continuation of detached recursive call. We effectively + // inline this tail call automatically. + { + IRBuilder<> Builder(&(RecurCont->front())); + MidIterPlusOne = Builder.CreateAdd(MidIter, + ConstantInt::get(Limit->getType(), 1), + "miditerplusone", + CanonicalIVFlagNUW, + CanonicalIVFlagNSW); + } + + // Finish setup of new phi node for canonical IV. + { + CanonicalIVStart->addIncoming(CanonicalIVInput, Preheader); + CanonicalIVStart->addIncoming(MidIterPlusOne, RecurCont); + } + + /// Make the recursive DAC parallel. + { + IRBuilder<> Builder(RecurHead->getTerminator()); + // Create the detach. + DetachInst *DI = Builder.CreateDetach(RecurDet, RecurCont, SyncRegion); + DI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurHead->getTerminator()->eraseFromParent(); + // Create the reattach. + Builder.SetInsertPoint(RecurDet->getTerminator()); + ReattachInst *RI = Builder.CreateReattach(RecurCont, SyncRegion); + RI->setDebugLoc(Header->getTerminator()->getDebugLoc()); + RecurDet->getTerminator()->eraseFromParent(); + } +} + +/// Helper routine to get all exit blocks of a loop that are unreachable. +static void getEHExits(Loop *L, const BasicBlock *DesignatedExitBlock, + SmallVectorImpl &EHExits) { + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + SmallVector WorkList; + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == DesignatedExitBlock) continue; + EHExits.push_back(Exit); + WorkList.push_back(Exit); + } + + // Traverse the CFG from these frontier blocks to find all blocks involved in + // exception-handling exit code. + SmallPtrSet Visited; + while (!WorkList.empty()) { + BasicBlock *BB = WorkList.pop_back_val(); + if (!Visited.insert(BB).second) + continue; + + // Check that the exception handling blocks do not reenter the loop. + assert(!L->contains(BB) && + "Exception handling blocks re-enter loop."); + + for (BasicBlock *Succ : successors(BB)) { + EHExits.push_back(Succ); + WorkList.push_back(Succ); + } + } +} + +/// Top-level call to convert loop to spawn its iterations in a +/// divide-and-conquer fashion. 
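+///
+/// At a high level (illustrative summary, not a normative spec): the pass
+/// (1) canonicalizes the induction variables so a single zero-based,
+/// unit-step IV controls the loop, (2) expands the trip count and the
+/// grainsize into the preheader, (3) outlines the loop into a helper
+/// Helper(start, end, grainsize, ...), and (4) rewrites the helper to spawn
+/// its iteration range recursively (see implementDACIterSpawnOnHelper above).
+/// For example, with the grainsize formula min(2048, ceil(limit / (8 * workers))),
+/// a loop of 100000000 iterations run with 8 workers gets grainsize
+/// min(2048, ceil(100000000 / 64)) = 2048, and the original loop collapses
+/// to roughly:
+///
+///   grain = computeGrainsize(limit);        // emitted in the preheader
+///   Helper(/*start=*/0, limit, grain, ...); // captured values as extra args
+///   // the old loop body is unlinked and later removed by LoopDeletion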
+bool DACLoopSpawning::processLoop() { + Loop *L = OrigLoop; + + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + + DEBUG({ + LoopBlocksDFS DFS(L); + DFS.perform(LI); + dbgs() << "Blocks in loop (from DFS):\n"; + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + dbgs() << *BB; + }); + + using namespace ore; + + // Check that this loop has a valid exit block after the latch. + if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + L->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + // Get special exits from this loop. + SmallVector EHExits; + getEHExits(L, ExitBlock, EHExits); + + // Check the exit blocks of the loop. + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (Exit->isLandingPad()) { + DEBUG({ + const LandingPadInst *LPI = Exit->getLandingPadInst(); + dbgs() << "landing pad found: " << *LPI << "\n"; + for (const User *U : LPI->users()) + dbgs() << "\tuser " << *U << "\n"; + }); + } + } + SmallPtrSet HandledExits; + for (BasicBlock *BB : EHExits) + HandledExits.insert(BB); + for (BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!HandledExits.count(Exit)) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + L->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + Function *F = Header->getParent(); + Module* M = F->getParent(); + + DEBUG(dbgs() << "LS loop header:" << *Header); + DEBUG(dbgs() << "LS loop latch:" << *Latch); + DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); + + /// Get loop limit. + const SCEV *Limit = SE.getExitCount(L, Latch); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + // PredicatedScalarEvolution PSE(SE, *L); + // const SCEV *PLimit = PSE.getExitCount(L, Latch); + // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return false; + } + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), + // Header) + // << "loop limit: " << NV("Limit", Limit)); + /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + // emitAnalysis(LoopSpawningReport() + // << "Could not get a canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return false; + } + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); + + // Remove all IV's other than CanonicalIV. + // First, check that we can do this. 
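+  // For example (sketch only; names are illustrative): given
+  //   %iv  = phi i64  [ 0, %preheader ],     [ %iv.next, %latch ]  ; canonical
+  //   %ptr = phi i64* [ %base, %preheader ], [ %ptr.next, %latch ]
+  // SCEV describes %ptr as {%base,+,8}, i.e. %base plus 8 bytes per trip, so
+  // %ptr can be re-expressed in terms of the canonical IV by the SCEVExpander
+  // below and then erased. The check here only verifies that every such phi
+  // has a computable SCEV; a phi whose evolution SCEV cannot compute blocks
+  // the transformation.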
+ bool CanRemoveIVs = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (CanonicalIV == PN) continue; + // dbgs() << "IV " << *PN; + const SCEV *S = SE.getSCEV(PN); + // dbgs() << " SCEV " << *S << "\n"; + if (SE.getCouldNotCompute() == S) { + // emitAnalysis(LoopSpawningReport(PN) + // << "Could not compute the scalar evolution.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) + << "could not compute scalar evolution of " + << NV("PHINode", PN)); + CanRemoveIVs = false; + } + } + + if (!CanRemoveIVs) { + DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n"); + return false; + } + + //////////////////////////////////////////////////////////////////////// + // We now have everything we need to extract the loop. It's time to + // do some surgery. + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + { + SmallVector IVsToRemove; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN == CanonicalIV) continue; + const SCEV *S = SE.getSCEV(PN); + DEBUG(dbgs() << "Removing the IV " << *PN << " (" << *S << ")\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "RemoveIV", PN) + << "removing the IV " + << NV("PHINode", PN)); + Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); + PN->replaceAllUsesWith(NewIV); + IVsToRemove.push_back(PN); + } + for (PHINode *PN : IVsToRemove) + PN->eraseFromParent(); + } + + // All remaining IV's should be canonical. Collect them. + // + // TODO?: We can probably adapt this loop->DAC process such that we + // don't require all IV's to be canonical. + SmallVector IVs; + bool AllCanonical = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + DEBUG({ + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr"); + assert(PNSCEV->getStart()->isZero() && + "PHINode SCEV does not start at 0"); + dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is " + << *(PNSCEV->getStepRecurrence(SE)) << "\n"; + assert(PNSCEV->getStepRecurrence(SE)->isOne() && + "PHINode SCEV step is not 1"); + }); + if (ConstantInt *C = + dyn_cast(PN->getIncomingValueForBlock(Preheader))) { + if (C->isZero()) { + DEBUG({ + if (PN != CanonicalIV) { + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + dbgs() << "Saving the canonical IV " << *PN << " (" << *PNSCEV << ")\n"; + } + }); + if (PN != CanonicalIV) + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "SaveIV", PN) + << "saving the canonical the IV " + << NV("PHINode", PN)); + IVs.push_back(PN); + } + } else { + AllCanonical = false; + DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << + "\n"); + // emitAnalysis(LoopSpawningReport(PN) + // << "Found a remaining non-canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) + << "found a remaining noncanonical IV"); + } + } + if (!AllCanonical) + return false; + + // Insert the computation for the loop limit into the Preheader. + Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), + Preheader->getTerminator()); + DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); + + // Canonicalize the loop latch. 
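+  // After canonicalization (see canonicalizeLoopLatch above) the latch ends
+  // in a single unsigned comparison of the canonical IV against the limit,
+  // roughly:
+  //   br i1 (icmp ult %iv, %limit), label %header, label %exit
+  // which lets the outlined helper later run any half-open subrange
+  // [start, end) simply by substituting a different limit operand.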
+ assert(SE.isLoopBackedgeGuardedByCond(L, ICmpInst::ICMP_ULT, + CanonicalSCEV, Limit) && + "Loop backedge is not guarded by canonical comparison with limit."); + Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + + // Insert computation of grainsize into the Preheader. + // For debugging: + // Value *GrainVar = ConstantInt::get(Limit->getType(), 2); + Value *GrainVar = computeGrainsize(LimitVar); + DEBUG(dbgs() << "GrainVar: " << *GrainVar << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "grainsize value " << *GrainVar << "\n"); + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UsingGrainsize", + // L->getStartLoc(), Header) + // << "grainsize: " << NV("Grainsize", GrainVar)); + + /// Clone the loop into a new function. + + // Get the inputs and outputs for the Loop blocks. + SetVector Inputs, Outputs; + SetVector BodyInputs, BodyOutputs; + ValueToValueMapTy VMap, InputMap; + std::vector LoopBlocks; + SmallPtrSet ExitsToSplit; + + // Get the sync region containing this Tapir loop. + const Instruction *InputSyncRegion; + { + const DetachInst *DI = cast(Header->getTerminator()); + InputSyncRegion = cast(DI->getSyncRegion()); + } + + // Add start iteration, end iteration, and grainsize to inputs. + { + LoopBlocks = L->getBlocks(); + // // Add exit blocks terminated by unreachable. There should not be any other + // // exit blocks in the loop. + // SmallSet UnreachableExits; + // for (BasicBlock *Exit : ExitBlocks) { + // if (Exit == ExitBlock) continue; + // assert(isa(Exit->getTerminator()) && + // "Found problematic exit block."); + // UnreachableExits.insert(Exit); + // } + + // Add unreachable and exception-handling exits to the set of loop blocks to + // clone. + DEBUG({ + dbgs() << "Handled exits of loop:"; + for (BasicBlock *HE : HandledExits) + dbgs() << *HE; + dbgs() << "\n"; + }); + for (BasicBlock *HE : HandledExits) + LoopBlocks.push_back(HE); + { + const DetachInst *DI = cast(Header->getTerminator()); + BasicBlockEdge DetachEdge(Header, DI->getDetached()); + for (BasicBlock *HE : HandledExits) + if (!DT || !DT->dominates(DetachEdge, HE)) + ExitsToSplit.insert(HE); + DEBUG({ + dbgs() << "Loop exits to split:"; + for (BasicBlock *ETS : ExitsToSplit) + dbgs() << *ETS; + dbgs() << "\n"; + }); + } + + // DEBUG({ + // dbgs() << "LoopBlocks: "; + // for (BasicBlock *LB : LoopBlocks) + // dbgs() << LB->getName() << "(" + // << *(LB->getTerminator()) << "), "; + // dbgs() << "\n"; + // }); + + // Get the inputs and outputs for the loop body. + { + // CodeExtractor Ext(LoopBlocks, DT); + // Ext.findInputsOutputs(BodyInputs, BodyOutputs); + SmallPtrSet Blocks; + for (BasicBlock *BB : LoopBlocks) + Blocks.insert(BB); + findInputsOutputs(Blocks, BodyInputs, BodyOutputs, &ExitsToSplit); + } + + // Add argument for start of CanonicalIV. + DEBUG({ + Value *CanonicalIVInput = + CanonicalIV->getIncomingValueForBlock(Preheader); + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + }); + Argument *StartArg = new Argument(CanonicalIV->getType(), + CanonicalIV->getName()+".start"); + Inputs.insert(StartArg); + InputMap[CanonicalIV] = StartArg; + + // Add argument for end. + // + // In the general case, the loop limit is the result of some computation + // that the pass added to the loop's preheader. In this case, the variable + // storing the loop limit is used exactly once, in the canonicalized loop + // latch. 
In this case, the pass wants to prevent outlining from passing + // the loop-limit variable as an arbitrary argument to the outlined + // function. Hence, this pass adds the loop-limit variable as an argument + // manually. + // + // There are two special cases to consider: the loop limit is a constant, or + // the loop limit is used elsewhere within the loop. To handle these two + // cases, this pass adds an explict argument for the end of the loop, to + // supports the subsequent transformation to using recursive + // divide-and-conquer. After the loop is outlined, this pass will rewrite + // the latch in the outlined loop to use this explicit argument. + // Furthermore, this pass does not prevent outliner from recognizing the + // loop limit as a potential argument to the function. + if (isa(LimitVar) || !LimitVar->hasOneUse()) { + Argument *EndArg = new Argument(LimitVar->getType(), "end"); + Inputs.insert(EndArg); + InputMap[LimitVar] = EndArg; + } else { + // If the limit var is not constant and has exactly one use, then the + // limit var is the result of some nontrivial computation, and that one + // use is the new condition inserted. + Inputs.insert(LimitVar); + InputMap[LimitVar] = LimitVar; + } + + // Add argument for grainsize. + if (isa(GrainVar)) { + Argument *GrainArg = new Argument(GrainVar->getType(), "grainsize"); + Inputs.insert(GrainArg); + InputMap[GrainVar] = GrainArg; + } else { + Inputs.insert(GrainVar); + InputMap[GrainVar] = GrainVar; + } + + // Put all of the inputs together, and clear redundant inputs from + // the set for the loop body. + SmallVector BodyInputsToRemove; + for (Value *V : BodyInputs) + if (V == InputSyncRegion) + BodyInputsToRemove.push_back(V); + else if (!Inputs.count(V)) + Inputs.insert(V); + else + BodyInputsToRemove.push_back(V); + for (Value *V : BodyInputsToRemove) + BodyInputs.remove(V); + DEBUG({ + for (Value *V : BodyInputs) + dbgs() << "Remaining body input: " << *V << "\n"; + }); + for (Value *V : BodyOutputs) + dbgs() << "EL output: " << *V << "\n"; + assert(0 == BodyOutputs.size() && + "All results from parallel loop should be passed by memory already."); + } + DEBUG({ + for (Value *V : Inputs) + dbgs() << "EL input: " << *V << "\n"; + for (Value *V : Outputs) + dbgs() << "EL output: " << *V << "\n"; + }); + + // Clone the loop blocks into a new helper function. + Function *Helper; + { + SmallVector Returns; // Ignore returns cloned. + + // LowerDbgDeclare(*(Header->getParent())); + + Helper = CreateHelper(Inputs, Outputs, LoopBlocks, + Header, Preheader, ExitBlock, + VMap, M, + F->getSubprogram() != nullptr, Returns, ".ls", + &ExitsToSplit, InputSyncRegion, + nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning loop."); + + // Use a fast calling convention for the helper. + Helper->setCallingConv(CallingConv::Fast); + // Helper->setCallingConv(Header->getParent()->getCallingConv()); + } + + // Add a sync to the helper's return. + BasicBlock *HelperHeader = cast(VMap[Header]); + { + BasicBlock *HelperExit = cast(VMap[ExitBlock]); + assert(isa(HelperExit->getTerminator())); + BasicBlock *NewHelperExit = SplitBlock(HelperExit, + HelperExit->getTerminator(), + DT, LI); + IRBuilder<> Builder(&(HelperExit->front())); + SyncInst *NewSync = Builder.CreateSync( + NewHelperExit, + cast(VMap[InputSyncRegion])); + // Set debug info of new sync to match that of terminator of the header of + // the cloned loop. 
+ NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); + HelperExit->getTerminator()->eraseFromParent(); + } + + // // Add syncs to the helper's cloned resume blocks. + // for (BasicBlock *BB : Resumes) { + // BasicBlock *HelperResume = cast(VMap[BB]); + // assert(isa(HelperResume->getTerminator())); + // BasicBlock *NewHelperResume = SplitBlock(HelperResume, + // HelperResume->getTerminator(), + // DT, LI); + // IRBuilder<> Builder(&(HelperResume->front())); + // SyncInst *NewSync = Builder.CreateSync(NewHelperResume); + // // Set debug info of new sync to match that of terminator of the header of + // // the cloned loop. + // NewSync->setDebugLoc(HelperHeader->getTerminator()->getDebugLoc()); + // HelperResume->getTerminator()->eraseFromParent(); + // } + + BasicBlock *NewPreheader = cast(VMap[Preheader]); + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + + // Rewrite the cloned IV's to start at the start iteration argument. + { + // Rewrite clone of canonical IV to start at the start iteration + // argument. + Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); + { + int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned canonical IV does not inherit a constant value from cloned preheader."); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + } + + // Rewrite other cloned IV's to start at their value at the start + // iteration. + const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { + if (CanonicalIV == IV) continue; + + // Get the value of the IV at the start iteration. + DEBUG(dbgs() << "IV " << *IV); + const SCEV *IVSCEV = SE.getSCEV(IV); + DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); + const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); + const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); + DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << + " to " << *IVAtIter << "\n"); + + // NOTE: Expanded code should not refer to other IV's. + Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), + NewPreheader->getTerminator()); + + + // Set the value that the cloned IV inherits from the cloned preheader. + PHINode *NewIV = cast(VMap[IV]); + int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned IV does not inherit a constant value from cloned preheader."); + NewIV->setIncomingValue(NewPreheaderIdx, IVStart); + } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + } + + // The loop has been outlined by this point. To handle the special cases + // where the loop limit was constant or used elsewhere within the loop, this + // pass rewrites the outlined loop-latch condition to use the explicit + // end-iteration argument. 
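+  // Concretely (illustrative): a cloned latch condition such as
+  //   %cond = icmp ult i64 %iv.next, 100      ; trip count was a constant
+  // is rewritten inside the helper to
+  //   %cond = icmp ult i64 %iv.next, %end     ; %end is the explicit argument
+  // so that each recursive invocation of the helper only covers its own
+  // [start, end) slice of the iteration space.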
+ if (isa(LimitVar) || !LimitVar->hasOneUse()) { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(((isa(LimitVar) && + HelperCond->getOperand(1) == LimitVar) || + (!LimitVar->hasOneUse() && + HelperCond->getOperand(1) == VMap[LimitVar])) && + "Unexpected condition in loop latch."); + IRBuilder<> Builder(HelperCond); + Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), + VMap[InputMap[LimitVar]]); + HelperCond->replaceAllUsesWith(NewHelperCond); + HelperCond->eraseFromParent(); + DEBUG(dbgs() << "Rewritten Latch: " << + *(cast(NewHelperCond)->getParent())); + } + + // DEBUGGING: Simply serialize the cloned loop. + // BasicBlock *NewHeader = cast(VMap[Header]); + // SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); + implementDACIterSpawnOnHelper(Helper, NewPreheader, + cast(VMap[Header]), + cast(VMap[CanonicalIV]), + cast(VMap[InputMap[LimitVar]]), + cast(VMap[InputMap[GrainVar]]), + cast(VMap[InputSyncRegion]), + /*DT=*/nullptr, /*LI=*/nullptr, + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNUW), + CanonicalSCEV->getNoWrapFlags(SCEV::FlagNSW)); + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Update allocas in cloned loop body. + { + // Collect reattach instructions. + SmallVector ReattachPoints; + for (pred_iterator PI = pred_begin(Latch), PE = pred_end(Latch); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + if (!isa(Pred->getTerminator())) continue; + if (L->contains(Pred)) + ReattachPoints.push_back(cast(VMap[Pred])->getTerminator()); + } + // The cloned loop should be serialized by this point. + BasicBlock *ClonedLoopBodyEntry = + cast(VMap[Header])->getSingleSuccessor(); + assert(ClonedLoopBodyEntry && + "Head of cloned loop body has multiple successors."); + bool ContainsDynamicAllocas = + MoveStaticAllocasInBlock(&Helper->getEntryBlock(), ClonedLoopBodyEntry, + ReattachPoints); + + // If the cloned loop contained dynamic alloca instructions, wrap the cloned + // loop with llvm.stacksave/llvm.stackrestore intrinsics. + if (ContainsDynamicAllocas) { + Module *M = Helper->getParent(); + // Get the two intrinsics we care about. + Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave); + Function *StackRestore = + Intrinsic::getDeclaration(M,Intrinsic::stackrestore); + + // Insert the llvm.stacksave. + CallInst *SavedPtr = IRBuilder<>(&*ClonedLoopBodyEntry, + ClonedLoopBodyEntry->begin()) + .CreateCall(StackSave, {}, "savedstack"); + + // Insert a call to llvm.stackrestore before the reattaches in the + // original Tapir loop. + for (Instruction *ExitPoint : ReattachPoints) + IRBuilder<>(ExitPoint).CreateCall(StackRestore, SavedPtr); + } + } + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Add alignment assumptions to arguments of helper, based on alignment of + // values in old function. + AddAlignmentAssumptions(F, Inputs, VMap, + Preheader->getTerminator(), AC, DT); + + // Add call to new helper function in original function. + { + // Setup arguments for call. + SmallVector TopCallArgs; + // Add start iteration 0. + assert(CanonicalSCEV->getStart()->isZero() && + "Canonical IV does not start at zero."); + TopCallArgs.push_back(ConstantInt::get(CanonicalIV->getType(), 0)); + // Add loop limit. + TopCallArgs.push_back(LimitVar); + // Add grainsize. + TopCallArgs.push_back(GrainVar); + // Add the rest of the arguments. 
+ for (Value *V : BodyInputs) + TopCallArgs.push_back(V); + DEBUG({ + for (Value *TCArg : TopCallArgs) + dbgs() << "Top call arg: " << *TCArg << "\n"; + }); + + // Create call instruction. + IRBuilder<> Builder(Preheader->getTerminator()); + CallInst *TopCall = Builder.CreateCall(Helper, + ArrayRef(TopCallArgs)); + + // Use a fast calling convention for the helper. + TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(Helper->getCallingConv()); + TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // // Update CG graph with the call we just added. + // CG[F]->addCalledFunction(TopCall, CG[Helper]); + } + + // Remove sync of loop in parent. + { + // Get the sync region for this loop's detached iterations. + DetachInst *HeadDetach = cast(Header->getTerminator()); + Value *SyncRegion = HeadDetach->getSyncRegion(); + // Check the Tapir instructions contained in this sync region. Look for a + // single sync instruction among those Tapir instructions. Meanwhile, + // verify that the only detach instruction in this sync region is the detach + // in theloop header. If these conditions are met, then we assume that the + // sync applies to this loop. Otherwise, something more complicated is + // going on, and we give up. + SyncInst *LoopSync = nullptr; + bool SingleSyncJustForLoop = true; + for (User *U : SyncRegion->users()) { + // Skip the detach in the loop header. + if (HeadDetach == U) continue; + // Remember the first sync instruction we find. If we find multiple sync + // instructions, then something nontrivial is going on. + if (SyncInst *SI = dyn_cast(U)) { + if (!LoopSync) + LoopSync = SI; + else + SingleSyncJustForLoop = false; + } + // If we find a detach instruction that is not the loop header's, then + // something nontrivial is going on. + if (isa(U)) + SingleSyncJustForLoop = false; + } + if (LoopSync && SingleSyncJustForLoop) + // Replace the sync with a branch. + ReplaceInstWithInst(LoopSync, + BranchInst::Create(LoopSync->getSuccessor(0))); + else if (!LoopSync) + DEBUG(dbgs() << "No sync found for this loop."); + else + DEBUG(dbgs() << "No single sync found that only affects this loop."); + } + + ++LoopsConvertedToDAC; + + unlinkLoop(); + + return Helper; +} + +/// \brief Replace the latch of the loop to check that IV is always less than or +/// equal to the limit. +/// +/// This method assumes that the loop has a single loop latch. +Value* CilkABILoopSpawning::canonicalizeLoopLatch(PHINode *IV, Value *Limit) { + Loop *L = OrigLoop; + + Value *NewCondition; + BasicBlock *Header = L->getHeader(); + BasicBlock *Latch = L->getLoopLatch(); + assert(Latch && "No single loop latch found for loop."); + + IRBuilder<> Builder(&*Latch->getFirstInsertionPt()); + + // This process assumes that IV's increment is in Latch. + + // Create comparison between IV and Limit at top of Latch. + NewCondition = + Builder.CreateICmpULT(Builder.CreateAdd(IV, + ConstantInt::get(IV->getType(), 1)), + Limit); + + // Replace the conditional branch at the end of Latch. + BranchInst *LatchBr = dyn_cast_or_null(Latch->getTerminator()); + assert(LatchBr && LatchBr->isConditional() && + "Latch does not terminate with a conditional branch."); + Builder.SetInsertPoint(Latch->getTerminator()); + Builder.CreateCondBr(NewCondition, Header, ExitBlock); + + // Erase the old conditional branch. + LatchBr->eraseFromParent(); + + return NewCondition; +} + +/// Top-level call to convert a Tapir loop to be processed using an appropriate +/// Cilk ABI call. 
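+///
+/// Unlike the divide-and-conquer variant, this path does not spawn
+/// recursively itself: it packs the loop body's remaining live-ins into a
+/// closure struct and hands the outlined body to the runtime. A sketch of
+/// the resulting call (signatures follow the Cilk Plus runtime ABI; the exact
+/// types are simplified here):
+///
+///   // generated helper: runs iterations [low, high) against the closure
+///   void helper(void *closure, uint32_t low, uint32_t high);
+///   ...
+///   __cilkrts_cilk_for_32(helper, &closure, limit, /*grain=*/0);
+///
+/// with __cilkrts_cilk_for_64 used instead when the trip count does not fit
+/// in 32 bits; a grain of 0 lets the runtime choose the grainsize.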
+bool CilkABILoopSpawning::processLoop() { + Loop *L = OrigLoop; + + BasicBlock *Header = L->getHeader(); + BasicBlock *Preheader = L->getLoopPreheader(); + BasicBlock *Latch = L->getLoopLatch(); + + using namespace ore; + + // Check the exit blocks of the loop. + if (!ExitBlock) { + DEBUG(dbgs() << "LS loop does not contain valid exit block after latch.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "InvalidLatchExit", + L->getStartLoc(), + Header) + << "invalid latch exit"); + return false; + } + + SmallVector ExitBlocks; + L->getExitBlocks(ExitBlocks); + for (const BasicBlock *Exit : ExitBlocks) { + if (Exit == ExitBlock) continue; + if (!isa(Exit->getTerminator())) { + DEBUG(dbgs() << "LS loop contains a bad exit block " << *Exit); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "BadExit", + L->getStartLoc(), + Header) + << "bad exit block found"); + return false; + } + } + + Function *F = Header->getParent(); + Module* M = F->getParent(); + + DEBUG(dbgs() << "LS loop header:" << *Header); + DEBUG(dbgs() << "LS loop latch:" << *Latch); + + // DEBUG(dbgs() << "LS SE backedge taken count: " << *(SE.getBackedgeTakenCount(L)) << "\n"); + // DEBUG(dbgs() << "LS SE max backedge taken count: " << *(SE.getMaxBackedgeTakenCount(L)) << "\n"); + DEBUG(dbgs() << "LS SE exit count: " << *(SE.getExitCount(L, Latch)) << "\n"); + + /// Get loop limit. + const SCEV *BETC = SE.getExitCount(L, Latch); + const SCEV *Limit = SE.getAddExpr(BETC, SE.getOne(BETC->getType())); + DEBUG(dbgs() << "LS Loop limit: " << *Limit << "\n"); + // PredicatedScalarEvolution PSE(SE, *L); + // const SCEV *PLimit = PSE.getExitCount(L, Latch); + // DEBUG(dbgs() << "LS predicated loop limit: " << *PLimit << "\n"); + // emitAnalysis(LoopSpawningReport() + // << "computed loop limit " << *Limit << "\n"); + if (SE.getCouldNotCompute() == Limit) { + DEBUG(dbgs() << "SE could not compute loop limit.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "UnknownLoopLimit", + L->getStartLoc(), + Header) + << "could not compute limit"); + return false; + } + // ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "LoopLimit", L->getStartLoc(), + // Header) + // << "loop limit: " << NV("Limit", Limit)); + /// Clean up the loop's induction variables. + PHINode *CanonicalIV = canonicalizeIVs(Limit->getType()); + if (!CanonicalIV) { + DEBUG(dbgs() << "Could not get canonical IV.\n"); + // emitAnalysis(LoopSpawningReport() + // << "Could not get a canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoCanonicalIV", + L->getStartLoc(), + Header) + << "could not find or create canonical IV"); + return false; + } + const SCEVAddRecExpr *CanonicalSCEV = + cast(SE.getSCEV(CanonicalIV)); + + // Remove all IV's other can CanonicalIV. + // First, check that we can do this. 
+ bool CanRemoveIVs = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (CanonicalIV == PN) continue; + // dbgs() << "IV " << *PN; + const SCEV *S = SE.getSCEV(PN); + // dbgs() << " SCEV " << *S << "\n"; + if (SE.getCouldNotCompute() == S) { + // emitAnalysis(LoopSpawningReport(PN) + // << "Could not compute the scalar evolution.\n"); + ORE.emit(OptimizationRemarkAnalysis(LS_NAME, "NoSCEV", PN) + << "could not compute scalar evolution of " + << NV("PHINode", PN)); + CanRemoveIVs = false; + } + } + + if (!CanRemoveIVs) { + DEBUG(dbgs() << "Could not compute scalar evolutions for all IV's.\n"); + return false; + } + + //////////////////////////////////////////////////////////////////////// + // We now have everything we need to extract the loop. It's time to + // do some surgery. + + SCEVExpander Exp(SE, M->getDataLayout(), "ls"); + + // Remove the IV's (other than CanonicalIV) and replace them with + // their stronger forms. + // + // TODO?: We can probably adapt this process such that we don't require all + // IV's to be canonical. + { + SmallVector IVsToRemove; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + if (PN == CanonicalIV) continue; + const SCEV *S = SE.getSCEV(PN); + Value *NewIV = Exp.expandCodeFor(S, S->getType(), CanonicalIV); + PN->replaceAllUsesWith(NewIV); + IVsToRemove.push_back(PN); + } + for (PHINode *PN : IVsToRemove) + PN->eraseFromParent(); + } + + // All remaining IV's should be canonical. Collect them. + // + // TODO?: We can probably adapt this process such that we don't require all + // IV's to be canonical. + SmallVector IVs; + bool AllCanonical = true; + for (BasicBlock::iterator II = Header->begin(); isa(II); ++II) { + PHINode *PN = cast(II); + DEBUG({ + const SCEVAddRecExpr *PNSCEV = + dyn_cast(SE.getSCEV(PN)); + assert(PNSCEV && "PHINode did not have corresponding SCEVAddRecExpr"); + assert(PNSCEV->getStart()->isZero() && + "PHINode SCEV does not start at 0"); + dbgs() << "LS step recurrence for SCEV " << *PNSCEV << " is " + << *(PNSCEV->getStepRecurrence(SE)) << "\n"; + assert(PNSCEV->getStepRecurrence(SE)->isOne() && + "PHINode SCEV step is not 1"); + }); + if (ConstantInt *C = + dyn_cast(PN->getIncomingValueForBlock(Preheader))) { + if (C->isZero()) + IVs.push_back(PN); + } else { + AllCanonical = false; + DEBUG(dbgs() << "Remaining non-canonical PHI Node found: " << *PN << "\n"); + // emitAnalysis(LoopSpawningReport(PN) + // << "Found a remaining non-canonical IV.\n"); + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "NonCanonicalIV", PN) + << "found a remaining noncanonical IV"); + } + } + if (!AllCanonical) + return false; + + // Insert the computation for the loop limit into the Preheader. + Value *LimitVar = Exp.expandCodeFor(Limit, Limit->getType(), + Preheader->getTerminator()); + DEBUG(dbgs() << "LimitVar: " << *LimitVar << "\n"); + + // Canonicalize the loop latch. + Value *NewCond = canonicalizeLoopLatch(CanonicalIV, LimitVar); + + /// Clone the loop into a new function. + + // Get the inputs and outputs for the Loop blocks. + SetVector Inputs, Outputs; + SetVector BodyInputs, BodyOutputs; + ValueToValueMapTy VMap, InputMap; + std::vector LoopBlocks; + AllocaInst* closure; + // Add start iteration, end iteration, and grainsize to inputs. + { + LoopBlocks = L->getBlocks(); + // // Add exit blocks terminated by unreachable. There should not be any other + // // exit blocks in the loop. 
+ // SmallSet UnreachableExits; + // for (BasicBlock *Exit : ExitBlocks) { + // if (Exit == ExitBlock) continue; + // assert(isa(Exit->getTerminator()) && + // "Found problematic exit block."); + // UnreachableExits.insert(Exit); + // } + + // // Add unreachable and exception-handling exits to the set of loop blocks to + // // clone. + // for (BasicBlock *BB : UnreachableExits) + // LoopBlocks.push_back(BB); + // for (BasicBlock *BB : EHExits) + // LoopBlocks.push_back(BB); + + // DEBUG({ + // dbgs() << "LoopBlocks: "; + // for (BasicBlock *LB : LoopBlocks) + // dbgs() << LB->getName() << "(" + // << *(LB->getTerminator()) << "), "; + // dbgs() << "\n"; + // }); + + // Get the inputs and outputs for the loop body. + { + // CodeExtractor Ext(LoopBlocks, DT); + // Ext.findInputsOutputs(BodyInputs, BodyOutputs); + SmallPtrSet Blocks; + for (BasicBlock *BB : LoopBlocks) + Blocks.insert(BB); + findInputsOutputs(Blocks, BodyInputs, BodyOutputs); + } + + // Add argument for start of CanonicalIV. + DEBUG({ + Value *CanonicalIVInput = + CanonicalIV->getIncomingValueForBlock(Preheader); + // CanonicalIVInput should be the constant 0. + assert(isa(CanonicalIVInput) && + "Input to canonical IV from preheader is not constant."); + }); + Argument *StartArg = new Argument(CanonicalIV->getType(), + CanonicalIV->getName()+".start"); + Inputs.insert(StartArg); + InputMap[CanonicalIV] = StartArg; + + // Add argument for end. + Value* ea; + if (isa(LimitVar)) { + Argument *EndArg = new Argument(LimitVar->getType(), "end"); + Inputs.insert(EndArg); + ea = InputMap[LimitVar] = EndArg; + } else { + Inputs.insert(LimitVar); + ea = InputMap[LimitVar] = LimitVar; + } + + // Put all of the inputs together, and clear redundant inputs from + // the set for the loop body. + SmallVector BodyInputsToRemove; + SmallVector StructInputs; + SmallVector StructIT; + for (Value *V : BodyInputs) { + if (!Inputs.count(V)) { + StructInputs.push_back(V); + StructIT.push_back(V->getType()); + } + else + BodyInputsToRemove.push_back(V); + } + StructType* ST = StructType::create(StructIT); + IRBuilder<> B(L->getLoopPreheader()->getTerminator()); + IRBuilder<> B2(L->getHeader()->getFirstNonPHIOrDbgOrLifetime()); + closure = B.CreateAlloca(ST); + for(unsigned i=0; iuse_begin(), E = StructInputs[i]->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + if (Usr && !L->contains(Usr->getParent())) + continue; + U.set(l2); + } + } + Inputs.insert(closure); + //llvm::errs() << "\n"; + //for(auto& a : Inputs) a->dump(); + //llvm::errs() << "\n"; + //StartArg->dump(); + //ea->dump(); + Inputs.remove(StartArg); + Inputs.insert(StartArg); + Inputs.remove(ea); + Inputs.insert(ea); + //llvm::errs() << "\n"; + //for(auto& a : Inputs) a->dump(); + //llvm::errs() << "\n"; + for (Value *V : BodyInputsToRemove) + BodyInputs.remove(V); + assert(0 == BodyOutputs.size() && + "All results from parallel loop should be passed by memory already."); + } + DEBUG({ + for (Value *V : Inputs) + dbgs() << "EL input: " << *V << "\n"; + for (Value *V : Outputs) + dbgs() << "EL output: " << *V << "\n"; + }); + + + Function *Helper; + { + SmallVector Returns; // Ignore returns cloned. 
+ + // LowerDbgDeclare(*(Header->getParent())); + + Helper = CreateHelper(Inputs, Outputs, L->getBlocks(), + Header, Preheader, ExitBlock/*L->getExitBlock()*/, + VMap, M, + F->getSubprogram() != nullptr, Returns, ".ls", + nullptr, nullptr, nullptr); + + assert(Returns.empty() && "Returns cloned when cloning loop."); + + // Use a fast calling convention for the helper. + //Helper->setCallingConv(CallingConv::Fast); + // Helper->setCallingConv(Header->getParent()->getCallingConv()); + } + + BasicBlock *NewPreheader = cast(VMap[Preheader]); + PHINode *NewCanonicalIV = cast(VMap[CanonicalIV]); + + // Rewrite the cloned IV's to start at the start iteration argument. + { + // Rewrite clone of canonical IV to start at the start iteration + // argument. + Argument *NewCanonicalIVStart = cast(VMap[InputMap[CanonicalIV]]); + { + int NewPreheaderIdx = NewCanonicalIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewCanonicalIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned canonical IV does not inherit a constant value from cloned preheader."); + NewCanonicalIV->setIncomingValue(NewPreheaderIdx, NewCanonicalIVStart); + } + + // Rewrite other cloned IV's to start at their value at the start + // iteration. + const SCEV *StartIterSCEV = SE.getSCEV(NewCanonicalIVStart); + DEBUG(dbgs() << "StartIterSCEV: " << *StartIterSCEV << "\n"); + for (PHINode *IV : IVs) { + if (CanonicalIV == IV) continue; + + // Get the value of the IV at the start iteration. + DEBUG(dbgs() << "IV " << *IV); + const SCEV *IVSCEV = SE.getSCEV(IV); + DEBUG(dbgs() << " (SCEV " << *IVSCEV << ")"); + const SCEVAddRecExpr *IVSCEVAddRec = cast(IVSCEV); + const SCEV *IVAtIter = IVSCEVAddRec->evaluateAtIteration(StartIterSCEV, SE); + DEBUG(dbgs() << " expands at iter " << *StartIterSCEV << + " to " << *IVAtIter << "\n"); + + // NOTE: Expanded code should not refer to other IV's. + Value *IVStart = Exp.expandCodeFor(IVAtIter, IVAtIter->getType(), + NewPreheader->getTerminator()); + + + // Set the value that the cloned IV inherits from the cloned preheader. + PHINode *NewIV = cast(VMap[IV]); + int NewPreheaderIdx = NewIV->getBasicBlockIndex(NewPreheader); + assert(isa(NewIV->getIncomingValue(NewPreheaderIdx)) && + "Cloned IV does not inherit a constant value from cloned preheader."); + NewIV->setIncomingValue(NewPreheaderIdx, IVStart); + } + + // Remap the newly added instructions in the new preheader to use + // values local to the helper. + for (Instruction &II : *NewPreheader) + RemapInstruction(&II, VMap, RF_IgnoreMissingLocals, + /*TypeMapper=*/nullptr, /*Materializer=*/nullptr); + } + + // If the loop limit is constant, then rewrite the loop latch + // condition to use the end-iteration argument. 
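+  // As in the divide-and-conquer path, a constant trip count would otherwise
+  // be baked into the cloned latch; swapping it for the "end" argument lets
+  // the runtime drive the helper over arbitrary [low, high) slices.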
+ if (isa(LimitVar)) { + CmpInst *HelperCond = cast(VMap[NewCond]); + assert(HelperCond->getOperand(1) == LimitVar); + IRBuilder<> Builder(HelperCond); + Value *NewHelperCond = Builder.CreateICmpULT(HelperCond->getOperand(0), + VMap[InputMap[LimitVar]]); + HelperCond->replaceAllUsesWith(NewHelperCond); + HelperCond->eraseFromParent(); + } + + // For debugging: + BasicBlock *NewHeader = cast(VMap[Header]); + SerializeDetachedCFG(cast(NewHeader->getTerminator()), nullptr); + { + Value* v = &*Helper->arg_begin(); + auto UI = v->use_begin(), E = v->use_end(); + for (; UI != E;) { + Use &U = *UI; + ++UI; + auto *Usr = dyn_cast(U.getUser()); + Usr->moveBefore(Helper->getEntryBlock().getTerminator()); + + auto UI2 = Usr->use_begin(), E2 = Usr->use_end(); + for (; UI2 != E2;) { + Use &U2 = *UI2; + ++UI2; + auto *Usr2 = dyn_cast(U2.getUser()); + Usr2->moveBefore(Helper->getEntryBlock().getTerminator()); + } + } + } + + if (verifyFunction(*Helper, &dbgs())) + return false; + + // Add call to new helper function in original function. + { + // Setup arguments for call. + SetVector TopCallArgs; + // Add start iteration 0. + assert(CanonicalSCEV->getStart()->isZero() && + "Canonical IV does not start at zero."); + TopCallArgs.insert(ConstantInt::get(CanonicalIV->getType(), 0)); + // Add loop limit. + TopCallArgs.insert(LimitVar); + // Add grainsize. + //TopCallArgs.insert(GrainVar); + // Add the rest of the arguments. + for (Value *V : BodyInputs) + TopCallArgs.insert(V); + + // Create call instruction. + IRBuilder<> Builder(Preheader->getTerminator()); + + llvm::Function* F; + if( ((llvm::IntegerType*)LimitVar->getType())->getBitWidth() == 32 ) + F = CILKRTS_FUNC(cilk_for_32, *M); + else { + assert( ((llvm::IntegerType*)LimitVar->getType())->getBitWidth() == 64 ); + F = CILKRTS_FUNC(cilk_for_64, *M); + } + llvm::Value* args[] = { + Builder.CreatePointerCast(Helper, F->getFunctionType()->getParamType(0)), + Builder.CreatePointerCast(closure, F->getFunctionType()->getParamType(1)), + LimitVar, + ConstantInt::get(IntegerType::get(F->getContext(), sizeof(int)*8),0) + }; + + /*CallInst *TopCall = */Builder.CreateCall(F, args); + + // Use a fast calling convention for the helper. + //TopCall->setCallingConv(CallingConv::Fast); + // TopCall->setCallingConv(Helper->getCallingConv()); + //TopCall->setDebugLoc(Header->getTerminator()->getDebugLoc()); + // // Update CG graph with the call we just added. + // CG[F]->addCalledFunction(TopCall, CG[Helper]); + } + + ++LoopsConvertedToCilkABI; + + unlinkLoop(); + + return Helper; +} + +/// Checks if this loop is a Tapir loop. Right now we check that the loop is +/// in a canonical form: +/// 1) The header detaches the body. +/// 2) The loop contains a single latch. +/// 3) The body reattaches to the latch (which is necessary for a valid +/// detached CFG). +/// 4) The loop only branches to the exit block from the header or the latch. +bool LoopSpawningImpl::isTapirLoop(const Loop *L) { + const BasicBlock *Header = L->getHeader(); + const BasicBlock *Latch = L->getLoopLatch(); + // const BasicBlock *Exit = L->getExitBlock(); + + // DEBUG(dbgs() << "LS checking if Tapir loop: " << *L); + + // Header must be terminated by a detach. + if (!isa(Header->getTerminator())) { + DEBUG(dbgs() << "LS loop header is not terminated by a detach: " << *L << "\n"); + return false; + } + + // Loop must have a unique latch. 
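+  // The canonical shape being checked looks roughly like this (sketch, using
+  // Tapir's detach/reattach terminators):
+  //
+  //   header:  detach within %syncreg, label %body, label %latch
+  //   body:    ...                          ; detached loop body
+  //            reattach within %syncreg, label %latch
+  //   latch:   %iv.next = add %iv, 1
+  //            br i1 %cond, label %header, label %exit
+  //
+  // The latch is both the continuation of the header's detach and the sole
+  // source of the backedge, so it must exist and be unique: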
+ if (nullptr == Latch) { + DEBUG(dbgs() << "LS loop does not have a unique latch: " << *L << "\n"); + return false; + } + + // // Loop must have a unique exit block. + // if (nullptr == Exit) { + // DEBUG(dbgs() << "LS loop does not have a unique exit block: " << *L << "\n"); + // SmallVector ExitBlocks; + // L->getUniqueExitBlocks(ExitBlocks); + // for (BasicBlock *Exit : ExitBlocks) + // DEBUG(dbgs() << *Exit); + // return false; + // } + + // Continuation of header terminator must be the latch. + const DetachInst *HeaderDetach = cast(Header->getTerminator()); + const BasicBlock *Continuation = HeaderDetach->getContinue(); + if (Continuation != Latch) { + DEBUG(dbgs() << "LS continuation of detach in header is not the latch: " + << *L << "\n"); + return false; + } + + // All other predecessors of Latch are terminated by reattach instructions. + for (auto PI = pred_begin(Latch), PE = pred_end(Latch); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + if (Header == Pred) continue; + if (!isa(Pred->getTerminator())) { + DEBUG(dbgs() << "LS Latch has a predecessor that is not terminated " + << "by a reattach: " << *L << "\n"); + return false; + } + } + + // Get the exit block from Latch. + BasicBlock *Exit = Latch->getTerminator()->getSuccessor(0); + if (Header == Exit) + Exit = Latch->getTerminator()->getSuccessor(1); + + // The only predecessors of Exit inside the loop are Header and Latch. + for (auto PI = pred_begin(Exit), PE = pred_end(Exit); PI != PE; ++PI) { + const BasicBlock *Pred = *PI; + if (!L->contains(Pred)) + continue; + if (Header != Pred && Latch != Pred) { + DEBUG(dbgs() << "LS Loop branches to exit block from a block " + << "other than the header or latch" << *L << "\n"); + return false; + } + } + + return true; +} + +/// This routine recursively examines all descendants of the specified loop and +/// adds all Tapir loops in that tree to the vector. This routine performs a +/// pre-order traversal of the tree of loops and pushes each Tapir loop found +/// onto the end of the vector. +void LoopSpawningImpl::addTapirLoop(Loop *L, SmallVectorImpl &V) { + if (isTapirLoop(L)) { + V.push_back(L); + return; + } + + LoopSpawningHints Hints(L, ORE); + + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + + using namespace ore; + + if (LoopSpawningHints::ST_SEQ != Hints.getStrategy()) { + DEBUG(dbgs() << "LS: Marked loop is not a valid Tapir loop.\n" + << "\tLoop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NotTapir", + L->getStartLoc(), L->getHeader()) + << "marked loop is not a valid Tapir loop"); + } + + for (Loop *InnerL : *L) + addTapirLoop(InnerL, V); +} + +#ifndef NDEBUG +/// \return string containing a file name and a line # for the given loop. +static std::string getDebugLocString(const Loop *L) { + std::string Result; + if (L) { + raw_string_ostream OS(Result); + if (const DebugLoc LoopDbgLoc = L->getStartLoc()) + LoopDbgLoc.print(OS); + else + // Just print the module name. + OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier(); + OS.flush(); + } + return Result; +} +#endif + +bool LoopSpawningImpl::run() { + // Build up a worklist of inner-loops to vectorize. This is necessary as + // the act of vectorizing or partially unrolling a loop creates new loops + // and can invalidate iterators across the loops. 
+ SmallVector Worklist; + + // Examine all top-level loops in this function, and call addTapirLoop to push + // those loops onto the work list. + for (Loop *L : LI) + addTapirLoop(L, Worklist); + + LoopsAnalyzed += Worklist.size(); + + // Now walk the identified inner loops. + bool Changed = false; + while (!Worklist.empty()) + // Process the work list of loops backwards. For each tree of loops in this + // function, addTapirLoop pushed those loops onto the work list according to + // a pre-order tree traversal. Therefore, processing the work list + // backwards leads us to process innermost loops first. + Changed |= processLoop(Worklist.pop_back_val()); + + // Process each loop nest in the function. + return Changed; +} + +// Top-level routine to process a given loop. +bool LoopSpawningImpl::processLoop(Loop *L) { +#ifndef NDEBUG + const std::string DebugLocStr = getDebugLocString(L); +#endif /* NDEBUG */ + + // Function containing loop + Function *F = L->getHeader()->getParent(); + + DEBUG(dbgs() << "\nLS: Checking a Tapir loop in \"" + << L->getHeader()->getParent()->getName() << "\" from " + << DebugLocStr << ": " << *L << "\n"); + + LoopSpawningHints Hints(L, ORE); + + DEBUG(dbgs() << "LS: Loop hints:" + << " strategy = " << Hints.printStrategy(Hints.getStrategy()) + << "\n"); + + using namespace ore; + + // Get the loop preheader. LoopSimplify should guarantee that the loop + // preheader is not terminated by a sync. + BasicBlock *Preheader = L->getLoopPreheader(); + if (!Preheader) { + DEBUG(dbgs() << "LS: Loop lacks a preheader.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoPreheader", + L->getStartLoc(), L->getHeader()) + << "loop lacks a preheader"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } else if (!isa(Preheader->getTerminator())) { + DEBUG(dbgs() << "LS: Loop preheader is not terminated by a branch.\n"); + ORE.emit(OptimizationRemarkMissed(LS_NAME, "ComplexPreheader", + L->getStartLoc(), L->getHeader()) + << "loop preheader not terminated by a branch"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + + switch(Hints.getStrategy()) { + case LoopSpawningHints::ST_SEQ: + DEBUG(dbgs() << "LS: Hints dictate sequential spawning.\n"); + break; + case LoopSpawningHints::ST_DAC: + DEBUG(dbgs() << "LS: Hints dictate DAC spawning.\n"); + { + DebugLoc DLoc = L->getStartLoc(); + BasicBlock *Header = L->getHeader(); + DACLoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // CilkABILoopSpawning DLS(L, SE, &LI, &DT, &AC, ORE); + // DACLoopSpawning DLS(L, SE, LI, DT, TLI, TTI, ORE); + if (DLS.processLoop()) { + DEBUG({ + if (verifyFunction(*L->getHeader()->getParent())) { + dbgs() << "Transformed function is invalid.\n"; + return false; + } + }); + // Report success. + ORE.emit(OptimizationRemark(LS_NAME, "DACSpawning", DLoc, Header) + << "spawning iterations using divide-and-conquer"); + return true; + } else { + // Report failure. + ORE.emit(OptimizationRemarkMissed(LS_NAME, "NoDACSpawning", DLoc, + Header) + << "cannot spawn iterations using divide-and-conquer"); + emitMissedWarning(F, L, Hints, &ORE); + return false; + } + } + break; + case LoopSpawningHints::ST_END: + dbgs() << "LS: Hints specify unknown spawning strategy.\n"; + break; + } + return false; +} + +// PreservedAnalyses LoopSpawningPass::run(Module &M, ModuleAnalysisManager &AM) { +// // Find functions that detach for processing. 
+// SmallVector WorkList; +// for (Function &F : M) +// for (BasicBlock &BB : F) +// if (isa(BB.getTerminator())) +// WorkList.push_back(&F); + +// if (WorkList.empty()) +// return PreservedAnalyses::all(); + +// bool Changed = false; +// while (!WorkList.empty()) { +// Function *F = WorkList.back(); +// auto &TLI = AM.getResult(M); +// auto &FAM = AM.getResult(M).getManager(); +// auto &LI = FAM.getResult(*F); +// auto &SE = FAM.getResult(*F); +// auto &DT = FAM.getResult(*F); +// auto &TTI = FAM.getResult(*F); +// auto &AA = FAM.getResult(*F); +// auto &AC = FAM.getResult(*F); +// auto &ORE = FAM.getResult(*F); +// LoopSpawningImpl Impl(*F, LI, SE, DT, TTI, &TLI, AA, AC, ORE); +// Changed |= Impl.run(); +// WorkList.pop_back(); +// } + +// if (Changed) +// return PreservedAnalyses::none(); +// return PreservedAnalyses::all(); +// } + +PreservedAnalyses LoopSpawningPass::run(Function &F, + FunctionAnalysisManager &AM) { + // Determine if function detaches. + bool DetachingFunction = false; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + DetachingFunction = true; + + if (!DetachingFunction) + return PreservedAnalyses::all(); + + auto &LI = AM.getResult(F); + auto &SE = AM.getResult(F); + auto &DT = AM.getResult(F); + // auto &TTI = AM.getResult(F); + // auto &TLI = AM.getResult(M); + // auto &AA = AM.getResult(F); + auto &AC = AM.getResult(F); + auto &ORE = + AM.getResult(F); + // OptimizationRemarkEmitter ORE(F); + + bool Changed = LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run(); + + AM.invalidate(F); + + if (Changed) + return PreservedAnalyses::none(); + return PreservedAnalyses::all(); +} + +namespace { +struct LoopSpawning : public FunctionPass { + /// Pass identification, replacement for typeid + static char ID; + + explicit LoopSpawning() : FunctionPass(ID) { + initializeLoopSpawningPass(*PassRegistry::getPassRegistry()); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + bool DetachingFunction = false; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + DetachingFunction = true; + + if (!DetachingFunction) + return false; + + auto &LI = getAnalysis().getLoopInfo(); + auto &SE = getAnalysis().getSE(); + auto &DT = getAnalysis().getDomTree(); + // auto *TTI = &getAnalysis().getTTI(*F); + // auto *TLIP = getAnalysisIfAvailable(); + // auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + // auto *TLI = &getAnalysis().getTLI(); + // auto *AA = &getAnalysis(*F).getAAResults(); + auto &AC = getAnalysis().getAssumptionCache(F); + auto &ORE = + getAnalysis().getORE(); + // OptimizationRemarkEmitter ORE(F); + return LoopSpawningImpl(F, LI, SE, DT, AC, ORE).run(); + } + + // bool runOnModule(Module &M) override { + // if (skipModule(M)) + // return false; + + // // Find functions that detach for processing. + // SmallVector WorkList; + // for (Function &F : M) + // for (BasicBlock &BB : F) + // if (isa(BB.getTerminator())) + // WorkList.push_back(&F); + + // if (WorkList.empty()) + // return false; + + // auto GetLI = [this](Function &F) -> LoopInfo & { + // return getAnalysis(F).getLoopInfo(); + // }; + // auto GetSE = [this](Function &F) -> ScalarEvolution & { + // return getAnalysis(F).getSE(); + // }; + // auto GetDT = [this](Function &F) -> DominatorTree & { + // return this->getAnalysis(F).getDomTree(); + // }; + + // bool Changed = false; + // while (!WorkList.empty()) { + // // Process the next function. 
+ // Function *F = WorkList.back(); + // // auto *LI = &getAnalysis(*F).getLoopInfo(); + // // auto *SE = &getAnalysis(*F).getSE(); + // // auto *DT = &getAnalysis(*F).getDomTree(); + // // auto *TTI = &getAnalysis().getTTI(*F); + // // auto *TLIP = getAnalysisIfAvailable(); + // // auto *TLI = TLIP ? &TLIP->getTLI() : nullptr; + // // auto *TLI = &getAnalysis().getTLI(); + // // auto *AA = &getAnalysis(*F).getAAResults(); + // // auto *AC = &getAnalysis().getAssumptionCache(*F); + // auto &ORE = + // getAnalysis(*F).getORE(); + // // OptimizationRemarkEmitter ORE(F); + // // LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, *TTI, TLI, *AA, *AC, ORE); + // LoopSpawningImpl Impl(*F, GetLI, GetSE, GetDT, ORE); + // Changed |= Impl.run(); + + // WorkList.pop_back(); + // } + // return Changed; + // } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + // AU.addRequired(); + // getAAResultsAnalysisUsage(AU); + // AU.addRequired(); + AU.addRequired(); + } +}; +} + +char LoopSpawning::ID = 0; +// static RegisterPass X(LS_NAME, "Transform Tapir loops to spawn iterations efficiently", false, false); +static const char ls_name[] = "Loop Spawning"; +INITIALIZE_PASS_BEGIN(LoopSpawning, LS_NAME, ls_name, false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +// INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis) +// INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) +INITIALIZE_PASS_END(LoopSpawning, LS_NAME, ls_name, false, false) + +namespace llvm { +Pass *createLoopSpawningPass() { + return new LoopSpawning(); +} +} diff --git a/llvm/lib/Transforms/Tapir/LowerToCilk.cpp b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp new file mode 100644 index 00000000000000..2d8b1ccb82572e --- /dev/null +++ b/llvm/lib/Transforms/Tapir/LowerToCilk.cpp @@ -0,0 +1,219 @@ +//===- LowerToCilk.cpp - Convert Tapir into Cilk runtime calls ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts functions that include Tapir instructions to call out to +// the Cilk runtime system. 
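+//
+// At a high level (illustrative; the per-instruction lowering lives in
+// CilkABI.cpp, and names follow the Cilk Plus runtime), a detach is outlined
+// into a spawn-helper function and the detach site is replaced with code
+// along the lines of
+//
+//   __cilkrts_enter_frame_1(&sf);
+//   if (!CILK_SETJMP(sf.ctx))
+//     spawn_helper(...);          // hypothetical name; calls __cilkrts_detach
+//   // continuation runs here and may be stolen
+//
+// while a sync becomes a conditional call to __cilkrts_sync(&sf). The pass
+// below drives this rewriting function by function and then inlines the
+// generated __cilk* runtime shims.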
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/CilkABI.h" +#include "llvm/Analysis/AssumptionCache.h" +#include "llvm/IR/Verifier.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Tapir.h" + +#define DEBUG_TYPE "tapir2cilk" + +using namespace llvm; + +static cl::opt ClInstrumentCilk("instrument-cilk", cl::init(false), + cl::Hidden, + cl::desc("Instrument Cilk events")); + +cl::opt fastCilk("fast-cilk", cl::init(false), cl::Hidden, + cl::desc("Attempt faster cilk call implementation")); + +namespace { + +struct LowerTapirToCilk : public ModulePass { + static char ID; // Pass identification, replacement for typeid + bool DisablePostOpts; + bool Instrument; + explicit LowerTapirToCilk(bool DisablePostOpts = false, bool Instrument = false) + : ModulePass(ID), DisablePostOpts(DisablePostOpts), + Instrument(Instrument) { + initializeLowerTapirToCilkPass(*PassRegistry::getPassRegistry()); + } + + StringRef getPassName() const override { + return "Simple Lowering of Tapir to Cilk ABI"; + } + + bool runOnModule(Module &M) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.addRequired(); + } +private: + ValueToValueMapTy DetachCtxToStackFrame; + bool unifyReturns(Function &F); + SmallVectorImpl *processFunction(Function &F, DominatorTree &DT, + AssumptionCache &AC); +}; +} // End of anonymous namespace + +char LowerTapirToCilk::ID = 0; +INITIALIZE_PASS_BEGIN(LowerTapirToCilk, "tapir2cilk", + "Simple Lowering of Tapir to Cilk ABI", false, false) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_END(LowerTapirToCilk, "tapir2cilk", + "Simple Lowering of Tapir to Cilk ABI", false, false) + +// Helper function to inline calls to compiler-generated Cilk Plus runtime +// functions when possible. This inlining is necessary to properly implement +// some Cilk runtime "calls," such as __cilkrts_detach(). +static inline void inlineCilkFunctions(Function &F) { + bool inlining = true; + while (inlining) { + inlining = false; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (CallInst *cal = dyn_cast(&*I)) + if (Function *fn = cal->getCalledFunction()) + if (fn->getName().startswith("__cilk")) { + InlineFunctionInfo IFI; + if (InlineFunction(cal, IFI)) { + if (fn->getNumUses()==0) + fn->eraseFromParent(); + inlining = true; + break; + } + } + } + + if (verifyFunction(F, &errs())) { + DEBUG(F.dump()); + assert(0); + } +} + +bool LowerTapirToCilk::unifyReturns(Function &F) { + SmallVector ReturningBlocks; + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) + ReturningBlocks.push_back(&BB); + + // If this function already has a single return, then terminate early. + if (ReturningBlocks.size() == 1) + return false; + + BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), + "UnifiedReturnBlock", &F); + PHINode *PN = nullptr; + if (F.getReturnType()->isVoidTy()) { + ReturnInst::Create(F.getContext(), nullptr, NewRetBlock); + } else { + // If the function doesn't return void... add a PHI node to the block... + PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(), + "UnifiedRetVal"); + NewRetBlock->getInstList().push_back(PN); + ReturnInst::Create(F.getContext(), PN, NewRetBlock); + } + + // Loop over all of the blocks, replacing the return instruction with an + // unconditional branch. 
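+  // For example (sketch): two returning blocks
+  //   bb1: ret i32 %a          bb2: ret i32 %b
+  // become
+  //   bb1: br label %UnifiedReturnBlock
+  //   bb2: br label %UnifiedReturnBlock
+  //   UnifiedReturnBlock:
+  //     %UnifiedRetVal = phi i32 [ %a, %bb1 ], [ %b, %bb2 ]
+  //     ret i32 %UnifiedRetVal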
+ // + for (BasicBlock *BB : ReturningBlocks) { + // Add an incoming element to the PHI node for every return instruction that + // is merging into this new block... + if (PN) + PN->addIncoming(BB->getTerminator()->getOperand(0), BB); + + BB->getInstList().pop_back(); // Remove the return insn + BranchInst::Create(NewRetBlock, BB); + } + return true; +} + +SmallVectorImpl +*LowerTapirToCilk::processFunction(Function &F, DominatorTree &DT, + AssumptionCache &AC) { + if (fastCilk && F.getName()=="main") { + IRBuilder<> start(F.getEntryBlock().getFirstNonPHIOrDbg()); + auto m = start.CreateCall(CILKRTS_FUNC(init, *F.getParent())); + m->moveBefore(F.getEntryBlock().getTerminator()); + } + + if (unifyReturns(F)) + DT.recalculate(F); + + // Lower Tapir instructions in this function. Collect the set of helper + // functions generated by this process. + SmallVector *NewHelpers = new SmallVector(); + for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I) { + if (DetachInst* DI = dyn_cast_or_null(I->getTerminator())) { + // Lower a detach instruction, and collect the helper function generated + // in this process for executing the detached task. + Function *Helper = cilk::createDetach(*DI, DetachCtxToStackFrame, DT, AC, + ClInstrumentCilk || Instrument); + NewHelpers->push_back(Helper); + } else if (SyncInst* SI = dyn_cast_or_null(I->getTerminator())) { + // Lower a sync instruction. + cilk::createSync(*SI, DetachCtxToStackFrame, + ClInstrumentCilk || Instrument); + } + } + + if (verifyFunction(F, &errs())) { + DEBUG(F.dump()); + assert(0); + } + + // Inline Cilk runtime calls in the function and generated helper functions. + inlineCilkFunctions(F); + for (Function *H : *NewHelpers) + inlineCilkFunctions(*H); + + return NewHelpers; +} + +bool LowerTapirToCilk::runOnModule(Module &M) { + if (skipModule(M)) + return false; + + // Add functions that detach to the work list. + SmallVector WorkList; + for (Function &F : M) + for (BasicBlock &BB : F) + if (isa(BB.getTerminator())) { + WorkList.push_back(&F); + break; + } + + if (WorkList.empty()) + return false; + + bool Changed = false; + std::unique_ptr> NewHelpers; + while (!WorkList.empty()) { + // Process the next function. + Function *F = WorkList.back(); + WorkList.pop_back(); + DominatorTree &DT = getAnalysis(*F).getDomTree(); + AssumptionCacheTracker &ACT = getAnalysis(); + NewHelpers.reset(processFunction(*F, DT, ACT.getAssumptionCache(*F))); + Changed |= !NewHelpers->empty(); + // Check the generated helper functions to see if any need to be processed, + // that is, to see if any of them themselves detach a subtask. + for (Function *Helper : *NewHelpers) + for (BasicBlock &BB : *Helper) + if (isa(BB.getTerminator())) + WorkList.push_back(Helper); + } + return Changed; +} + +// createLowerTapirToCilkPass - Provide an entry point to create this pass. +// +namespace llvm { +ModulePass *createLowerTapirToCilkPass(bool DisablePostOpts, bool Instrument) { + return new LowerTapirToCilk(DisablePostOpts, Instrument); +} +} diff --git a/llvm/lib/Transforms/Tapir/Outline.cpp b/llvm/lib/Transforms/Tapir/Outline.cpp new file mode 100644 index 00000000000000..ce347c4bf7fdf6 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Outline.cpp @@ -0,0 +1,379 @@ +//===- TapirOutline.cpp - Outlining for Tapir -----------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements helper functions for outlining portions of code +// containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir/Outline.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "outlining" + +/// definedInRegion - Return true if the specified value is defined in the +/// extracted region. +static bool definedInRegion(const SmallPtrSetImpl &Blocks, + Value *V) { + if (Instruction *I = dyn_cast(V)) + if (Blocks.count(I->getParent())) + return true; + return false; +} + +/// definedInCaller - Return true if the specified value is defined in the +/// function being code extracted, but not in the region being extracted. +/// These values must be passed in as live-ins to the function. +static bool definedInCaller(const SmallPtrSetImpl &Blocks, + Value *V) { + if (isa(V)) return true; + if (Instruction *I = dyn_cast(V)) + if (!Blocks.count(I->getParent())) + return true; + return false; +} + +void llvm::findInputsOutputs(const SmallPtrSetImpl &Blocks, + ValueSet &Inputs, + ValueSet &Outputs, + const SmallPtrSetImpl *ExitBlocks) { + for (BasicBlock *BB : Blocks) { + // If a used value is defined outside the region, it's an input. If an + // instruction is used outside the region, it's an output. + for (Instruction &II : *BB) { + for (User::op_iterator OI = II.op_begin(), OE = II.op_end(); OI != OE; + ++OI) { + // The PHI nodes in each exit block will be updated after the exit block + // is cloned. Hence, we don't want to count their uses of values + // defined outside the region. + if (ExitBlocks->count(BB)) + if (PHINode *PN = dyn_cast(&II)) + if (!Blocks.count(PN->getIncomingBlock(*OI))) + continue; + if (definedInCaller(Blocks, *OI)) + Inputs.insert(*OI); + } + + for (User *U : II.users()) + if (!definedInRegion(Blocks, U)) { + Outputs.insert(&II); + break; + } + } + } +} + +// Clone Blocks into NewFunc, transforming the old arguments into references to +// VMap values. +// +/// TODO: Fix the std::vector part of the type of this function. +void llvm::CloneIntoFunction(Function *NewFunc, const Function *OldFunc, + std::vector Blocks, + ValueToValueMapTy &VMap, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks, + DISubprogram *SP, + ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + // Get the predecessors of the exit blocks + SmallPtrSet ExitBlockPreds, ClonedEBPreds; + for (BasicBlock *EB : *ExitBlocks) + for (BasicBlock *Pred : predecessors(EB)) + ExitBlockPreds.insert(Pred); + + // When we remap instructions, we want to avoid duplicating inlined + // DISubprograms, so record all subprograms we find as we duplicate + // instructions and then freeze them in the MD map. + DebugInfoFinder DIFinder; + + // Loop over all of the basic blocks in the function, cloning them as + // appropriate. + for (const BasicBlock *BB : Blocks) { + // Record all exit block predecessors that are cloned. + if (ExitBlockPreds.count(BB)) + ClonedEBPreds.insert(BB); + + // Create a new basic block and copy instructions into it! + BasicBlock *CBB = CloneBasicBlock(BB, VMap, NameSuffix, NewFunc, CodeInfo, + SP ? 
&DIFinder : nullptr); + + // Add basic block mapping. + VMap[BB] = CBB; + + // It is only legal to clone a function if a block address within that + // function is never referenced outside of the function. Given that, we + // want to map block addresses from the old function to block addresses in + // the clone. (This is different from the generic ValueMapper + // implementation, which generates an invalid blockaddress when + // cloning a function.) + if (BB->hasAddressTaken()) { + Constant *OldBBAddr = BlockAddress::get(const_cast(OldFunc), + const_cast(BB)); + VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB); + } + + // Note return instructions for the caller. + if (ReturnInst *RI = dyn_cast(CBB->getTerminator())) + Returns.push_back(RI); + } + + // For each exit block, clean up its phi nodes to exclude predecessors that + // were not cloned. + if (ExitBlocks) { + for (BasicBlock *EB : *ExitBlocks) { + // Get the predecessors of this exit block that were not cloned. + SmallVector PredNotCloned; + for (BasicBlock *Pred : predecessors(EB)) + if (!ClonedEBPreds.count(Pred)) + PredNotCloned.push_back(Pred); + + // Iterate over the phi nodes in the cloned exit block and remove incoming + // values from predecessors that were not cloned. + BasicBlock *ClonedEB = cast(VMap[EB]); + BasicBlock::iterator BI = ClonedEB->begin(); + while (PHINode *PN = dyn_cast(BI)) { + for (BasicBlock *DeadPred : PredNotCloned) + if (PN->getBasicBlockIndex(DeadPred) > -1) + PN->removeIncomingValue(DeadPred); + ++BI; + } + } + } + + // for (DISubprogram *ISP : DIFinder.subprograms()) { + // if (ISP != SP) { + // VMap.MD()[ISP].reset(ISP); + // } + // } + + // Loop over all of the instructions in the function, fixing up operand + // references as we go. This uses VMap to do all the hard work. + for (const BasicBlock *BB : Blocks) { + BasicBlock *CBB = cast(VMap[BB]); + // Loop over all instructions, fixing each one as we find it... + for (Instruction &II : *CBB) + RemapInstruction(&II, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + } +} + +/// Create a helper function whose signature is based on Inputs and +/// Outputs as follows: f(in0, ..., inN, out0, ..., outN) +/// +/// TODO: Fix the std::vector part of the type of this function. +Function *llvm::CreateHelper(const ValueSet &Inputs, + const ValueSet &Outputs, + std::vector Blocks, + BasicBlock *Header, + const BasicBlock *OldEntry, + const BasicBlock *OldExit, + ValueToValueMapTy &VMap, + Module *DestM, + bool ModuleLevelChanges, + SmallVectorImpl &Returns, + const StringRef NameSuffix, + SmallPtrSetImpl *ExitBlocks, + const Instruction *InputSyncRegion, + ClonedCodeInfo *CodeInfo, + ValueMapTypeRemapper *TypeMapper, + ValueMaterializer *Materializer) { + DEBUG(dbgs() << "inputs: " << Inputs.size() << "\n"); + DEBUG(dbgs() << "outputs: " << Outputs.size() << "\n"); + + Function *OldFunc = Header->getParent(); + Type *RetTy = Type::getVoidTy(Header->getContext()); + + std::vector paramTy; + + // Add the types of the input values to the function's argument list + for (Value *value : Inputs) { + DEBUG(dbgs() << "value used in func: " << *value << "\n"); + paramTy.push_back(value->getType()); + } + + // Add the types of the output values to the function's argument list. 
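+  // Outputs are passed by address: an i32 produced in the region becomes an
+  // i32* parameter that the helper writes through, giving the
+  // f(in0, ..., inN, out0, ..., outN) signature described above.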
+ for (Value *output : Outputs) { + DEBUG(dbgs() << "instr used in func: " << *output << "\n"); + paramTy.push_back(PointerType::getUnqual(output->getType())); + } + + DEBUG({ + dbgs() << "Function type: " << *RetTy << " f("; + for (Type *i : paramTy) + dbgs() << *i << ", "; + dbgs() << ")\n"; + }); + + FunctionType *FTy = FunctionType::get(RetTy, paramTy, false); + + // Create the new function + Function *NewFunc = Function::Create(FTy, + GlobalValue::InternalLinkage, + OldFunc->getName() + "_" + + Header->getName() + NameSuffix, DestM); + + // Set names for input and output arguments. + Function::arg_iterator DestI = NewFunc->arg_begin(); + for (Value *I : Inputs) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + for (Value *I : Outputs) + if (VMap.count(I) == 0) { // Is this argument preserved? + DestI->setName(I->getName()+NameSuffix); // Copy the name over... + VMap[I] = &*DestI++; // Add mapping to VMap + } + + // Copy all attributes other than those stored in the AttributeSet. We need + // to remap the parameter indices of the AttributeSet. + AttributeList NewAttrs = NewFunc->getAttributes(); + NewFunc->copyAttributesFrom(OldFunc); + NewFunc->setAttributes(NewAttrs); + + // Fix up the personality function that got copied over. + if (OldFunc->hasPersonalityFn()) + NewFunc->setPersonalityFn( + MapValue(OldFunc->getPersonalityFn(), VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + + SmallVector NewArgAttrs(NewFunc->arg_size()); + AttributeList OldAttrs = OldFunc->getAttributes(); + + // Clone any argument attributes + for (Argument &OldArg : OldFunc->args()) { + // Check if we're passing this argument to the helper. We check Inputs here + // instead of the VMap to avoid potentially populating the VMap with a null + // entry for the old argument. + if (Inputs.count(&OldArg) || Outputs.count(&OldArg)) { + Argument *NewArg = dyn_cast(VMap[&OldArg]); + NewArgAttrs[NewArg->getArgNo()] = + OldAttrs.getParamAttributes(OldArg.getArgNo()); + } + } + + // Ignore the return attributes of the old function. + NewFunc->setAttributes( + AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(), + AttributeSet(), NewArgAttrs)); + + // Clone the metadata from the old function into the new. + bool MustCloneSP = + OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent(); + DISubprogram *SP = OldFunc->getSubprogram(); + if (SP) { + assert(!MustCloneSP || ModuleLevelChanges); + // Add mappings for some DebugInfo nodes that we don't want duplicated + // even if they're distinct. + auto &MD = VMap.MD(); + MD[SP->getUnit()].reset(SP->getUnit()); + MD[SP->getType()].reset(SP->getType()); + MD[SP->getFile()].reset(SP->getFile()); + // If we're not cloning into the same module, no need to clone the + // subprogram + if (!MustCloneSP) + MD[SP].reset(SP); + } + + SmallVector, 1> MDs; + OldFunc->getAllMetadata(MDs); + for (auto MD : MDs) { + NewFunc->addMetadata( + MD.first, + *MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer)); + } + + // We assume that the Helper reads and writes its arguments. If the parent + // function had stronger attributes on memory access -- specifically, if the + // parent is marked as only reading memory -- we must replace this attribute + // with an appropriate weaker form. 
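+  // For example, outlining from a readonly parent must not produce a readonly
+  // helper, since the helper stores its results through the output pointers;
+  // the code below therefore drops readnone/readonly and keeps only the
+  // argument-memory-only property.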
+ if (OldFunc->onlyReadsMemory()) { + NewFunc->removeFnAttr(Attribute::ReadNone); + NewFunc->removeFnAttr(Attribute::ReadOnly); + NewFunc->setOnlyAccessesArgMemory(); + } + + // Inherit the calling convention from the parent. + NewFunc->setCallingConv(OldFunc->getCallingConv()); + + // The new function needs a root node because other nodes can branch to the + // head of the region, but the entry node of a function cannot have preds. + BasicBlock *NewEntry = BasicBlock::Create(Header->getContext(), + OldEntry->getName()+NameSuffix, + NewFunc); + // The new function also needs an exit node. + BasicBlock *NewExit = BasicBlock::Create(Header->getContext(), + OldExit->getName()+NameSuffix, + NewFunc); + + // Add mappings to the NewEntry and NewExit. + VMap[OldEntry] = NewEntry; + VMap[OldExit] = NewExit; + + // Create new sync region to replace the old one containing any cloned Tapir + // instructions, and add the appropriate mappings. + if (InputSyncRegion) { + Instruction *NewSR = InputSyncRegion->clone(); + if (InputSyncRegion->hasName()) + NewSR->setName(InputSyncRegion->getName()+NameSuffix); + NewEntry->getInstList().push_back(NewSR); + VMap[InputSyncRegion] = NewSR; + } + + // Clone Blocks into the new function. + CloneIntoFunction(NewFunc, OldFunc, Blocks, VMap, ModuleLevelChanges, + Returns, NameSuffix, ExitBlocks, SP, CodeInfo, + TypeMapper, Materializer); + + // Add a branch in the new function to the cloned Header. + BranchInst::Create(cast(VMap[Header]), NewEntry); + // Add a return in the new function. + ReturnInst::Create(Header->getContext(), NewExit); + + return NewFunc; +} + +// Add alignment assumptions to parameters of outlined function, based on known +// alignment data in the caller. +void llvm::AddAlignmentAssumptions(const Function *Caller, + const ValueSet &Inputs, + ValueToValueMapTy &VMap, + const Instruction *CallSite, + AssumptionCache *AC, + DominatorTree *DT) { + auto &DL = Caller->getParent()->getDataLayout(); + for (Value *ArgVal : Inputs) { + // Ignore arguments to non-pointer types + if (!ArgVal->getType()->isPointerTy()) continue; + Argument *Arg = cast(VMap[ArgVal]); + // Ignore arguments to non-pointer types + if (!Arg->getType()->isPointerTy()) continue; + // If the argument already has an alignment attribute, skip it. + if (Arg->getParamAlignment()) continue; + // Get any known alignment information for this argument's value. + unsigned Align = getKnownAlignment(ArgVal, DL, CallSite, AC, DT); + // If we have alignment data, add it as an attribute to the outlined + // function's parameter. 
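+  // For example, if getKnownAlignment proves the caller's value is 16-byte
+  // aligned, the matching helper parameter gets an align 16 attribute, which
+  // later passes can rely on inside the outlined body.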
+ if (Align) + Arg->addAttr(Attribute::getWithAlignment(Arg->getContext(), Align)); + } +} diff --git a/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp new file mode 100644 index 00000000000000..8b9242b1424e4a --- /dev/null +++ b/llvm/lib/Transforms/Tapir/RedundantSpawn.cpp @@ -0,0 +1,87 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct RedundantSpawn : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + RedundantSpawn() : FunctionPass(ID) { + //initializeRedundantSpawnPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("RedundantSpawn_"+F.getName()); + + bool effective = false; + do { + effective = false; + TerminatorInst* prior = nullptr; + BasicBlock* start = nullptr; + bool lookForDetach = false; + int rank = 0; + for (BasicBlock &BB: F) { + if (isa(BB.getTerminator()) && BB.size() == 1) { + lookForDetach = true; + start = &BB; + effective = true; + break; + } + if (prior != nullptr && isa(prior)) + rank +=1; + if (prior != nullptr && isa(prior)) + rank -=1; + prior = BB.getTerminator(); + } + if (lookForDetach) { + BasicBlock* current = start; + int currentRank = rank; + while (true) { + for (BasicBlock *Pred : predecessors(current)) { + current = Pred; + break; + } + if (isa(current->getTerminator()) && currentRank == rank) { + BranchInst* replaceReattach = BranchInst::Create(start->getSingleSuccessor()); + BranchInst* replaceDetach = BranchInst::Create(current->getTerminator()->getSuccessor(0)); + ReplaceInstWithInst(start->getTerminator(), replaceReattach); + ReplaceInstWithInst(current->getTerminator(), replaceDetach); + break; + } + if (isa(current->getTerminator())) + currentRank -= 1; + if (isa(current->getTerminator())) + currentRank += 1; + } + } + } while (effective); + + return true; + } +}; +} + +char RedundantSpawn::ID = 0; +static RegisterPass X("redundantspawn", "Do RedundantSpawn pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createRedundantSpawnPass() { + return new RedundantSpawn(); +} diff --git a/llvm/lib/Transforms/Tapir/SmallBlock.cpp b/llvm/lib/Transforms/Tapir/SmallBlock.cpp new file mode 100644 index 00000000000000..c46e90baeb620a --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SmallBlock.cpp @@ -0,0 +1,68 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" + +using namespace llvm; + +namespace { +struct SmallBlock : public FunctionPass { + static const int threshold = 10; + static char ID; // Pass identification, replacement for typeid + SmallBlock() : FunctionPass(ID) { + //initializeSmallBlockPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + 
if (skipFunction(F)) + return false; + + F.setName("SmallBlock_"+F.getName()); + + BasicBlock* b = nullptr; + BasicBlock* prior = nullptr; + bool effective; + int count = 0; + do { + effective = false; + for (BasicBlock &BB: F) { + count += BB.size(); + if (isa(BB.getTerminator())) { + b = &BB; + count = 0; + } + if (isa(BB.getTerminator()) && count < threshold && prior != b) { + // b ensured to be the corresponding reattach + effective = true; + prior = b; + BranchInst* replaceReattach = BranchInst::Create(BB.getSingleSuccessor()); + BranchInst* replaceDetach = BranchInst::Create(b->getTerminator()->getSuccessor(0)); + ReplaceInstWithInst(BB.getTerminator(), replaceReattach); + ReplaceInstWithInst(b->getTerminator(), replaceDetach); + } + } + } while (effective); + + return true; + } +}; +} + +char SmallBlock::ID = 0; +static RegisterPass X("smallblock", "Do SmallBlock pass", false, false); + +// Public interface to the SmallBlock pass +FunctionPass *llvm::createSmallBlockPass() { + return new SmallBlock(); +} diff --git a/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp new file mode 100644 index 00000000000000..2b0b15ca1900a6 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SpawnRestructure.cpp @@ -0,0 +1,48 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct SpawnRestructure : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SpawnRestructure() : FunctionPass(ID) { + //initializeSpawnRestructurePass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("SpawnRestructure_"+F.getName()); + + for (BasicBlock &BB: F) { + + } + + return true; + } +}; +} + +char SpawnRestructure::ID = 0; +static RegisterPass X("spawnrestructure", "Do SpawnRestructure pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createSpawnRestructurePass() { + return new SpawnRestructure(); +} diff --git a/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp new file mode 100644 index 00000000000000..9206c90b987393 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SpawnUnswitch.cpp @@ -0,0 +1,96 @@ + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/IR/CFG.h" + +using namespace llvm; + +namespace { +struct SpawnUnswitch : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + SpawnUnswitch() : FunctionPass(ID) { + //initializeSpawnUnswitchPass(*PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + //AU.addRequired(); + //AU.addPreserved(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + F.setName("SpawnUnswitch_"+F.getName()); + + + bool effective; + do { + effective = false; + BasicBlock* 
body = nullptr; + BasicBlock* end = nullptr; + + for (BasicBlock &BB: F) { + if (BB.size() == 1 && isa(BB.getTerminator())) { + end = BB.getSingleSuccessor(); + int count = 0; + for (BasicBlock *Pred : predecessors(&BB)) { + for (BasicBlock *PredPred : predecessors(Pred)) { + if (!isa(PredPred->getTerminator())) { + body = Pred; + } + } + count++; + } + if (count == 2) { // only predecessors are det.achd and if.then + for (BasicBlock *Pred : predecessors(&BB)) { + if (Pred->size() == 2 && isa(Pred->getTerminator())) { // if clause only compares register contents + Instruction* cmp = nullptr; + for (Instruction &I : *Pred) { + cmp = &I; + break; + } + for (BasicBlock *PredPred : predecessors(Pred)) { + if (DetachInst *DI = dyn_cast(PredPred->getTerminator())) { // outer spawn + Value *SyncRegion = DI->getSyncRegion(); + effective = true; + // move cmp instruction to outside spawn + Instruction *pi = PredPred->getTerminator(); + cmp->moveBefore(pi); + + // branch now to detach or end + TerminatorInst* temp = Pred->getTerminator(); + BranchInst* replaceDetach = BranchInst::Create(Pred, end, ((BranchInst*)temp)->getCondition()); + ReplaceInstWithInst(PredPred->getTerminator(), replaceDetach); + + // detach now goes straight to body + DetachInst* newDetach = DetachInst::Create(body, end, SyncRegion); + ReplaceInstWithInst(Pred->getTerminator(), newDetach); + } + } + } + } + } + } + } + } while (effective); + + return true; + } +}; +} + +char SpawnUnswitch::ID = 0; +static RegisterPass X("spawnunswitch", "Do SpawnUnswitch pass", false, false); + +// Public interface to the RedundantSpawn pass +FunctionPass *llvm::createSpawnUnswitchPass() { + return new SpawnUnswitch(); +} diff --git a/llvm/lib/Transforms/Tapir/SyncElimination.cpp b/llvm/lib/Transforms/Tapir/SyncElimination.cpp new file mode 100644 index 00000000000000..62301069348471 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/SyncElimination.cpp @@ -0,0 +1,273 @@ +//===- SyncElimination.cpp - Eliminate unnecessary sync calls ----------------===// + +#include "llvm/Transforms/Tapir.h" + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Pass.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/CFG.h" +#include "llvm/ADT/SmallSet.h" + +#include +#include + +using namespace llvm; + +namespace { + +typedef SmallSet BasicBlockSet; +typedef std::deque BasicBlockDeque; + +struct SyncElimination : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + + SyncElimination() : FunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + } + + bool runOnFunction(Function &F) override { + if (skipFunction(F)) + return false; + + errs() << "SyncElimination: Found function: " << F.getName() << "\n"; + + bool ChangedAny = false; + + while (true) { + bool Changed = false; + + for (BasicBlock &block: F) { + if (isa(block.getTerminator())) { + if (processSyncInstBlock(block)) { + Changed = true; + ChangedAny = true; + break; + } + } + } + + if (!Changed) { + break; + } + } + + return ChangedAny; + } + +private: + + // We will explain what Rosetta and Vegas are later. Or rename them. + // We promise. 
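+  // Informally: the Rosetta set is computed by walking backwards from the
+  // sync block while tracking detach nesting, and collects the blocks of
+  // detached work that this sync actually waits for. The Vegas set is
+  // computed by walking forwards from the sync block up to (and including)
+  // the next sync. The sync can be removed only if no instruction in the
+  // Rosetta set may conflict with one in the Vegas set (see
+  // isSyncEliminationLegal below).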
+ + // Rosetta-finding code + + void findRosetta(const BasicBlock &BB, BasicBlockSet &OutputSet) { + assert(isa(BB.getTerminator())); + + BasicBlockSet Visited; + BasicBlockDeque Frontier; + std::map DetachLevel; + + DetachLevel[&BB] = 0; + Frontier.push_back(&BB); + OutputSet.insert(&BB); + + while (!Frontier.empty()) { + const BasicBlock *Current = Frontier.front(); + Frontier.pop_front(); + + for (const BasicBlock *Pred: predecessors(Current)) { + // TODO@jiahao: Investigate potential issues with continue edges here. + + if (Visited.count(Pred) > 0) { + continue; + } + + if (isa(Pred->getTerminator())) { + continue; + } + + Visited.insert(Pred); + + DetachLevel[Pred] = DetachLevel[Current]; + + if (isa(Pred->getTerminator())) { + DetachLevel[Pred] ++; + } else if (isa(Pred->getTerminator())) { + DetachLevel[Pred] --; + } + + if (DetachLevel[Pred] > 0) { + OutputSet.insert(Pred); + } + + if (DetachLevel[Pred] >= 0) { + Frontier.push_back(Pred); + } + } + } + } + + // Vegas-finding code + // + // We run BFS starting from the sync block, following all foward edges, and stop a branch whenever + // we hit another sync block. + + void findVegas(const BasicBlock &BB, BasicBlockSet &OutputSet) { + assert(isa(BB.getTerminator())); + + BasicBlockSet Visited; + BasicBlockDeque Frontier; + + Frontier.push_back(&BB); + + while (!Frontier.empty()) { + const BasicBlock *Current = Frontier.front(); + Frontier.pop_front(); + + for (const BasicBlock *Succ: successors(Current)) { + if (Visited.count(Succ) > 0) { + continue; + } + + Visited.insert(Succ); + OutputSet.insert(Succ); + + // We need to include blocks whose terminator is another sync. + // Therefore we still insert the block into OutputSet in this case. + // However we do not search any further past the sync block. + if (!isa(Succ->getTerminator())) { + Frontier.push_back(Succ); + } + } + } + } + + bool willMod(const ModRefInfo &Info) { + return (Info == MRI_Mod || Info == MRI_ModRef); + } + + bool instTouchesMemory(const Instruction &Inst) { + return Inst.getOpcode() == Instruction::Load || + Inst.getOpcode() == Instruction::Store || + Inst.getOpcode() == Instruction::VAArg || + Inst.getOpcode() == Instruction::AtomicCmpXchg || + Inst.getOpcode() == Instruction::AtomicRMW; + } + + // FIXME: we can do better + void checkBlowUp(const Instruction &Inst) { + if (isa(Inst)) { + errs() << Inst << "\n"; + llvm_unreachable("BOOOOOOOOOOOOOOOOOOOOOOOOM! 
not supported (yet)"); + } + } + + bool isSyncEliminationLegal(const BasicBlockSet &RosettaSet, const BasicBlockSet &VegasSet) { + AliasAnalysis *AA = &getAnalysis().getAAResults(); + + for (const BasicBlock *RBB : RosettaSet) { + for (const Instruction &RI : *RBB) { + checkBlowUp(RI); + + if (RI.getOpcode() == Instruction::Sync) { + continue; + } + + for (const BasicBlock *VBB : VegasSet) { + for (const Instruction &VI : *VBB) { + checkBlowUp(VI); + + if (VI.getOpcode() == Instruction::Sync) { + continue; + } + + ImmutableCallSite RC(&RI), VC(&VI); + + if (!!RC) { + // If RI is a call/invoke + if (instTouchesMemory(VI) && + AA->getModRefInfo(const_cast(&VI), RC) != MRI_NoModRef) { + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } else if (!!VC) { + // If VI is a call/invoke + if (instTouchesMemory(RI) && + AA->getModRefInfo(const_cast(&RI), VC) != MRI_NoModRef) { + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } else { + if (!instTouchesMemory(VI) || !instTouchesMemory(RI)) { + continue; + } + + // If neither instruction is a call/invoke + MemoryLocation VML = MemoryLocation::get(&VI); + MemoryLocation RML = MemoryLocation::get(&RI); + + if (AA->alias(RML, VML) && (willMod(AA->getModRefInfo(&RI, RML)) || willMod(AA->getModRefInfo(&VI, VML)))) { + // If the two memory location can potentially be aliasing each other, and + // at least one instruction modifies its memory location. + errs() << "SyncElimination: Conflict found between " << RI << " and " << VI << "\n"; + return false; + } + } + } + } + } + } + + return true; + } + + bool processSyncInstBlock(BasicBlock &BB) { + errs() << "SyncElimination: Found sync block: " << BB.getName() << "\n"; + + BasicBlockSet RosettaSet, VegasSet; + + findRosetta(BB, RosettaSet); + findVegas(BB, VegasSet); + + errs() << "SyncElimination: Blocks found in the Rosetta set: " << "\n"; + for (const BasicBlock *BB: RosettaSet) { + errs() << "SyncElimination: " + BB->getName() << "\n"; + } + + errs() << "SyncElimination: Blocks found in the Vegas set: " << "\n"; + for (const BasicBlock *BB: VegasSet) { + errs() << "SyncElimination: " + BB->getName() << "\n"; + } + + if (isSyncEliminationLegal(RosettaSet, VegasSet)) { + SyncInst *Sync = dyn_cast(BB.getTerminator()); + assert(Sync != NULL); + BasicBlock* suc = Sync->getSuccessor(0); + IRBuilder<> Builder(Sync); + Builder.CreateBr(suc); + Sync->eraseFromParent(); + errs() << "SyncElimination: A sync is removed. " << "\n"; + return true; + } + + return false; + } +}; + +} + +char SyncElimination::ID = 0; +static RegisterPass X("sync-elimination", "Do sync-elimination's pass", false, false); + +// Public interface to the SyncElimination pass +FunctionPass *llvm::createSyncEliminationPass() { + return new SyncElimination(); +} diff --git a/llvm/lib/Transforms/Tapir/Tapir.cpp b/llvm/lib/Transforms/Tapir/Tapir.cpp new file mode 100644 index 00000000000000..50813076c64b10 --- /dev/null +++ b/llvm/lib/Transforms/Tapir/Tapir.cpp @@ -0,0 +1,43 @@ +//===-- Tapir.cpp ---------------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements common infrastructure for libLLVMTapirOpts.a, which +// implements several transformations over the Tapir/LLVM intermediate +// representation, including the C bindings for that library. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Tapir.h" +#include "llvm-c/Initialization.h" +#include "llvm-c/Transforms/Tapir.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/IR/Verifier.h" +#include "llvm/InitializePasses.h" +#include "llvm/IR/LegacyPassManager.h" + +using namespace llvm; + +/// initializeTapirOpts - Initialize all passes linked into the +/// TapirOpts library. +void llvm::initializeTapirOpts(PassRegistry &Registry) { + initializeLoopSpawningPass(Registry); + initializeLowerTapirToCilkPass(Registry); +} + +void LLVMInitializeTapirOpts(LLVMPassRegistryRef R) { + initializeTapirOpts(*unwrap(R)); +} + +void LLVMAddLoopSpawningPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLoopSpawningPass()); +} + +void LLVMAddLowerTapirToCilkPass(LLVMPassManagerRef PM) { + unwrap(PM)->add(createLowerTapirToCilkPass()); +} diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index 7da768252fc198..2402e4b99779c2 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -149,6 +149,18 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU, // Don't break unwinding instructions. if (PredBB->getTerminator()->isExceptionalTerminator()) return false; + // For now, don't break syncs. + // TODO: Don't break syncs unless they don't sync anything. + if (isa(PredBB->getTerminator())) return false; + // Don't break entry blocks of detached CFG's. + for (pred_iterator PI = pred_begin(PredBB), PE = pred_end(PredBB); + PI != PE; ++PI) { + BasicBlock *PredPredBB = *PI; + if (const DetachInst *DI = + dyn_cast(PredPredBB->getTerminator())) + if (DI->getDetached() == PredBB) + return false; + } // Can't merge if there are multiple distinct successors. if (PredBB->getUniqueSuccessor() != BB) @@ -301,7 +313,18 @@ BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT, // block. assert(BB->getTerminator()->getNumSuccessors() == 1 && "Should have a single succ!"); - return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU); + // return SplitBlock(BB, BB->getTerminator(), DT, LI); + BasicBlock *NewBB = SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU); + if (SyncInst *OldSI = dyn_cast(NewBB->getTerminator())) { + // Make sure the original BB is terminated by the sync. 
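+    // SplitBlock left the sync at the end of NewBB; rewrite this so that BB
+    // ends in a sync targeting NewBB and NewBB ends in an unconditional
+    // branch to Succ, keeping the sync on the original side of the split
+    // edge.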
+ SyncInst *SI = SyncInst::Create(NewBB, OldSI->getSyncRegion(), + BB->getTerminator()); + BranchInst::Create(Succ, OldSI); + SI->setDebugLoc(OldSI->getDebugLoc()); + BB->getTerminator()->eraseFromParent(); + OldSI->eraseFromParent(); + } + return NewBB; } unsigned diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp index fafc9aaba5c9cc..befb2ed13587e9 100644 --- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp +++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp @@ -137,10 +137,27 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, assert(!isa(TI) && "Cannot split critical edge from IndirectBrInst"); + assert(!isa(TI) && + "Cannot split critical edge from ReattachInst"); + + bool SplittingDetachContinue = isa(TI) && (1 == SuccNum); + if (SplittingDetachContinue) + assert((Options.SplitDetachContinue && Options.DT) && + "Cannot split critical continuation edge from a detach"); BasicBlock *TIBB = TI->getParent(); BasicBlock *DestBB = TI->getSuccessor(SuccNum); + // If we're splitting a detach-continue edge, get the associated reattaches. + SmallVector Reattaches; + if (SplittingDetachContinue) { + BasicBlockEdge DetachEdge(TIBB, TI->getSuccessor(0)); + for (BasicBlock *Pred : predecessors(DestBB)) + if (isa(Pred->getTerminator())) + if (Options.DT->dominates(DetachEdge, Pred)) + Reattaches.push_back(Pred); + } + // Splitting the critical edge to a pad block is non-trivial. Don't do // it in this generic function. if (DestBB->isEHPad()) return nullptr; @@ -155,6 +172,12 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, // Branch to the new block, breaking the edge. TI->setSuccessor(SuccNum, NewBB); + // If we're splitting a detach-continue edge, redirect all appropriate + // reattach edges to branch to the new block + if (SplittingDetachContinue) + for (BasicBlock *RBB : Reattaches) + RBB->getTerminator()->setSuccessor(0, NewBB); + // Insert the block into the function... right after the block TI lives in. Function &F = *TIBB->getParent(); Function::iterator FBBI = TIBB->getIterator(); @@ -179,6 +202,28 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum, BBIdx = PN->getBasicBlockIndex(TIBB); PN->setIncomingBlock(BBIdx, NewBB); } + + // Update the PHI node entries for the reattach predecessors as well. + if (SplittingDetachContinue) { + for (BasicBlock *RBB : Reattaches) { + unsigned BBIdx = 0; + for (BasicBlock::iterator I = DestBB->begin(); isa(I); ++I) { + // We no longer enter through RBB, now we come in through NewBB. + // Revector exactly one entry in the PHI node that used to come from + // TIBB to come from NewBB. + PHINode *PN = cast(I); + + // Reuse the previous value of BBIdx if it lines up. In cases where we + // have multiple phi nodes with *lots* of predecessors, this is a speed + // win because we don't have to scan the PHI looking for TIBB. This + // happens because the BB list of PHI nodes are usually in the same + // order. 
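+        // Unlike the TIBB entry, which was revectored to NewBB above, each
+        // reattach predecessor's entry is dropped outright: those edges now
+        // reach DestBB through NewBB and use its PHI entry.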
+ if (PN->getIncomingBlock(BBIdx) != RBB) + BBIdx = PN->getBasicBlockIndex(RBB); + PN->removeIncomingValue(BBIdx); + } + } + } } // If there are any other edges from TIBB to DestBB, update those to go diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt index cb3dc17c03ad8d..e89b1d3c221cc2 100644 --- a/llvm/lib/Transforms/Utils/CMakeLists.txt +++ b/llvm/lib/Transforms/Utils/CMakeLists.txt @@ -54,6 +54,7 @@ add_llvm_library(LLVMTransformUtils SplitModule.cpp StripNonLineTableDebugInfo.cpp SymbolRewriter.cpp + TapirUtils.cpp UnifyFunctionExitNodes.cpp Utils.cpp ValueMapper.cpp diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 623fe91a5a6094..42ad327ab195e9 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -62,6 +62,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -1623,6 +1625,18 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, !isa(CallSiteUnwindDestToken); } + // Get the entry block of the detached context into which we're inlining. If + // we move allocas from the inlined code, we must move them to this block. + BasicBlock *DetachedCtxEntryBlock; + { + BasicBlock *CallingBlock = TheCall->getParent(); + DetachedCtxEntryBlock = GetDetachedCtx(CallingBlock); + assert(((&(CallingBlock->getParent()->getEntryBlock()) == + DetachedCtxEntryBlock) || + DetachedCtxEntryBlock->getSinglePredecessor()) && + "Entry block of detached context has multiple predecessors."); + } + // Get an iterator to the last basic block in the function, which will have // the new function inlined after it. Function::iterator LastBlock = --Caller->end(); @@ -1781,7 +1795,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // calculate which instruction they should be inserted before. We insert the // instructions at the end of the current alloca list. { - BasicBlock::iterator InsertPoint = Caller->begin()->begin(); + // BasicBlock::iterator InsertPoint = Caller->begin()->begin(); + BasicBlock::iterator InsertPoint = DetachedCtxEntryBlock->begin(); for (BasicBlock::iterator I = FirstNewBlock->begin(), E = FirstNewBlock->end(); I != E; ) { AllocaInst *AI = dyn_cast(I++); @@ -1811,7 +1826,9 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Transfer all of the allocas over in a block. Using splice means // that the instructions aren't removed from the symbol table, then // reinserted. - Caller->getEntryBlock().getInstList().splice( + // Caller->getEntryBlock().getInstList().splice( + // InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); + DetachedCtxEntryBlock->getInstList().splice( InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I); } // Move any dbg.declares describing the allocas into the entry basic block. @@ -1819,6 +1836,23 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, for (auto &AI : IFI.StaticAllocas) replaceDbgDeclareForAlloca(AI, AI, DIB, DIExpression::NoDeref, 0, DIExpression::NoDeref); + + // Move any syncregion_start's into the entry basic block. 
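+    // Like the static allocas above, @llvm.syncregion.start calls inlined
+    // from the callee are hoisted into the entry block of the enclosing
+    // detached context, so every cloned detach and sync keeps using a
+    // sync-region token that dominates it.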
+ for (BasicBlock::iterator I = FirstNewBlock->begin(), + E = FirstNewBlock->end(); I != E; ) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + DetachedCtxEntryBlock->getInstList().splice( + InsertPoint, FirstNewBlock->getInstList(), II->getIterator(), I); + } } SmallVector VarArgsToForward; @@ -2224,6 +2258,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; BranchInst *CreatedBranchToNormalDest = nullptr; + if (InvokeInst *II = dyn_cast(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp index 380f4fca54d9ed..a9ac90d6e391da 100644 --- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp +++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp @@ -522,6 +522,12 @@ static bool simplifyOneLoop(Loop *L, SmallVectorImpl &Worklist, if (Preheader) Changed = true; } + // Ensure that the preheader is not terminated by a sync. + if (Preheader && isa(Preheader->getTerminator())) { + DEBUG(dbgs() << "LoopSimplify: Splitting sync-terminated preheader.\n"); + SplitEdge(Preheader, L->getHeader(), DT, LI); + Preheader = L->getLoopPreheader(); + } // Next, check to make sure that all exit nodes of the loop only have // predecessors that are inside of the loop. This check guarantees that the diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp index da7ed2bd165268..f3feb40ac97e08 100644 --- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -154,6 +154,15 @@ BasicBlock *llvm::foldBlockIntoPredecessor(BasicBlock *BB, LoopInfo *LI, return OnlyPred; } +//! Identify if a loop could be a cilk for loop and thus diasble unrolling +bool isCilkFor(Loop* L) { + //TODO use a more precise detection of cilk for loops + for (BasicBlock* BB : L->blocks()) + if (dyn_cast(BB->getTerminator())) + return true; + return false; +} + /// Check if unrolling created a situation where we need to insert phi nodes to /// preserve LCSSA form. /// \param Blocks is a vector of basic blocks representing unrolled loop. @@ -411,6 +420,7 @@ LoopUnrollResult llvm::UnrollLoop( // Are we eliminating the loop control altogether? bool CompletelyUnroll = Count == TripCount; + if (isCilkFor(L) && !CompletelyUnroll) return false; SmallVector ExitBlocks; L->getExitBlocks(ExitBlocks); std::vector OriginalLoopBlocks = L->getBlocks(); diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 112e80d27e345d..240e92b81d1873 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -613,6 +613,67 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr, } } +/// Returns true if the instruction in a loop is guaranteed to execute at least +/// once. +bool llvm::isGuaranteedToExecute(const Instruction &Inst, + const DominatorTree *DT, const Loop *CurLoop, + const LoopSafetyInfo *SafetyInfo) { + // We have to check to make sure that the instruction dominates all + // of the exit blocks. If it doesn't, then there is a path out of the loop + // which does not execute this instruction, so we can't hoist it. 
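+  // In Tapir CFGs there is one extra case, handled below: an instruction in a
+  // detached sub-CFG is still guaranteed to execute if it dominates a
+  // reattach whose continuation dominates every exit block of the loop.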
+ + // If the instruction is in the header block for the loop (which is very + // common), it is always guaranteed to dominate the exit blocks. Since this + // is a common case, and can save some work, check it now. + if (Inst.getParent() == CurLoop->getHeader()) + // If there's a throw in the header block, we can't guarantee we'll reach + // Inst. + return !SafetyInfo->HeaderMayThrow; + + // Somewhere in this loop there is an instruction which may throw and make us + // exit the loop. + if (SafetyInfo->MayThrow) + return false; + + // Get the exit blocks for the current loop. + SmallVector ExitBlocks; + CurLoop->getExitBlocks(ExitBlocks); + + // Verify that the block dominates each of the exit blocks of the loop. + for (unsigned i=0,e=ExitBlocks.size(); idominates(Inst.getParent(), ExitBlocks[i])) { + bool valid = false; + for( BasicBlock* b : CurLoop->getBlocks() ) { + if( auto RE = dyn_cast(b->getTerminator()) ) { + if( b == Inst.getParent() || DT->dominates(Inst.getParent(), b) ) { + bool tv = true; + for(unsigned i2=0; i2!=e; ++i2){ + if( !DT->dominates( RE->getSuccessor(0), ExitBlocks[i2] ) ) { + tv = false; break; + } + } + if( tv ) { + valid = true; + break; + } + } + } + } + if (valid) continue; + return false; + } + + // As a degenerate case, if the loop is statically infinite then we haven't + // proven anything since there are no exit blocks. + if (ExitBlocks.empty()) + return false; + + // FIXME: In general, we have to prove that the loop isn't an infinite loop. + // See http::llvm.org/PR24078 . (The "ExitBlocks.empty()" check above is + // just a special case of this.) + return true; +} + Optional llvm::getLoopEstimatedTripCount(Loop *L) { // Only support loops with a unique exiting block, and a latch. if (!L->getExitingBlock()) diff --git a/llvm/lib/Transforms/Utils/Mem2Reg.cpp b/llvm/lib/Transforms/Utils/Mem2Reg.cpp index 23145e5847512a..269d9a18d12efa 100644 --- a/llvm/lib/Transforms/Utils/Mem2Reg.cpp +++ b/llvm/lib/Transforms/Utils/Mem2Reg.cpp @@ -35,18 +35,33 @@ STATISTIC(NumPromoted, "Number of alloca's promoted"); static bool promoteMemoryToRegister(Function &F, DominatorTree &DT, AssumptionCache &AC) { std::vector Allocas; - BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function bool Changed = false; + // Scan the function to get its entry block and all entry blocks of detached + // CFG's. We can perform this scan for entry blocks once for the function, + // because this pass preserves the CFG. + SmallVector EntryBlocks; + bool FunctionContainsDetach = false; + EntryBlocks.push_back(&F.getEntryBlock()); + for (BasicBlock &BB : F) + if (BasicBlock *Pred = BB.getUniquePredecessor()) + if (DetachInst *DI = dyn_cast(Pred->getTerminator())) { + FunctionContainsDetach = true; + if (DI->getDetached() == &BB) + EntryBlocks.push_back(&BB); + } + while (true) { Allocas.clear(); // Find allocas that are safe to promote, by looking at all instructions in // the entry node - for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I) - if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? - if (isAllocaPromotable(AI)) - Allocas.push_back(AI); + for (BasicBlock *BB : EntryBlocks) + for (BasicBlock::iterator I = BB->begin(), E = --BB->end(); I != E; ++I) + if (AllocaInst *AI = dyn_cast(I)) // Is it an alloca? 
+ if (isAllocaPromotable(AI) && + (!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT))) + Allocas.push_back(AI); if (Allocas.empty()) break; diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp index ae5e72ea4d30f3..87aafa83ecfcab 100644 --- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp +++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp @@ -127,6 +127,24 @@ void llvm::appendToCompilerUsed(Module &M, ArrayRef Values) { appendToUsedList(M, "llvm.compiler.used", Values); } +Function *llvm::checkCsiInterfaceFunction(Constant *FuncOrBitcast) { + if (Function *F = dyn_cast(FuncOrBitcast)) { + return F; + } + if (ConstantExpr *CE = dyn_cast(FuncOrBitcast)) { + if (CE->isCast() && CE->getOpcode() == Instruction::BitCast) { + if (Function *F = dyn_cast(CE->getOperand(0))) { + return F; + } + } + } + FuncOrBitcast->print(errs()); + std::string Err; + raw_string_ostream Stream(Err); + Stream << "ComprehensiveStaticInstrumentation interface function redefined: " << *FuncOrBitcast; + report_fatal_error(Err); +} + Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) { if (isa(FuncOrBitcast)) return cast(FuncOrBitcast); diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 91e4f4254b3e76..7e87fce8edf218 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -61,6 +61,7 @@ STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block"); STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store"); STATISTIC(NumDeadAlloca, "Number of dead alloca's removed"); STATISTIC(NumPHIInsert, "Number of PHI nodes inserted"); +STATISTIC(NumAllocaWithDetachedUses, "Number of alloca's with detached uses"); bool llvm::isAllocaPromotable(const AllocaInst *AI) { // FIXME: If the memory unit is of pointer or integer type, we can permit @@ -143,13 +144,12 @@ struct AllocaInfo { DefiningBlocks.push_back(SI->getParent()); AllocaPointerVal = SI->getOperand(0); OnlyStore = SI; - } else { - LoadInst *LI = cast(User); + } else if (LoadInst *LI = dyn_cast(User)) { // Otherwise it must be a load instruction, keep track of variable // reads. UsingBlocks.push_back(LI->getParent()); AllocaPointerVal = LI; - } + } else continue; if (OnlyUsedInOneBlock) { if (!OnlyBlock) @@ -556,10 +556,18 @@ void PromoteMem2Reg::run() { LargeBlockInfo LBI; ForwardIDFCalculator IDF(DT); + bool FunctionContainsDetach = false; + { + for (BasicBlock &BB : F) + FunctionContainsDetach |= isa(BB.getTerminator()); + } + for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) { AllocaInst *AI = Allocas[AllocaNum]; assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!"); + assert((!FunctionContainsDetach || isAllocaParallelPromotable(AI, DT)) && + "Cannot promote non-promotable alloca in function with detach!"); assert(AI->getParent()->getParent() == &F && "All allocas should be in the same function, which is same as DF!"); @@ -607,17 +615,8 @@ void PromoteMem2Reg::run() { BBNumbers[&BB] = ID++; } - // Remember the dbg.declare intrinsic describing this alloca, if any. - if (!Info.DbgDeclares.empty()) - AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares; - - // Keep the reverse mapping of the 'Allocas' array for the rename pass. 
- AllocaLookup[Allocas[AllocaNum]] = AllocaNum; - - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need PHI - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. // Unique the set of defining blocks for efficient lookup. SmallPtrSet DefBlocks; @@ -628,14 +627,44 @@ void PromoteMem2Reg::run() { SmallPtrSet LiveInBlocks; ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); - // At this point, we're committed to promoting the alloca using IDF's, and - // the standard SSA construction algorithm. Determine which blocks need phi - // nodes and see if we can optimize out some work by avoiding insertion of - // dead phi nodes. + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. IDF.setLiveInBlocks(LiveInBlocks); IDF.setDefiningBlocks(DefBlocks); SmallVector PHIBlocks; IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached + // predecessor. Because register state is not preserved across a reattach, + // these alloca's cannot be promoted. + bool DetachedPred = false; + for (unsigned i = 0, e = PHIBlocks.size(); i != e && !DetachedPred; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E && !DetachedPred; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) { + DEBUG(dbgs() << "Alloca " << *AI << " has use reattached from " << + P->getName() << "\n"); + DetachedPred = true; + } + } + } + if (DetachedPred) { + RemoveFromAllocasList(AllocaNum); + ++NumAllocaWithDetachedUses; + continue; + } + + // Remember the dbg.declare intrinsic describing this alloca, if any. + if (!Info.DbgDeclares.empty()) + AllocaDbgDeclares[AllocaNum] = Info.DbgDeclares; + + // Keep the reverse mapping of the 'Allocas' array for the rename pass. + AllocaLookup[Allocas[AllocaNum]] = AllocaNum; + + // At this point, we're committed to promoting the alloca using IDF's, and + // the standard SSA construction algorithm. if (PHIBlocks.size() > 1) llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) { return BBNumbers.lookup(A) < BBNumbers.lookup(B); @@ -791,7 +820,7 @@ void PromoteMem2Reg::run() { /// These are blocks which lead to uses. Knowing this allows us to avoid /// inserting PHI nodes into blocks which don't lead to uses (thus, the /// inserted phi nodes would be dead). -void PromoteMem2Reg::ComputeLiveInBlocks( +static void ExternComputeLiveInBlocks( AllocaInst *AI, AllocaInfo &Info, const SmallPtrSetImpl &DefBlocks, SmallPtrSetImpl &LiveInBlocks) { @@ -860,6 +889,62 @@ void PromoteMem2Reg::ComputeLiveInBlocks( } } +void PromoteMem2Reg::ComputeLiveInBlocks( + AllocaInst *AI, AllocaInfo &Info, + const SmallPtrSetImpl &DefBlocks, + SmallPtrSetImpl &LiveInBlocks) { + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); +} + +// \brief Augmentation is isAllocaPromotable to handle detach and reattach. +// +// TODO: Replace the implementation of this method to use an analysis of +// parallel regions. +bool llvm::isAllocaParallelPromotable(const AllocaInst *AIP, + DominatorTree &DT) { + AllocaInst* AI = const_cast(AIP); + AllocaInfo Info; + LargeBlockInfo LBI; + ForwardIDFCalculator IDF(DT); + + // Calculate the set of read and write-locations for each alloca. 
This is + // analogous to finding the 'uses' and 'definitions' of each variable. + Info.AnalyzeAlloca(AI); + + if (Info.OnlyUsedInOneBlock) return true; + + // Unique the set of defining blocks for efficient lookup. + SmallPtrSet DefBlocks; + DefBlocks.insert(Info.DefiningBlocks.begin(), Info.DefiningBlocks.end()); + + // Determine which blocks the value is live in. These are blocks which lead + // to uses. + SmallPtrSet LiveInBlocks; + ExternComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks); + + // Determine which blocks need PHI nodes and see if we can optimize out some + // work by avoiding insertion of dead phi nodes. + IDF.setLiveInBlocks(LiveInBlocks); + IDF.setDefiningBlocks(DefBlocks); + SmallVector PHIBlocks; + IDF.calculate(PHIBlocks); + + // Determine which PHI nodes want to use a value from a detached predecessor. + // Because register state is not preserved across a reattach, these alloca's + // cannot be promoted. + for (unsigned i = 0, e = PHIBlocks.size(); i != e; ++i) { + BasicBlock *BB = PHIBlocks[i]; + for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); + PI != E; ++PI) { + BasicBlock *P = *PI; + if (isa(P->getTerminator())) + return false; + } + } + + return true; +} + /// Queue a phi-node to be added to a basic-block for a specific Alloca. /// /// Returns true if there wasn't already a phi-node for that variable diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 9e5fb0e7172d4d..c9dced38c694f2 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -44,11 +44,18 @@ static AvailableValsTy &getAvailableVals(void *AV) { return *static_cast(AV); } +typedef DenseMap ValIsDetachedTy; +static ValIsDetachedTy &getValIsDetached(void *VID) { + return *static_cast(VID); +} + SSAUpdater::SSAUpdater(SmallVectorImpl *NewPHI) : InsertedPHIs(NewPHI) {} SSAUpdater::~SSAUpdater() { delete static_cast(AV); + if (VID) + delete static_cast(VID); } void SSAUpdater::Initialize(Type *Ty, StringRef Name) { @@ -56,6 +63,10 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) { AV = new AvailableValsTy(); else getAvailableVals(AV).clear(); + if (!VID) + VID = new ValIsDetachedTy(); + else + getValIsDetached(VID).clear(); ProtoType = Ty; ProtoName = Name; } @@ -107,6 +118,7 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { // predecessor. SmallVector, 8> PredValues; Value *SingularValue = nullptr; + BasicBlock *DetachPred = nullptr, *ReattachPred = nullptr; // We can get our predecessor info by walking the pred_iterator list, but it // is relatively slow. If we already have PHI nodes in this block, walk one @@ -115,6 +127,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) { BasicBlock *PredBB = SomePhi->getIncomingBlock(i); Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. 
@@ -128,6 +146,12 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) { BasicBlock *PredBB = *PI; Value *PredVal = GetValueAtEndOfBlock(PredBB); + if (isa(PredBB->getTerminator())) { + ReattachPred = PredBB; + continue; + } + if (isa(PredBB->getTerminator())) + DetachPred = PredBB; PredValues.push_back(std::make_pair(PredBB, PredVal)); // Compute SingularValue. @@ -138,6 +162,18 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { SingularValue = nullptr; } } + // Record any values we discover whose definitions occur in detached blocks. + if (ReattachPred) { + assert(DetachPred && + "Reattached predecessor of a block with no detached predecessor."); + Value *DetachVal = GetValueAtEndOfBlock(DetachPred); + PredValues.push_back(std::make_pair(ReattachPred, DetachVal)); + Value *ReattachVal = GetValueAtEndOfBlock(ReattachPred); + if (ReattachVal != DetachVal) { + SingularValue = nullptr; + getValIsDetached(VID)[BB] = true; + } + } // If there are no predecessors, just return undef. if (PredValues.empty()) @@ -187,6 +223,10 @@ Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) { return InsertedPHI; } +bool SSAUpdater::GetValueIsDetachedInBlock(BasicBlock *BB) { + return getValIsDetached(VID)[BB]; +} + void SSAUpdater::RewriteUse(Use &U) { Instruction *User = cast(U.getUser()); @@ -274,6 +314,18 @@ class SSAUpdaterTraits { return UndefValue::get(Updater->ProtoType); } + /// BlockReattaches - Return true if this block is terminated with a + /// reattach, false otherwise. + static bool BlockReattaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + + /// BlockDetaches - Return true if this block is terminated with a + /// detach, false otherwise. + static bool BlockDetaches(BasicBlock *BB, SSAUpdater *Updater) { + return isa(BB->getTerminator()); + } + /// CreateEmptyPHI - Create a new PHI instruction in the specified block. /// Reserve space for the operands but do not fill them in yet. static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds, @@ -326,7 +378,8 @@ Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) { if (Value *V = AvailableVals[BB]) return V; - SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs); + SSAUpdaterImpl Impl(this, &AvailableVals, InsertedPHIs, + &getValIsDetached(VID)); return Impl.GetValue(BB); } @@ -448,7 +501,14 @@ run(const SmallVectorImpl &Insts) const { // Okay, now we rewrite all loads that use live-in values in the loop, // inserting PHI nodes as necessary. for (LoadInst *ALoad : LiveInLoads) { - Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent()); + BasicBlock *BB = ALoad->getParent(); + Value *NewVal = SSA.GetValueInMiddleOfBlock(BB); + + // Skip loads whose definitions are detached. + if (Instruction *Def = dyn_cast(NewVal)) + if (SSA.GetValueIsDetachedInBlock(Def->getParent())) + continue; + replaceLoadWithValue(ALoad, NewVal); // Avoid assertions in unreachable code. @@ -463,6 +523,8 @@ run(const SmallVectorImpl &Insts) const { // Now that everything is rewritten, delete the old instructions from the // function. They should all be dead now. for (Instruction *User : Insts) { + if (isa(User) && !User->use_empty()) continue; + // If this is a load that still has uses, then the load must have been added // as a live value in the SSAUpdate data structure for a block (e.g. because // the loaded value was stored later).
In this case, we need to recursively diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 03b73954321d86..7480b94e34ab61 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -66,6 +66,8 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/TapirUtils.h" #include "llvm/Transforms/Utils/ValueMapper.h" #include #include @@ -5751,6 +5753,14 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI, return false; } +static bool BlockIsEntryOfDetachedCtx(const BasicBlock *BB) { + if (const BasicBlock *PredBB = BB->getSinglePredecessor()) + if (const DetachInst *DI = dyn_cast(PredBB->getTerminator())) + if (DI->getDetached() == BB) + return true; + return false; +} + bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder) { BasicBlock *BB = BI->getParent(); @@ -5769,6 +5779,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, (LoopHeaders->count(BB) || LoopHeaders->count(Succ))); BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator(); if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() && + !BlockIsEntryOfDetachedCtx(BB) && !NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB)) return true; @@ -5993,6 +6004,139 @@ static bool removeUndefIntroducingPredecessor(BasicBlock *BB) { return false; } +/// If BB immediately syncs and BB's predecessor detaches, serialize +/// the sync and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Return +/// false if BB does not begin with a sync. +static bool serializeDetachToImmediateSync(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHIOrDbgOrLifetime(); + if (isa(I)) { + // This block is empty + bool Changed = false; + // Collect the detach and reattach predecessors. + SmallSet DetachPreds; + SmallVector ReattachPreds; + for (BasicBlock *PredBB : predecessors(BB)) { + if (DetachInst *DI = dyn_cast(PredBB->getTerminator())) + DetachPreds.insert(DI); + + if (ReattachInst *RI = dyn_cast(PredBB->getTerminator())) + ReattachPreds.push_back(RI); + } + Value *SyncRegion = cast(I)->getSyncRegion(); + for (DetachInst *DI : DetachPreds) { + BasicBlock *Detached = DI->getDetached(); + + // Replace the detach with a branch to the detached block. + BB->removePredecessor(DI->getParent()); + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + + // Move static alloca instructions in the detached block to the + // appropriate entry block. + MoveStaticAllocasInBlock(cast(SyncRegion)->getParent(), + Detached, ReattachPreds); + // We should not need to add new llvm.stacksave/llvm.stackrestore + // intrinsics, because we're not introducing new allocas into a loop. + Changed = true; + } + for (Instruction *RI : ReattachPreds) { + // Replace the reattach with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(BB)); + Changed = true; + } + return Changed; + } + return false; +} + +/// If BB immediately reattaches and BB's predecessor detaches, +/// serialize the reattach and detach. This will allow normal serial +/// optimization passes to remove the blocks appropriately. Return +/// false if BB does not immediately reattach, or if some predecessor of +/// BB does not terminate with a detach.
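+/// For example, a detached block whose only non-PHI instruction is a reattach +/// back to the continuation performs no work, so replacing its detach and +/// reattach with unconditional branches leaves equivalent serial control flow.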
+static bool serializeTrivialDetachedBlock(BasicBlock *BB) { + Instruction *I = BB->getFirstNonPHI(); + if (ReattachInst *RI = dyn_cast(I)) { + // This detached block is empty + // Scan predecessors to verify that all of them detach BB. + for (BasicBlock *PredBB : predecessors(BB)) { + if (!isa(PredBB->getTerminator())) + return false; + } + // All predecessors detach BB, so we can serialize + for (BasicBlock *PredBB : predecessors(BB)) { + DetachInst *DI = dyn_cast(PredBB->getTerminator()); + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continue = DI->getContinue(); + assert(RI->getSuccessor(0) == Continue && + "Reattach destination does not match continue block of associated detach."); + // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(PredBB); + // Serialize the detach: replace it with an unconditional branch. + ReplaceInstWithInst(DI, BranchInst::Create(Detached)); + } + // Serialize the reattach: replace it with an unconditional branch. + ReplaceInstWithInst(RI, BranchInst::Create(RI->getSuccessor(0))); + return true; + } + return false; +} + +/// If BB detaches an CFG that cannot reach the continuation, serialize the +/// detach. Assuming the CFG is valid, this scenario arises when the detached +/// CFG is terminated by unreachable instructions. +static bool serializeDetachOfUnreachable(BasicBlock *BB) { + // This method assumes that the detached CFG is valid. + Instruction *I = BB->getTerminator(); + if (DetachInst *DI = dyn_cast(I)) { + // Check if continuation of the detach is not reached by reattach + // instructions. If the detached CFG is valid, then the detached CFG must + // be terminated by unreachable instructions. + BasicBlock *Continue = DI->getContinue(); + for (BasicBlock *PredBB : predecessors(Continue)) + if (isa(PredBB->getTerminator())) + return false; + // TODO: Add stronger checks to make sure the detached CFG is valid. + // Remove the predecessor through the detach from the continue + // block. + Continue->removePredecessor(BB); + // Replace the detach with a branch to the detached block. + ReplaceInstWithInst(DI, BranchInst::Create(DI->getDetached())); + return true; + } + return false; +} + +// Remove any syncs whose sync region is empty, meaning that the region contains +// no detach instructions. These sync instructions don't synchronize anything, +// so they can be removed. +static bool removeEmptySyncs(BasicBlock *BB) { + if (SyncInst *SI = dyn_cast(BB->getTerminator())) { + // Get the sync region containing this sync + Value *SyncRegion = SI->getSyncRegion(); + bool SyncRegionIsEmpty = true; + SmallVector Syncs; + // Scan the Tapir instructions in this sync region. + for (User *U : SyncRegion->users()) { + // If the sync region contains a detach or a reattach, then it's not + // empty. + if (isa(U) || isa(U)) + SyncRegionIsEmpty = false; + // Collect the syncs in this region. + else if (isa(U)) + Syncs.push_back(cast(U)); + } + // If the sync region is empty, then remove all sync instructions in it. + if (SyncRegionIsEmpty) { + for (SyncInst *Sync : Syncs) + ReplaceInstWithInst(Sync, BranchInst::Create(Sync->getSuccessor(0))); + return true; + } + } + return false; +} + bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { bool Changed = false; @@ -6018,6 +6162,14 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) { // Check for and remove branches that will always cause undefined behavior. Changed |= removeUndefIntroducingPredecessor(BB); + // Check for and remove trivial detached blocks. 
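+ // Each of these helpers rewrites detach, reattach, or sync terminators into + // plain branches when the parallelism they express is vacuous, and each + // returns true only if it changed the CFG.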
+ Changed |= serializeTrivialDetachedBlock(BB); + Changed |= serializeDetachToImmediateSync(BB); + Changed |= serializeDetachOfUnreachable(BB); + + // Check for and remove sync instructions in empty sync regions. + Changed |= removeEmptySyncs(BB); + // Merge basic blocks into their predecessor if there is only one distinct // pred, and if there is only one distinct successor of the predecessor, and // if there are no PHI nodes. diff --git a/llvm/lib/Transforms/Utils/TapirUtils.cpp b/llvm/lib/Transforms/Utils/TapirUtils.cpp new file mode 100644 index 00000000000000..cba2f39411076d --- /dev/null +++ b/llvm/lib/Transforms/Utils/TapirUtils.cpp @@ -0,0 +1,318 @@ +//===-- TapirUtils.cpp - Utility methods for Tapir -------------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file utility methods for handling code containing Tapir instructions. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Utils/TapirUtils.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/Transforms/Utils/Local.h" + +using namespace llvm; + +#define DEBUG_TYPE "tapirutils" + +/// Return the result of AI->isStaticAlloca() if AI were moved to the entry +/// block. Allocas used in inalloca calls and allocas of dynamic array size +/// cannot be static. +/// (Borrowed from Transforms/Utils/InlineFunction.cpp) +static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) { + return isa(AI->getArraySize()) && !AI->isUsedWithInAlloca(); +} + +// Check whether this Value is used by a lifetime intrinsic. +static bool isUsedByLifetimeMarker(Value *V) { + for (User *U : V->users()) { + if (IntrinsicInst *II = dyn_cast(U)) { + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::lifetime_start: + case Intrinsic::lifetime_end: + return true; + } + } + } + return false; +} + +// Check whether the given alloca already has +// lifetime.start or lifetime.end intrinsics. +static bool hasLifetimeMarkers(AllocaInst *AI) { + Type *Ty = AI->getType(); + Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(), + Ty->getPointerAddressSpace()); + if (Ty == Int8PtrTy) + return isUsedByLifetimeMarker(AI); + + // Do a scan to find all the casts to i8*. + for (User *U : AI->users()) { + if (U->getType() != Int8PtrTy) continue; + if (U->stripPointerCasts() != AI) continue; + if (isUsedByLifetimeMarker(U)) + return true; + } + return false; +} + +// Move static allocas in a cloned block into the entry block of helper. Leave +// lifetime markers behind for those static allocas. Returns true if the cloned +// block still contains dynamic allocas, which cannot be moved. +bool llvm::MoveStaticAllocasInBlock( + BasicBlock *Entry, + BasicBlock *Block, + SmallVectorImpl &ExitPoints) { + Function *F = Entry->getParent(); + SmallVector StaticAllocas; + bool ContainsDynamicAllocas = false; + BasicBlock::iterator InsertPoint = Entry->begin(); + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + AllocaInst *AI = dyn_cast(I++); + if (!AI) continue; + + if (!allocaWouldBeStaticInEntry(AI)) { + ContainsDynamicAllocas = true; + continue; + } + + StaticAllocas.push_back(AI); + + // Scan for the block of allocas that we can move over, and move them + // all at once. 
+ while (isa(I) && + allocaWouldBeStaticInEntry(cast(I))) { + StaticAllocas.push_back(cast(I)); + ++I; + } + + // Transfer all of the allocas over in a block. Using splice means + // that the instructions aren't removed from the symbol table, then + // reinserted. + Entry->getInstList().splice( + InsertPoint, Block->getInstList(), AI->getIterator(), I); + } + // Move any dbg.declares describing the allocas into the entry basic block. + DIBuilder DIB(*F->getParent()); + for (auto &AI : StaticAllocas) + replaceDbgDeclareForAlloca(AI, AI, DIB, /*Deref=*/false); + + // Move any syncregion_start's into the entry basic block. + for (BasicBlock::iterator I = Block->begin(), + E = Block->end(); I != E; ) { + IntrinsicInst *II = dyn_cast(I++); + if (!II) continue; + if (Intrinsic::syncregion_start != II->getIntrinsicID()) + continue; + + while (isa(I) && + Intrinsic::syncregion_start == + cast(I)->getIntrinsicID()) + ++I; + + Entry->getInstList().splice( + InsertPoint, Block->getInstList(), II->getIterator(), I); + } + + // Leave lifetime markers for the static alloca's, scoping them to the + // from cloned block to cloned exit. + if (!StaticAllocas.empty()) { + IRBuilder<> Builder(&Block->front()); + for (unsigned ai = 0, ae = StaticAllocas.size(); ai != ae; ++ai) { + AllocaInst *AI = StaticAllocas[ai]; + // Don't mark swifterror allocas. They can't have bitcast uses. + if (AI->isSwiftError()) + continue; + + // If the alloca is already scoped to something smaller than the whole + // function then there's no need to add redundant, less accurate markers. + if (hasLifetimeMarkers(AI)) + continue; + + // Try to determine the size of the allocation. + ConstantInt *AllocaSize = nullptr; + if (ConstantInt *AIArraySize = + dyn_cast(AI->getArraySize())) { + auto &DL = F->getParent()->getDataLayout(); + Type *AllocaType = AI->getAllocatedType(); + uint64_t AllocaTypeSize = DL.getTypeAllocSize(AllocaType); + uint64_t AllocaArraySize = AIArraySize->getLimitedValue(); + + // Don't add markers for zero-sized allocas. + if (AllocaArraySize == 0) + continue; + + // Check that array size doesn't saturate uint64_t and doesn't + // overflow when it's multiplied by type size. + if (AllocaArraySize != ~0ULL && + UINT64_MAX / AllocaArraySize >= AllocaTypeSize) { + AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()), + AllocaArraySize * AllocaTypeSize); + } + } + + Builder.CreateLifetimeStart(AI, AllocaSize); + for (Instruction *ExitPoint : ExitPoints) { + IRBuilder<>(ExitPoint).CreateLifetimeEnd(AI, AllocaSize); + } + } + } + + return ContainsDynamicAllocas; +} + + +/// SerializeDetachedCFG - Serialize the sub-CFG detached by the +/// specified detach instruction. Removes the detach instruction and +/// returns a pointer to the branch instruction that replaces it. +/// +BranchInst *llvm::SerializeDetachedCFG(DetachInst *DI, DominatorTree *DT) { + // Get the parent of the detach instruction. + BasicBlock *Detacher = DI->getParent(); + // Get the detached block and continuation of this detach. + BasicBlock *Detached = DI->getDetached(); + BasicBlock *Continuation = DI->getContinue(); + + assert(Detached->getSinglePredecessor() && + "Detached block has multiple predecessors."); + + // Get the detach edge from DI. + BasicBlockEdge DetachEdge(Detacher, Detached); + + // Collect the reattaches into the continuation. If DT is + // available, verify that all reattaches are dominated by the detach + // edge from DI. 
+ SmallVector Reattaches; + // If we only find a single reattach into the continuation, capture + // it so we can later update the dominator tree. + BasicBlock *SingleReattacher = nullptr; + int ReattachesFound = 0; + for (auto PI = pred_begin(Continuation), PE = pred_end(Continuation); + PI != PE; PI++) { + BasicBlock *Pred = *PI; + // Skip the detacher. + if (Detacher == Pred) continue; + // Record the reattaches found. + if (isa(Pred->getTerminator())) { + ReattachesFound++; + if (!SingleReattacher) + SingleReattacher = Pred; + if (DT) { + assert(DT->dominates(DetachEdge, Pred) && + "Detach edge does not dominate a reattach into its continuation."); + } + Reattaches.push_back(cast(Pred->getTerminator())); + } + } + // TODO: It's possible to detach a CFG that does not terminate with a + // reattach. For example, optimizations can create detached CFG's that are + // terminated by unreachable terminators only. Some of these special cases + // lead to problems with other passes, however, and this check will identify + // those special cases early while we sort out those issues. + assert(!Reattaches.empty() && "No reattach found for detach."); + + // Replace each reattach with branches to the continuation. + for (ReattachInst *RI : Reattaches) { + BranchInst *ReplacementBr = BranchInst::Create(Continuation, RI); + ReplacementBr->setDebugLoc(RI->getDebugLoc()); + RI->eraseFromParent(); + } + + // Replace the new detach with a branch to the detached CFG. + BranchInst *ReplacementBr = BranchInst::Create(Detached, DI); + ReplacementBr->setDebugLoc(DI->getDebugLoc()); + DI->eraseFromParent(); + + // Update the dominator tree. + if (DT) + if (DT->dominates(Detacher, Continuation) && 1 == ReattachesFound) + DT->changeImmediateDominator(Continuation, SingleReattacher); + + return ReplacementBr; +} + +/// GetDetachedCtx - Get the entry basic block to the detached context +/// that contains the specified block. +/// +BasicBlock *llvm::GetDetachedCtx(BasicBlock *BB) { + return const_cast( + GetDetachedCtx(const_cast(BB))); +} + +const BasicBlock *llvm::GetDetachedCtx(const BasicBlock *BB) { + // Traverse the CFG backwards until we either reach the entry block + // of the function or we find a detach instruction that detaches the + // current block. + SmallPtrSet Visited; + SmallVector WorkList; + WorkList.push_back(BB); + while (!WorkList.empty()) { + const BasicBlock *CurrBB = WorkList.pop_back_val(); + if (!Visited.insert(CurrBB).second) + continue; + + for (auto PI = pred_begin(CurrBB), PE = pred_end(CurrBB); + PI != PE; ++PI) { + const BasicBlock *PredBB = *PI; + + // Skip predecessors via reattach instructions. The detacher + // block corresponding to this reattach is also a predecessor of + // the current basic block. + if (isa(PredBB->getTerminator())) + continue; + + // If the predecessor is terminated by a detach, check to see if + // that detach detached the current basic block. + if (isa(PredBB->getTerminator())) { + const DetachInst *DI = cast(PredBB->getTerminator()); + if (DI->getDetached() == CurrBB) + // Return the current block, which is the entry of this detached + // sub-CFG. + return CurrBB; + } + + // Otherwise, add the predecessor block to the work list to + // search. + WorkList.push_back(PredBB); + } + } + + // Our search didn't find anything, so return the entry of the + // function containing the given block. + return &(BB->getParent()->getEntryBlock()); +} + +/// isCriticalContinueEdge - Return true if the specified edge is a critical +/// detach-continue edge. 
Critical detach-continue edges are critical edges - +/// from a block with multiple successors to a block with multiple predecessors +/// - even after ignoring all reattach edges. +bool llvm::isCriticalContinueEdge(const TerminatorInst *TI, unsigned SuccNum) { + assert(SuccNum < TI->getNumSuccessors() && "Illegal edge specification!"); + if (TI->getNumSuccessors() == 1) return false; + + // Edge must come from a detach. + if (!isa(TI)) return false; + // Edge must go to the continuation. + if (SuccNum != 1) return false; + + const BasicBlock *Dest = TI->getSuccessor(SuccNum); + const_pred_iterator I = pred_begin(Dest), E = pred_end(Dest); + + // If there is more than one predecessor, this is a critical edge... + assert(I != E && "No preds, but we have an edge to the block?"); + const BasicBlock *DetachPred = TI->getParent(); + for (; I != E; ++I) { + if (DetachPred == *I) continue; + if (isa((*I)->getTerminator())) continue; + return true; + } + return false; +} diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c45dee590b8452..3d7800dd9b82b0 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -2741,6 +2741,15 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { assert(VectorPH && "Invalid loop structure"); assert(ExitBlock && "Must have an exit block"); + BasicBlock *sync_split = nullptr; + if (isa(VectorPH->getTerminator())) { + sync_split = VectorPH->splitBasicBlockWithTerminator("vector.sync_split"); + DT->splitBlock(sync_split); + //DT->changeImmediateDominator(LoopExitBlock, LoopBypassBlocks[0]); + DT->verifyDomTree(); + VectorPH = sync_split; + } + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. 
In the code below we also support a case where we @@ -2773,6 +2782,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() { ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, *LI); ParentLoop->addBasicBlockToLoop(MiddleBlock, *LI); + if (sync_split) ParentLoop->addBasicBlockToLoop(sync_split, *LI); } else { LI->addTopLevelLoop(Lp); } diff --git a/llvm/microbenchmarks/everything/everything.c b/llvm/microbenchmarks/everything/everything.c new file mode 100644 index 00000000000000..d2dd0aa96e5f2c --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.c @@ -0,0 +1,32 @@ +#include +#include + +int foo() { + return 10; +} + +int bar(); + +int main() { + double c = foo(); + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + cilk_spawn { + cilk_spawn { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + cilk_spawn { + if (c) { + c += sin(c); + c += sin(c); + c += sin(c); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/everything.ll b/llvm/microbenchmarks/everything/everything.ll new file mode 100644 index 00000000000000..249549a7131cd5 --- /dev/null +++ b/llvm/microbenchmarks/everything/everything.ll @@ -0,0 +1,118 @@ +; ModuleID = 'everything.c' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @foo() #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca double, align 8 + store i32 0, i32* %retval, align 4 + %call = call i32 @foo() + %conv = sitofp i32 %call to double + store double %conv, double* %c, align 8 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %1 = load double, double* %c, align 8 + %call1 = call double @sin(double %1) #2 + %2 = load double, double* %c, align 8 + %add = fadd double %2, %call1 + store double %add, double* %c, align 8 + %3 = load double, double* %c, align 8 + %call2 = call double @sin(double %3) #2 + %4 = load double, double* %c, align 8 + %add3 = fadd double %4, %call2 + store double %add3, double* %c, align 8 + %5 = load double, double* %c, align 8 + %call4 = call double @sin(double %5) #2 + %6 = load double, double* %c, align 8 + %add5 = fadd double %6, %call4 + store double %add5, double* %c, align 8 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd6, label %det.cont15 + +det.achd6: ; preds = %det.cont + %7 = bitcast i32 undef to i32 + detach label %det.achd7, label %det.cont14 + +det.achd7: ; preds = %det.achd6 + %8 = bitcast i32 undef to i32 + %9 = load double, double* %c, align 8 + %call8 = call double @sin(double %9) #2 + %10 = load double, double* %c, align 8 + %add9 = fadd double %10, %call8 + store double %add9, double* %c, align 8 + %11 = load double, double* %c, align 8 + %call10 = call double @sin(double %11) #2 + %12 = load double, double* %c, align 8 + %add11 = fadd double %12, %call10 + store double %add11, double* %c, align 8 + %13 = load double, double* %c, align 8 + %call12 = call double @sin(double %13) #2 + %14 = load double, double* %c, align 8 + %add13 = fadd double %14, %call12 + store double %add13, double* %c, align 8 + reattach label %det.cont14 + +det.cont14: ; preds = %det.achd7, %det.achd6 + reattach label %det.cont15 + +det.cont15: ; preds = %det.cont14, %det.cont + detach label %det.achd16, label 
%det.cont23 + +det.achd16: ; preds = %det.cont15 + %15 = bitcast i32 undef to i32 + %16 = load double, double* %c, align 8 + %tobool = fcmp une double %16, 0.000000e+00 + br i1 %tobool, label %if.then, label %if.end + +if.then: ; preds = %det.achd16 + %17 = load double, double* %c, align 8 + %call17 = call double @sin(double %17) #2 + %18 = load double, double* %c, align 8 + %add18 = fadd double %18, %call17 + store double %add18, double* %c, align 8 + %19 = load double, double* %c, align 8 + %call19 = call double @sin(double %19) #2 + %20 = load double, double* %c, align 8 + %add20 = fadd double %20, %call19 + store double %add20, double* %c, align 8 + %21 = load double, double* %c, align 8 + %call21 = call double @sin(double %21) #2 + %22 = load double, double* %c, align 8 + %add22 = fadd double %22, %call21 + store double %add22, double* %c, align 8 + br label %if.end + +if.end: ; preds = %if.then, %det.achd16 + reattach label %det.cont23 + +det.cont23: ; preds = %if.end, %det.cont15 + %23 = load double, double* %c, align 8 + %conv24 = fptosi double %23 to i32 + ret i32 %conv24 +} + +; Function Attrs: nounwind readnone +declare double @sin(double) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind readnone } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/simple.c b/llvm/microbenchmarks/everything/simple.c new file mode 100644 index 00000000000000..aa4252c4bc3890 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = 0; + for (int i=0; i < 1000; i++) { + cilk_spawn { + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/everything/simple.ll b/llvm/microbenchmarks/everything/simple.ll new file mode 100644 index 00000000000000..268be428dbd3c6 --- /dev/null +++ b/llvm/microbenchmarks/everything/simple.ll @@ -0,0 +1,53 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + store i32 0, i32* %c, align 4 + store 
i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %0 = load i32, i32* %i, align 4 + %cmp = icmp slt i32 %0, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + detach label %det.achd, label %det.cont + +det.achd: ; preds = %for.body + %1 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + reattach label %det.cont + +det.cont: ; preds = %det.achd, %for.body + br label %for.inc + +for.inc: ; preds = %det.cont + %2 = load i32, i32* %i, align 4 + %inc = add nsw i32 %2, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + %3 = load i32, i32* %c, align 4 + ret i32 %3 +} + +declare i32 @foo(...) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/everything/temp.ll b/llvm/microbenchmarks/everything/temp.ll new file mode 100644 index 00000000000000..5d49d66271d392 --- /dev/null +++ b/llvm/microbenchmarks/everything/temp.ll @@ -0,0 +1,24 @@ +; ModuleID = '' +source_filename = "everything.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_foo() local_unnamed_addr #0 { +entry: + ret i32 10 +} + +; Function Attrs: noinline norecurse nounwind readnone ssp uwtable +define i32 @SpawnUnswitch_SmallBlock_RedundantSpawn_main() local_unnamed_addr #0 { +entry: + ret i32 9 +} + +attributes #0 = { noinline norecurse nounwind readnone ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 
8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/redundantspawn/complex.c b/llvm/microbenchmarks/redundantspawn/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_nested.c b/llvm/microbenchmarks/redundantspawn/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/multiple_redundant.c b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c new file mode 100644 index 00000000000000..aa52f045e0be6f --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/multiple_redundant.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/serial.c b/llvm/microbenchmarks/redundantspawn/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/simple_spawn.c b/llvm/microbenchmarks/redundantspawn/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/redundantspawn/single_redundant.c b/llvm/microbenchmarks/redundantspawn/single_redundant.c new file mode 100644 index 00000000000000..33de19ce0f1872 --- /dev/null +++ b/llvm/microbenchmarks/redundantspawn/single_redundant.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.c b/llvm/microbenchmarks/smallblock/conditional.c new file mode 100644 index 00000000000000..058b70da06735f --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.c @@ -0,0 +1,27 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c*2 > 1) { + cilk_spawn { + if (c > 1) { + bar(); + } else { + foo(); + } + } + } else if (c*3 < 1) { + cilk_spawn { + bar(); + } + } else { + cilk_spawn { + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/conditional.ll b/llvm/microbenchmarks/smallblock/conditional.ll new file mode 100644 index 00000000000000..6e796bb19273e1 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional.ll @@ -0,0 +1,66 @@ +; ModuleID = 'conditional.c' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target 
triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %mul = shl nsw i32 %call, 1 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + detach label %det.achd, label %if.end17 + +det.achd: ; preds = %if.then + %cmp1 = icmp sgt i32 %call, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = tail call i32 (...) @bar() #2 + br label %if.end + +if.else: ; preds = %det.achd + %call4 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + reattach label %if.end17 + +if.else5: ; preds = %entry + %cmp7 = icmp slt i32 %call, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + detach label %det.achd9, label %if.end17 + +det.achd9: ; preds = %if.then8 + %call10 = tail call i32 (...) @bar() #2 + reattach label %if.end17 + +if.else12: ; preds = %if.else5 + detach label %det.achd13, label %if.end17 + +det.achd13: ; preds = %if.else12 + %call14 = tail call i32 (...) @foo() #2 + reattach label %if.end17 + +if.end17: ; preds = %det.achd9, %if.then8, %det.achd13, %if.else12, %if.then, %if.end + ret i32 %call +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/conditional_opt.ll b/llvm/microbenchmarks/smallblock/conditional_opt.ll new file mode 100644 index 00000000000000..226b5972c852b4 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/conditional_opt.ll @@ -0,0 +1,89 @@ +; ModuleID = '' +source_filename = "conditional.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @SmallBlock_main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + %call = call i32 (...) 
@foo() + store i32 %call, i32* %c, align 4 + %0 = load i32, i32* %c, align 4 + %mul = mul nsw i32 %0, 2 + %cmp = icmp sgt i32 %mul, 1 + br i1 %cmp, label %if.then, label %if.else5 + +if.then: ; preds = %entry + br label %det.achd + +det.achd: ; preds = %if.then + %1 = bitcast i32 undef to i32 + %2 = load i32, i32* %c, align 4 + %cmp1 = icmp sgt i32 %2, 1 + br i1 %cmp1, label %if.then2, label %if.else + +if.then2: ; preds = %det.achd + %call3 = call i32 (...) @bar() + br label %if.end + +if.else: ; preds = %det.achd + %call4 = call i32 (...) @foo() + br label %if.end + +if.end: ; preds = %if.else, %if.then2 + br label %det.cont + +det.cont: ; preds = %if.end + br label %if.end17 + +if.else5: ; preds = %entry + %3 = load i32, i32* %c, align 4 + %mul6 = mul nsw i32 %3, 3 + %cmp7 = icmp slt i32 %mul6, 1 + br i1 %cmp7, label %if.then8, label %if.else12 + +if.then8: ; preds = %if.else5 + br label %det.achd9 + +det.achd9: ; preds = %if.then8 + %4 = bitcast i32 undef to i32 + %call10 = call i32 (...) @bar() + br label %det.cont11 + +det.cont11: ; preds = %det.achd9 + br label %if.end16 + +if.else12: ; preds = %if.else5 + br label %det.achd13 + +det.achd13: ; preds = %if.else12 + %5 = bitcast i32 undef to i32 + %call14 = call i32 (...) @foo() + br label %det.cont15 + +det.cont15: ; preds = %det.achd13 + br label %if.end16 + +if.end16: ; preds = %det.cont15, %det.cont11 + br label %if.end17 + +if.end17: ; preds = %if.end16, %det.cont + %6 = load i32, i32* %c, align 4 + ret i32 %6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) #1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/smallblock/multiple_nested.c b/llvm/microbenchmarks/smallblock/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/multiple_spawn.c b/llvm/microbenchmarks/smallblock/multiple_spawn.c new file mode 100644 index 00000000000000..b551796f050ed0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/multiple_spawn.c @@ -0,0 +1,19 @@ 
+#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/serial.c b/llvm/microbenchmarks/smallblock/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/smallblock/simple_spawn.c b/llvm/microbenchmarks/smallblock/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/smallblock/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.c b/llvm/microbenchmarks/spawnrestructure/base_negative.c new file mode 100644 index 00000000000000..3718ca3466844c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.c @@ -0,0 +1,20 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + bar(); + return 0; +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_negative.ll b/llvm/microbenchmarks/spawnrestructure/base_negative.ll new file mode 100644 index 00000000000000..b92b96b872d09c --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_negative.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_negative.c' +source_filename = "base_negative.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 0 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.c b/llvm/microbenchmarks/spawnrestructure/base_positive.c new file mode 100644 index 00000000000000..7e3d0546efd52b --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.c @@ -0,0 +1,19 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + return bar(); +} diff --git a/llvm/microbenchmarks/spawnrestructure/base_positive.ll b/llvm/microbenchmarks/spawnrestructure/base_positive.ll new file mode 100644 index 00000000000000..8055cdfe786d67 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/base_positive.ll @@ -0,0 +1,46 @@ +; ModuleID = 'base_positive.c' +source_filename = "base_positive.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: noinline nounwind ssp uwtable +define i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %c = alloca i32, align 4 + store i32 0, i32* %retval, align 4 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = bitcast i32 undef to i32 + %call = call i32 (...) @foo() + %call1 = call i32 (...) @bar() + store i32 2, i32* %c, align 4 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry + detach label %det.achd2, label %det.cont5 + +det.achd2: ; preds = %det.cont + %1 = bitcast i32 undef to i32 + %call3 = call i32 (...) @foo() + %call4 = call i32 (...) @foo() + reattach label %det.cont5 + +det.cont5: ; preds = %det.achd2, %det.cont + %call6 = call i32 (...) @bar() + ret i32 %call6 +} + +declare i32 @foo(...) #1 + +declare i32 @bar(...) 
#1 + +attributes #0 = { noinline nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 4.0.0 (git@github.com:wsmoses/Cilk-Clang cc78c4b6082bb80687e64c8104bf9744e6fa8fdc) (git@github.com:wsmoses/Parallel-IR 52889bc31182f3faebcfce24918670967b5b96f6)"} diff --git a/llvm/microbenchmarks/spawnrestructure/complex.c b/llvm/microbenchmarks/spawnrestructure/complex.c new file mode 100644 index 00000000000000..23874168629bd1 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/complex.c @@ -0,0 +1,32 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + bar(); + cilk_spawn { + cilk_spawn { + cilk_spawn { + foo(); + } + } + bar(); + } + cilk_spawn { + cilk_spawn { + foo(); + foo(); + } + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/multiple_nested.c b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c new file mode 100644 index 00000000000000..3f9a1f235b183a --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/multiple_nested.c @@ -0,0 +1,21 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + cilk_spawn { + foo(); + bar(); + c = 2; + } + cilk_spawn { + foo(); + foo(); + } + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/serial.c b/llvm/microbenchmarks/spawnrestructure/serial.c new file mode 100644 index 00000000000000..12b21b6b0ebc38 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/serial.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + if (c > 0) { + bar(); + } else { + foo(); + } + return c; +} diff --git a/llvm/microbenchmarks/spawnrestructure/simple_spawn.c b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c new file mode 100644 index 00000000000000..41183d94ae8ad0 --- /dev/null +++ b/llvm/microbenchmarks/spawnrestructure/simple_spawn.c @@ -0,0 +1,15 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c; + cilk_spawn { + foo(); + bar(); + c = 2; + } + return c; +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.c b/llvm/microbenchmarks/spawnunswitch/simple.c new file mode 100644 index 00000000000000..d817a44c676419 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.c @@ -0,0 +1,16 @@ +#include + +int foo(); + +int bar(); + +int main() { + int c = foo(); + int d = bar(); + cilk_spawn { + if (c) { + foo(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple.ll 
b/llvm/microbenchmarks/spawnunswitch/simple.ll new file mode 100644 index 00000000000000..05d3ac9fbbd8ec --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple.ll @@ -0,0 +1,41 @@ +; ModuleID = 'simple.c' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %call1 = tail call i32 (...) @bar() #2 + detach label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + br label %if.end + +if.end: ; preds = %det.achd, %if.then + reattach label %det.cont + +det.cont: ; preds = %if.end, %entry + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.c b/llvm/microbenchmarks/spawnunswitch/simple2.c new file mode 100644 index 00000000000000..7e376f1522451d --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.c @@ -0,0 +1,14 @@ +#include + +int foo(); + +int bar(); + +int main() { + if (foo()) { + cilk_spawn { + bar(); + } + } + return foo(); +} diff --git a/llvm/microbenchmarks/spawnunswitch/simple2.ll b/llvm/microbenchmarks/spawnunswitch/simple2.ll new file mode 100644 index 00000000000000..a6dfc993f89703 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/simple2.ll @@ -0,0 +1,37 @@ +; ModuleID = 'simple2.c' +source_filename = "simple2.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %entry + detach label %det.achd, label %if.end + +det.achd: ; preds = %if.then + %call1 = tail call i32 (...) 
@bar() #2 + reattach label %if.end + +if.end: ; preds = %entry, %if.then, %det.achd + %call2 = tail call i32 (...) @foo() #2 + ret i32 %call2 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) local_unnamed_addr #1 + +attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #2 = { nounwind } + +!llvm.module.flags = !{!0} +!llvm.ident = !{!1} + +!0 = !{i32 1, !"PIC Level", i32 2} +!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"} diff --git a/llvm/microbenchmarks/spawnunswitch/temp.ll b/llvm/microbenchmarks/spawnunswitch/temp.ll new file mode 100644 index 00000000000000..1484049381dfc4 --- /dev/null +++ b/llvm/microbenchmarks/spawnunswitch/temp.ll @@ -0,0 +1,38 @@ +; ModuleID = '' +source_filename = "simple.c" +target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.12.0" + +; Function Attrs: nounwind ssp uwtable +define i32 @SpawnUnswitch_main() local_unnamed_addr #0 { +entry: + %call = tail call i32 (...) @foo() #2 + %call1 = tail call i32 (...) @bar() #2 + %tobool = icmp eq i32 %call, 0 + br i1 %tobool, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + detach label %if.end, label %det.cont + +if.end: ; preds = %det.achd + %call2 = tail call i32 (...) @foo() #2 + reattach label %det.cont + +det.cont: ; preds = %det.achd, %entry, %if.end + %call3 = tail call i32 (...) @foo() #2 + ret i32 %call3 +} + +declare i32 @foo(...) local_unnamed_addr #1 + +declare i32 @bar(...) 
local_unnamed_addr #1
+
+attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 5.0.0 (git@github.com:wsmoses/Cilk-Clang 5942594810265567795884c83b5a37a8cbc98d3e) (git@github.com:wsmoses/Parallel-IR 8f57e0739bf9fc6736472c89f91a533630efd5c3)"}
diff --git a/llvm/microbenchmarks/spawnunswitch/test.c b/llvm/microbenchmarks/spawnunswitch/test.c
new file mode 100644
index 00000000000000..7228775811b839
--- /dev/null
+++ b/llvm/microbenchmarks/spawnunswitch/test.c
@@ -0,0 +1,12 @@
+#include <cilk/cilk.h>
+
+int foo();
+
+int bar();
+
+int main() {
+  cilk_for (int i=0; i < 1000; i++) {
+    foo();
+  }
+  return foo();
+}
diff --git a/llvm/microbenchmarks/spawnunswitch/test2.c b/llvm/microbenchmarks/spawnunswitch/test2.c
new file mode 100644
index 00000000000000..56dd3cb7977f61
--- /dev/null
+++ b/llvm/microbenchmarks/spawnunswitch/test2.c
@@ -0,0 +1,12 @@
+#include <cilk/cilk.h>
+
+int foo();
+
+int bar();
+
+int main() {
+  cilk_spawn {
+    bar();
+  }
+  return foo();
+}
diff --git a/llvm/microbenchmarks/timing/average.py b/llvm/microbenchmarks/timing/average.py
new file mode 100644
index 00000000000000..17dc85395caa7a
--- /dev/null
+++ b/llvm/microbenchmarks/timing/average.py
@@ -0,0 +1,10 @@
+import sys
+f = open("spawn.txt", 'r')
+g = open("simple.txt", 'r')
+total1 = 0
+for line in f.readlines():
+    total1 += int(line[:len(line)-1])
+total2 = 0
+for line in g.readlines():
+    total2 += int(line[:len(line)-1])
+print("Spawn to serial ratio: " + str((total1*1.0)/total2))
diff --git a/llvm/microbenchmarks/timing/ratio.sh b/llvm/microbenchmarks/timing/ratio.sh
new file mode 100644
index 00000000000000..ac4c6a3e239305
--- /dev/null
+++ b/llvm/microbenchmarks/timing/ratio.sh
@@ -0,0 +1 @@
+for i in {1..100};do ./simple >> simple.txt;./spawn >> spawn.txt;done;python average.py;rm *.txt
diff --git a/llvm/microbenchmarks/timing/simple b/llvm/microbenchmarks/timing/simple
new file mode 100755
index 00000000000000..68c3cd94e6a26f
Binary files /dev/null and b/llvm/microbenchmarks/timing/simple differ
diff --git a/llvm/microbenchmarks/timing/simple.c b/llvm/microbenchmarks/timing/simple.c
new file mode 100644
index 00000000000000..c7a90879912060
--- /dev/null
+++ b/llvm/microbenchmarks/timing/simple.c
@@ -0,0 +1,16 @@
+#include <stdio.h>
+#include <time.h>
+
+int main() {
+  int c = 0;
+  int its = 100;
+  clock_t start = clock(), diff;
+  #pragma unroll
+  for (int i = 0; i < its; i++) {
+    c += i;
+  }
+  diff = clock() - start;
+  int usec = (diff * 1000000) / CLOCKS_PER_SEC;
+  printf("%d\n", usec);
+  return c;
+}
diff --git a/llvm/microbenchmarks/timing/spawn b/llvm/microbenchmarks/timing/spawn
new file mode 100755
index 00000000000000..3dc36bce56b1ea
Binary files /dev/null and b/llvm/microbenchmarks/timing/spawn differ
diff --git a/llvm/microbenchmarks/timing/spawn.c b/llvm/microbenchmarks/timing/spawn.c
new file mode 100644
index 00000000000000..1588cfec2f113d
--- /dev/null
+++ b/llvm/microbenchmarks/timing/spawn.c
@@ -0,0 +1,19 @@
+#include <stdio.h>
+#include <time.h>
+#include <cilk/cilk.h>
+
+int main() {
+  int c = 0;
+  int its = 100;
+  clock_t start = clock(), diff;
+  cilk_spawn {
+    for (int i = 0; i < its; i++) {
+      c += i;
+    }
+  }
+  cilk_sync;
+  diff = clock() - start;
+  int usec = (diff * 1000000) / CLOCKS_PER_SEC;
+  printf("%d\n", usec);
+  return c;
+}
diff --git a/llvm/test/Transforms/LoopFuse/fuse.ll b/llvm/test/Transforms/LoopFuse/fuse.ll
new file mode 100644
index 00000000000000..f283778f432028
--- /dev/null
+++ b/llvm/test/Transforms/LoopFuse/fuse.ll
@@ -0,0 +1,87 @@
+; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s
+
+; 'C' equivalent: Partially generated and hand modified.
+; void fuse(int *a, int *b, int *c) {
+;   for (i = 0; i < 1000; ++i) // L1
+;     c[i] = a[i] + c[i + 1];
+;   for (i = 0; i < 1000; ++i) // L2
+;     c[i] = a[i] + b[i];
+; }
+; There is no backward dependence from L1 to L2. So it is safe to fuse.
+
+; Test that there are two versions - original loops and fused loop.
+; CHECK: br i1 %memcheck.conflict, label %entry.split, label %entry.split.L1clone
+
+; Test for fusion along fused path.
+; CHECK: for.body.L1clone: ; preds = %for.body.1.L2clone, %entry.split.L1clone
+; CHECK: for.body.1.L2clone: ; preds = %for.body.L1clone
+; CHECK: br i1 %exitcond.L1clone, label %for.end.loopexit.1, label %for.body.L1clone, !llvm.loop !1
+
+; Test for merged defs and its uses outside the loops.
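+; Values defined in the loops (%add4 in L1, %add11 in L2) are live past the loops,
+; so the original and cloned definitions must be merged by phis in the shared exit
+; block before %outsideUse can consume them.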
+; CHECK: for.end.loopexit.1: ; preds = %for.body.1.L2clone, %for.body.1 +; CHECK: %add11.lfuse = phi i32 [ %add11, %for.body.1 ], [ %add11.L2clone, %for.body.1.L2clone ] +; CHECK: %add4.lfuse = phi i32 [ %add4, %for.body.1 ], [ %add4.L1clone, %for.body.1.L2clone ] +; CHECK: %outsideUse = add nsw i32 %add11.lfuse, %add4.lfuse + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + %outsideUse = add nsw i32 %add11, %add4 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/LoopFuse/no-fuse.ll b/llvm/test/Transforms/LoopFuse/no-fuse.ll new file mode 100644 index 00000000000000..7abb67fd622998 --- /dev/null +++ b/llvm/test/Transforms/LoopFuse/no-fuse.ll @@ -0,0 +1,78 @@ +; RUN: opt -loop-fuse -verify-loop-info -verify-dom-info %s -S -o - | FileCheck %s + +; 'C' equivalent: Partially generated and hand modified. +; void noFuse(int *a, int *b, int *c) { +; for (i = 0; i < 1000; ++i) // L1 +; c[i] = a[i] + c[i - 1]; +; for (i = 0; i < 1000; ++i) // L2 +; c[i] = a[i] + b[i]; +; } +; There is a backward dependence from L1 to L2. So it is unsafe to fuse. 
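+; L1 reads c[i - 1] written by its own previous iteration; if the bodies were fused,
+; L2's store to c[i] would clobber that value before the next L1 iteration reads it,
+; so both loops must be left in place.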
+ +; CHECK: entry: +; CHECK-NEXT: br label %for.body +; CHECK: for.body: ; preds = %for.body, %entry +; CHECK: for.body.1: + +; ModuleID = '1.bc' + +; Function Attrs: norecurse nounwind uwtable +define void @bigLoop(i32* nocapture readonly %a, i32* nocapture readonly %b, i32* nocapture %c) #0 { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %indvars.iv.next.back = add i64 %indvars.iv, -1 + %arrayidx3 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.next.back + %1 = load i32, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %1, %0 + %arrayidx6 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv + store i32 %add4, i32* %arrayidx6, align 4 + %exitcond = icmp eq i64 %indvars.iv.next, 1000 + br i1 %exitcond, label %for.end.loopexit, label %for.body, !llvm.loop !4 + +for.end.loopexit: ; preds = %for.body + br label %for.body.1 + +for.body.1: ; preds = %for.body.1, %for.end.loopexit + %indvars.iv.1 = phi i64 [ 0, %for.end.loopexit ], [ %indvars.iv.next.1, %for.body.1 ] + %arrayidx.1 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.1 + %2 = load i32, i32* %arrayidx.1, align 4 + %arrayidx10 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv.1 + %3 = load i32, i32* %arrayidx10, align 4 + %add11 = add nsw i32 %3, %2 + %arrayidx12 = getelementptr inbounds i32, i32* %c, i64 %indvars.iv.1 + store i32 %add11, i32* %arrayidx12, align 4 + %indvars.iv.next.1 = add i64 %indvars.iv.1, 1 + %exitcond.1 = icmp eq i64 %indvars.iv.next.1, 1000 + br i1 %exitcond.1, label %for.end.loopexit.1, label %for.body.1, !llvm.loop !4 + +for.end.loopexit.1: ; preds = %for.body.1 + br label %for.end + +for.end: ; preds = %for.end.loopexit.1 + ret void +} + +attributes #0 = { norecurse nounwind uwtable } +attributes #1 = { norecurse nounwind readonly uwtable } +attributes #2 = { nounwind uwtable } +attributes #3 = { nounwind readonly } +attributes #4 = { nounwind } +attributes #5 = { noreturn nounwind } +attributes #6 = { nounwind readonly } + +!llvm.ident = !{!0} + +!0 = !{!"clang version 3.8.0"} +!1 = distinct !{!1, !2, !3} +!2 = !{!"llvm.loop.vectorize.width", i32 1} +!3 = !{!"llvm.loop.interleave.count", i32 1} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = distinct !{!6, !2, !3} +!7 = distinct !{!7, !2, !3} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp new file mode 100644 index 00000000000000..0461b69c99b3a1 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.cpp @@ -0,0 +1,6 @@ +#include + +void func() { + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll new file mode 100644 index 00000000000000..5615d4c1310d2c --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic1.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic1.cpp' +source_filename = "basic1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: +; CHECK: @_Z4funcv + %syncreg = call token @llvm.syncregion.start() +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label 
%sync.continue + +sync.continue: ; preds = %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue1 + +; CHECK: sync.continue +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp new file mode 100644 index 00000000000000..6de0ad05f14611 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_spawn { + } + cilk_sync; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll new file mode 100644 index 00000000000000..5658771430bc25 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/basic2.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'basic2.cpp' +source_filename = "basic2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +; CHECK: @_Z4funcv +entry: + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry +; CHECK-NOT: sync within %syncreg, label %sync.continue + sync within %syncreg, label %sync.continue + +; CHECK: sync.continue +sync.continue: ; preds = %det.cont + sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp new file mode 100644 index 00000000000000..03c7cb7efdd77d --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.cpp @@ -0,0 +1,9 @@ +#include + +void func() { + int a; + cilk_spawn { + a = 1; + } + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll new file mode 100644 index 00000000000000..0638fc2d81c5b9 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail1.ll @@ -0,0 +1,37 @@ +; 
RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail1.cpp' +source_filename = "fail1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %a = alloca i32, align 4 + %syncreg = call token @llvm.syncregion.start() + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + store i32 1, i32* %a, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + store i32 2, i32* %a, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp new file mode 100644 index 00000000000000..779d13b2483954 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.cpp @@ -0,0 +1,10 @@ +#include + +void func(int *a, int *b) { + cilk_spawn { + *a = 1; + } + cilk_sync; + *b = 2; + cilk_sync; +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll new file mode 100644 index 00000000000000..c4d2d395658f34 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/fail2.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'fail2.cpp' +source_filename = "fail2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcPiS_(i32* %a, i32* %b) #0 { +entry: + %a.addr = alloca i32*, align 8 + %b.addr = alloca i32*, align 8 + %syncreg = call token @llvm.syncregion.start() + store i32* %a, i32** %a.addr, align 8 + store i32* %b, i32** %b.addr, align 8 + detach within %syncreg, label %det.achd, label %det.cont + +det.achd: ; preds = %entry + %0 = load i32*, i32** %a.addr, align 8 + store i32 1, i32* %0, align 4 + reattach within %syncreg, label %det.cont + +det.cont: ; preds = %det.achd, %entry + sync within %syncreg, label %sync.continue +; CHECK: sync within %syncreg, label %sync.continue + +sync.continue: ; preds = %det.cont + %1 = load i32*, i32** %b.addr, align 8 + store i32 2, i32* %1, align 4 + sync within %syncreg, label %sync.continue1 +; CHECK-NOT: sync within %syncreg, label %sync.continue1 + +sync.continue1: ; preds = %sync.continue + ret void +; CHECK: ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" 
"disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp new file mode 100644 index 00000000000000..bcf9db1d5e83f3 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_for (int i = 0; i < 10; i++) { + } + cilk_for (int i = 0; i < 10; i++) { + } +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for1.ll b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll new file mode 100644 index 00000000000000..394e04b2bc0731 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for1.ll @@ -0,0 +1,112 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s + +; ModuleID = 'for1.cpp' +source_filename = "for1.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: noinline nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + %__init = alloca i32, align 4 + %__begin = alloca i32, align 4 + %__end = alloca i32, align 4 + %syncreg1 = call token @llvm.syncregion.start() + %__init2 = alloca i32, align 4 + %__begin3 = alloca i32, align 4 + %__end4 = alloca i32, align 4 + store i32 0, i32* %__init, align 4 + store i32 0, i32* %__begin, align 4 + store i32 10, i32* %__end, align 4 + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc, %entry + %0 = load i32, i32* %__begin, align 4 + %1 = load i32, i32* %__end, align 4 + %cmp = icmp slt i32 %0, %1 + br i1 %cmp, label %pfor.detach, label %pfor.end + +pfor.detach: ; preds = %pfor.cond + %2 = load i32, i32* %__init, align 4 + %3 = load i32, i32* %__begin, align 4 + %mul = mul nsw i32 %3, 1 + %add = add nsw i32 %2, %mul + detach within %syncreg, label %pfor.body.entry, label %pfor.inc + +pfor.body.entry: ; preds = %pfor.detach + %i = alloca i32, align 4 + store i32 %add, i32* %i, align 4 + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body + reattach within %syncreg, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach + %4 = load i32, i32* %__begin, align 4 + %inc = add nsw i32 %4, 1 + store i32 %inc, i32* %__begin, align 4 + br label %pfor.cond, !llvm.loop !1 + +pfor.end: ; preds = %pfor.cond + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.end + store i32 0, i32* %__init2, align 4 + store i32 0, i32* %__begin3, align 4 + store i32 10, i32* %__end4, align 4 + br label %pfor.cond3 + +; CHECK: pfor.end +; CHECK-NOT: sync +; CHECK: pfor.cond + +pfor.cond3: ; preds = %pfor.inc8, %pfor.end.continue + %5 = load i32, i32* %__begin3, align 4 + %6 = load i32, i32* %__end4, align 4 + %cmp6 = icmp slt i32 %5, %6 + br i1 %cmp6, label %pfor.detach5, label %pfor.end10 + +pfor.detach5: ; preds = %pfor.cond3 + %7 = load i32, i32* %__init2, align 4 + %8 = load i32, i32* %__begin3, align 4 + %mul8 = mul nsw i32 %8, 1 + %add9 = add nsw i32 %7, %mul8 + detach within %syncreg1, label %pfor.body.entry6, label %pfor.inc8 + +pfor.body.entry6: 
; preds = %pfor.detach5 + %i11 = alloca i32, align 4 + store i32 %add9, i32* %i11, align 4 + br label %pfor.body6 + +pfor.body6: ; preds = %pfor.body.entry5 + br label %pfor.preattach7 + +pfor.preattach7: ; preds = %pfor.body6 + reattach within %syncreg1, label %pfor.inc8 + +pfor.inc8: ; preds = %pfor.preattach7, %pfor.detach5 + %9 = load i32, i32* %__begin3, align 4 + %inc15 = add nsw i32 %9, 1 + store i32 %inc15, i32* %__begin3, align 4 + br label %pfor.cond3, !llvm.loop !3 + +pfor.end10: ; preds = %pfor.cond3 + sync within %syncreg1, label %pfor.end.continue11 + +pfor.end.continue11: ; preds = %pfor.end10 + ret void +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!1 = distinct !{!1, !2} +!2 = !{!"tapir.loop.spawn.strategy", i32 1} +!3 = distinct !{!3, !2} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp new file mode 100644 index 00000000000000..5627249702cef6 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.cpp @@ -0,0 +1,8 @@ +#include + +void func() { + cilk_for (int i = 0; i < 100; i++) { + cilk_for (int j = 0; j < 3; j++) { + } + } +} diff --git a/llvm/test/Transforms/Tapir/SyncElimination/for2.ll b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll new file mode 100644 index 00000000000000..91b70b4db95e94 --- /dev/null +++ b/llvm/test/Transforms/Tapir/SyncElimination/for2.ll @@ -0,0 +1,78 @@ +; RUN: opt < %s -sync-elimination -S | FileCheck %s +; XFAIL: * + +; ModuleID = 'for2.cpp' +source_filename = "for2.cpp" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; Function Attrs: nounwind uwtable +define void @_Z4funcv() #0 { +entry: + %syncreg = call token @llvm.syncregion.start() + br label %pfor.cond + +pfor.cond: ; preds = %pfor.inc15, %entry + %__begin.0 = phi i32 [ 0, %entry ], [ %inc16, %pfor.inc15 ] + %cmp = icmp slt i32 %__begin.0, 100 + br i1 %cmp, label %pfor.detach, label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond +;; The sync before a return is not safe to remove. 
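+;; It is the only sync guarding iterations detached within %syncreg, so removing it
+;; would allow the function to return while spawned children are still running.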
+; CHECK: sync within %syncreg, label %pfor.end.continue + sync within %syncreg, label %pfor.end.continue + +pfor.end.continue: ; preds = %pfor.cond.cleanup + ret void + +pfor.detach: ; preds = %pfor.cond + detach within %syncreg, label %pfor.body.entry, label %pfor.inc15 + +pfor.body.entry: ; preds = %pfor.detach + %syncreg1 = call token @llvm.syncregion.start() + br label %pfor.body + +pfor.body: ; preds = %pfor.body.entry + br label %pfor.cond5 + +pfor.cond5: ; preds = %pfor.inc, %pfor.body + %__begin3.0 = phi i32 [ 0, %pfor.body ], [ %inc, %pfor.inc ] + %cmp6 = icmp slt i32 %__begin3.0, 3 + br i1 %cmp6, label %pfor.detach9, label %pfor.cond.cleanup7 + +; CHECK: pfor.cond5 +pfor.cond.cleanup7: ; preds = %pfor.cond5 +; CHECK-NOT: sync within %syncreg1, label %pfor.end.continue + sync within %syncreg1, label %pfor.end.continue8 +; CHECK: pfor.inc15 + +pfor.end.continue8: ; preds = %pfor.cond.cleanup7 + reattach within %syncreg, label %pfor.inc15 + +pfor.detach9: ; preds = %pfor.cond5 + detach within %syncreg1, label %pfor.body.entry12, label %pfor.inc + +pfor.body.entry12: ; preds = %pfor.detach9 + br label %pfor.preattach + +pfor.preattach: ; preds = %pfor.body.entry12 + reattach within %syncreg1, label %pfor.inc + +pfor.inc: ; preds = %pfor.preattach, %pfor.detach9 + %inc = add nsw i32 %__begin3.0, 1 + br label %pfor.cond5, !llvm.loop !2 + +pfor.inc15: ; preds = %pfor.end.continue8, %pfor.detach + %inc16 = add nsw i32 %__begin.0, 1 + br label %pfor.cond, !llvm.loop !4 +} + +; Function Attrs: argmemonly nounwind +declare token @llvm.syncregion.start() #1 + +attributes #0 = { noinline nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { argmemonly nounwind } + +!2 = distinct !{!2, !3} +!3 = !{!"tapir.loop.spawn.strategy", i32 1} +!4 = distinct !{!4, !3} diff --git a/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll new file mode 100644 index 00000000000000..a31d07f206846d --- /dev/null +++ b/llvm/test/Transforms/Tapir/dac-loopspawning-simple.ll @@ -0,0 +1,98 @@ +; Test that Tapir's loop spawning pass transforms this simple loop +; into recursive divide-and-conquer. + +; RUN: opt < %s -loop-spawning -S | FileCheck %s + +; Function Attrs: nounwind uwtable +define void @foo(i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @foo( +entry: + %syncreg = call token @llvm.syncregion.start() + %cmp5 = icmp sgt i32 %n, 0 + br i1 %cmp5, label %pfor.detach.preheader, label %pfor.cond.cleanup + +pfor.detach.preheader: ; preds = %entry +; CHECK: pfor.detach.preheader: +; CHECK: [[LIMIT:%[0-9]+]] = add [[TYPE:i[0-9]+]] %n, -1 +; CHECK: call fastcc void @[[OUTLINED:[a-zA-Z0-9._]+]]( +; CHECK: [[TYPE]] 0 +; CHECK: [[TYPE]] [[LIMIT]] +; CHECK: [[TYPE]] {{[%]?[a-zA-Z0-9._]+}} +; CHECK-NEXT: br label %pfor.cond.cleanup.loopexit + br label %pfor.detach + +pfor.cond.cleanup.loopexit: ; preds = %pfor.inc + br label %pfor.cond.cleanup + +pfor.cond.cleanup: ; preds = %pfor.cond.cleanup.loopexit, %entry +; CHECK: pfor.cond.cleanup +; CHECK-NOT: sync within %syncreg, label %0 + sync within %syncreg, label %0 + +;