diff --git a/deps/llvm-3.3.patch b/deps/llvm-3.3.patch
index 2027d6db20976..d46b6e6f26682 100644
--- a/deps/llvm-3.3.patch
+++ b/deps/llvm-3.3.patch
@@ -178,3 +178,420 @@ diff -u -r -N llvm-3.3.src/lib/Support/Host.cpp llvm-3.3/lib/Support/Host.cpp
        case 28: // Most 45 nm Intel Atom processors
        case 38: // 45 nm Atom Lincroft
        case 39: // 32 nm Atom Medfield
+diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm-3.3/lib/Transforms/Vectorize/SLPVectorizer.cpp
+--- llvm-3.3.src/lib/Transforms/Vectorize/SLPVectorizer.cpp	2013-04-30 16:04:51.000000000 -0500
++++ llvm-3.3/lib/Transforms/Vectorize/SLPVectorizer.cpp	2014-04-15 16:26:12.200527154 -0500
+@@ -19,6 +19,7 @@
+ #define DEBUG_TYPE SV_NAME
+ 
+ #include "VecUtils.h"
++#include "llvm/ADT/SmallSet.h"
+ #include "llvm/Transforms/Vectorize.h"
+ #include "llvm/Analysis/AliasAnalysis.h"
+ #include "llvm/Analysis/ScalarEvolution.h"
+@@ -128,7 +129,7 @@
+   bool tryToVectorizePair(Value *A, Value *B,  BoUpSLP &R);
+ 
+   /// \brief Try to vectorize a list of operands.
+-  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R);
++  bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, InsertElementInst *IE = 0);
+ 
+   /// \brief Try to vectorize a chain that may start at the operands of \V;
+   bool tryToVectorize(BinaryOperator *V,  BoUpSLP &R);
+@@ -179,7 +180,7 @@
+   return tryToVectorizeList(VL, R);
+ }
+ 
+-bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) {
++bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R, InsertElementInst *IE) {
+   DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n");
+ 
+   // Check that all of the parts are scalar.
+@@ -191,10 +192,12 @@
+ 
+   int Cost = R.getTreeCost(VL);
+   int ExtrCost = R.getScalarizationCost(VL);
+-  DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost <<
++  DEBUG(dbgs()<<"SLP: Cost of list:" << Cost <<
+         " Cost of extract:" << ExtrCost << ".\n");
+   if ((Cost+ExtrCost) >= -SLPCostThreshold) return false;
+-  DEBUG(dbgs()<<"SLP: Vectorizing pair.\n");
++  DEBUG(dbgs()<<"SLP: Vectorizing list.\n");
++  if (IE)
++    R.movePrematureInserts(VL,IE);
+   R.vectorizeArith(VL);
+   return true;
+ }
+@@ -237,9 +240,49 @@
+   return 0;
+ }
+ 
++/// \brief Recognize construction of vectors like
++///  %ra = insertelement <4 x float> undef, float %s0, i32 0
++///  %rb = insertelement <4 x float> %ra, float %s1, i32 1
++///  %rc = insertelement <4 x float> %rb, float %s2, i32 2
++///  %rd = insertelement <4 x float> %rc, float %s3, i32 3
++///
++/// \returns true if \p IE starts such a chain.  The inserted scalars are
++/// appended to \p Ops in chain order; it may be partially filled on failure.
++///
++static bool findBuildVector(InsertElementInst *IE,
++                            SmallVectorImpl<Value *> &Ops) {
++  if (!isa<UndefValue>(IE->getOperand(0)))
++    return false;
++
++  while (true) {
++    Ops.push_back(IE->getOperand(1));
++
++    // Nothing consumes this insert, so there is no vector worth building.
++    if (IE->use_empty())
++      return false;
++
++    // The chain ends once the user reported by use_back() is not an insert.
++    InsertElementInst *NextUse = dyn_cast<InsertElementInst>(IE->use_back());
++    if (!NextUse)
++      return true;
++
++    // If this isn't the final use, make sure the next insertelement is the only
++    // use. It's OK if the final constructed vector is used multiple times.
++    if (!IE->hasOneUse())
++      return false;
++
++    IE = NextUse;
++  }
++}
++
+ bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) {
+   bool Changed = false;
++  SmallSet<Value *, 16> VisitedInstrs;
+   for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
++    // We may go through BB multiple times so skip the one we have checked.
++    if (!VisitedInstrs.insert(it))
++      continue;
++
+     if (isa<DbgInfoIntrinsic>(it)) continue;
+ 
+     // Try to vectorize reductions that use PHINodes.
+@@ -271,6 +314,21 @@
+           Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R);
+       continue;
+     }
++
++    // Try to vectorize trees that start at insertelement instructions.
++    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(it)) {
++      SmallVector<Value *, 8> Ops;
++      if (!findBuildVector(IE, Ops))
++        continue;
++
++      if (tryToVectorizeList(Ops, R, IE)) {
++        Changed = true;
++        it = BB->begin();
++        e = BB->end();
++      }
++
++      continue;
++    }
+   }
+ 
+   return Changed;
+diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.cpp llvm-3.3/lib/Transforms/Vectorize/VecUtils.cpp
+--- llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.cpp	2013-04-21 03:05:59.000000000 -0500
++++ llvm-3.3/lib/Transforms/Vectorize/VecUtils.cpp	2014-04-15 17:44:02.710904964 -0500
+@@ -243,7 +243,8 @@
+   LaneMap.clear();
+   MultiUserVals.clear();
+   MustScalarize.clear();
+-
++  AllScalarized = true;
++ 
+   // Scan the tree and find which value is used by which lane, and which values
+   // must be scalarized.
+   getTreeUses_rec(VL, 0);
+@@ -275,7 +276,8 @@
+   }
+ 
+   // Now calculate the cost of vectorizing the tree.
+-  return getTreeCost_rec(VL, 0);
++  int treeCost = getTreeCost_rec(VL, 0);
++  return AllScalarized ? max_cost : treeCost;
+ }
+ 
+ void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) {
+@@ -380,6 +382,55 @@
+   }
+ }
+ 
++#ifndef NDEBUG
++/// \returns the opcode shared by every element of \p VL, or zero if some
++/// element is not an Instruction or the opcodes are not all equal.
++static unsigned getSameOpcode(ArrayRef<Value *> VL) {
++  Instruction *First = dyn_cast<Instruction>(VL[0]);
++  if (!First)
++    return 0;
++  const unsigned Common = First->getOpcode();
++  for (unsigned Idx = 1, End = VL.size(); Idx < End; ++Idx) {
++    Instruction *Cur = dyn_cast<Instruction>(VL[Idx]);
++    if (!Cur || Cur->getOpcode() != Common)
++      return 0;
++  }
++  return Common;
++}
++#endif /*NDEBUG*/
++
++/// \returns True if the ExtractElement instructions in \p VL can be vectorized
++/// by reusing the vector they extract from: they must read lanes 0..N-1, in
++/// order and with constant indices, from one vector that has exactly
++/// N == VL.size() elements.
++static bool CanReuseExtract(ArrayRef<Value *> VL) {
++  assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode");
++
++  // The vector read by the first extract is the candidate for reuse.
++  Value *Source = cast<ExtractElementInst>(VL[0])->getOperand(0);
++
++  // The bundle has to cover the whole source vector.
++  if (Source->getType()->getVectorNumElements() != VL.size())
++    return false;
++
++  // Every extract must read lane number Lane of that same source vector,
++  // and the lane must be given as a constant.
++  for (unsigned Lane = 0, End = VL.size(); Lane < End; ++Lane) {
++    ExtractElementInst *Extract = cast<ExtractElementInst>(VL[Lane]);
++    ConstantInt *Index = dyn_cast<ConstantInt>(Extract->getOperand(1));
++
++    if (!Index)
++      return false;
++    if (Index->getZExtValue() != Lane)
++      return false;
++    if (Extract->getOperand(0) != Source)
++      return false;
++  }
++
++  return true;
++}
++
++
+ int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) {
+   Type *ScalarTy = VL[0]->getType();
+ 
+@@ -457,6 +508,20 @@
+   }
+ 
+   switch (Opcode) {
++  case Instruction::ExtractElement: { 
++    if (CanReuseExtract(VL)) {
++      int DeadCost = 0;
++      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
++        ExtractElementInst *E = cast<ExtractElementInst>(VL[i]);
++        if (E->hasOneUse())
++          // Take credit for instruction that will become dead.
++          DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
++      }
++      DEBUG(dbgs() << "SLP: taking credit of " << -DeadCost << "\n");
++      return -DeadCost;
++    }
++    return getScalarizationCost(VecTy);
++  }
+   case Instruction::ZExt:
+   case Instruction::SExt:
+   case Instruction::FPToUI:
+@@ -490,6 +555,7 @@
+     VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+     int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+     Cost += (VecCost - ScalarCost);
++    AllScalarized = false;
+     return Cost;
+   }
+   case Instruction::Add:
+@@ -528,6 +594,7 @@
+ 
+     int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy);
+     Cost += (VecCost - ScalarCost);
++    AllScalarized = false;
+     return Cost;
+   }
+   case Instruction::Load: {
+@@ -540,6 +607,7 @@
+     int ScalarLdCost = VecTy->getNumElements() *
+       TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
+     int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0);
++    AllScalarized = false;
+     return VecLdCost - ScalarLdCost;
+   }
+   case Instruction::Store: {
+@@ -556,6 +624,7 @@
+     }
+ 
+     int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1);
++    AllScalarized = false;
+     return TotalCost;
+   }
+   default:
+@@ -630,6 +699,16 @@
+   }
+ 
+   switch (Opcode) {
++  case Instruction::ExtractElement: {
++    Value *S;
++    if (CanReuseExtract(VL) ) {
++      S = VL0->getOperand(0);
++    } else {
++      S = Scalarize(VL, VecTy);
++    }
++    VectorizedValues[VL0] = S;
++    return S;
++  }
+   case Instruction::ZExt:
+   case Instruction::SExt:
+   case Instruction::FPToUI:
+@@ -727,4 +806,32 @@
+   }
+ }
+ 
++void BoUpSLP::movePrematureInserts(ArrayRef<Value *> VL, InsertElementInst *IE) {
++  Instruction *VL0 = cast<Instruction>(VL[0]);
++  int MyLastIndex = InstrIdx[VL0];
++  for (unsigned i = 1, e = VL.size(); i < e; ++i )
++    MyLastIndex = std::max(MyLastIndex, InstrIdx[VL[i]]); // largest InstrIdx entry among VL
++  BasicBlock *BB = cast<Instruction>(VL0)->getParent();
++  bool moved = false;
++  DEBUG(dbgs() << "SLP: Moving premature inserts\n");
++  Instruction* x = InstrVec[MyLastIndex]; // insertion point; presumably the last VL instruction in block order
++  while (IE->getParent()==BB) { // only inserts in the same block as VL[0] are considered
++    int UserIndex = InstrIdx[IE];
++    if (UserIndex >= MyLastIndex) {
++      // Walked past transformed region: this insert is not numbered before the insertion point.
++      break;
++    }
++    IE->removeFromParent();
++    IE->insertAfter(x);
++    DEBUG(dbgs() << "SLP:    Rescheduled: " << *IE << ".\n");
++    moved = true;  
++    x = IE; // the next moved insert is placed right after this one, keeping chain order
++    IE = dyn_cast<InsertElementInst>(IE->use_back()); // follow the chain; null if that user is not an insert
++    if (!IE) 
++      break;
++  } 
++  if (moved) 
++    numberInstructions(); // presumably refreshes InstrIdx/InstrVec after the moves -- TODO confirm
++}
++
+ } // end of namespace
+diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.h llvm-3.3/lib/Transforms/Vectorize/VecUtils.h
+--- llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.h	2013-04-20 04:49:10.000000000 -0500
++++ llvm-3.3/lib/Transforms/Vectorize/VecUtils.h	2014-04-15 17:38:41.549776444 -0500
+@@ -77,6 +77,9 @@
+   /// sequences.
+   ValueList &getGatherSeqInstructions() {return GatherInstructions; }
+ 
++  /// \brief Move the insertelements of the chain starting at \p IE (as found
++  /// by findBuildVector) that precede the last instruction of \p VL to after it.
++  void movePrematureInserts(ArrayRef<Value *> VL, InsertElementInst *IE);
+ private:
+   /// \brief This method contains the recursive part of getTreeCost.
+   int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+@@ -126,6 +129,8 @@
+   /// by multiple lanes, or by users outside the tree.
+   /// NOTICE: The vectorization methods also use this set.
+   ValueSet MustScalarize;
++  /// True if all values must be scalarized, i.e. vectorization is worthless.
++  bool AllScalarized;
+ 
+   /// Contains a list of values that are used outside the current tree. This
+   /// set must be reset between runs.
+diff -u -r -N llvm-3.3.src/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll llvm-3.3/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll
+--- llvm-3.3.src/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll	1969-12-31 18:00:00.000000000 -0600
++++ llvm-3.3/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll	2014-04-15 14:13:12.738690168 -0500
+@@ -0,0 +1,53 @@
++target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128"
++
++target triple = "x86_64-apple-macosx10.8.0"
++
++; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s 
++
++; Check that cost model for vectorization takes credit for
++; instructions that are erased.
++define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: @take_credit(
++; CHECK: %1 = fadd <4 x float> %a, %b
++  %a0 = extractelement <4 x float> %a, i32 0
++  %b0 = extractelement <4 x float> %b, i32 0
++  %c0 = fadd float %a0, %b0
++  %a1 = extractelement <4 x float> %a, i32 1
++  %b1 = extractelement <4 x float> %b, i32 1
++  %c1 = fadd float %a1, %b1
++  %a2 = extractelement <4 x float> %a, i32 2
++  %b2 = extractelement <4 x float> %b, i32 2
++  %c2 = fadd float %a2, %b2
++  %a3 = extractelement <4 x float> %a, i32 3
++  %b3 = extractelement <4 x float> %b, i32 3
++  %c3 = fadd float %a3, %b3
++  %v0 = insertelement <4 x float> undef, float %c0, i32 0
++  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
++  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
++  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
++  ret <4 x float> %v3
++}
++
++; Make sure that vectorization happens even if insertelement operations
++; must be rescheduled.  The case here is from compiling Julia.
++define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) {
++; CHECK-LABEL: @reschedule_extract(
++; CHECK: %1 = fadd <4 x float> %a, %b
++  %a0 = extractelement <4 x float> %a, i32 0
++  %b0 = extractelement <4 x float> %b, i32 0
++  %c0 = fadd float %a0, %b0
++  %v0 = insertelement <4 x float> undef, float %c0, i32 0
++  %a1 = extractelement <4 x float> %a, i32 1
++  %b1 = extractelement <4 x float> %b, i32 1
++  %c1 = fadd float %a1, %b1
++  %v1 = insertelement <4 x float> %v0, float %c1, i32 1
++  %a2 = extractelement <4 x float> %a, i32 2
++  %b2 = extractelement <4 x float> %b, i32 2
++  %c2 = fadd float %a2, %b2
++  %v2 = insertelement <4 x float> %v1, float %c2, i32 2
++  %a3 = extractelement <4 x float> %a, i32 3
++  %b3 = extractelement <4 x float> %b, i32 3
++  %c3 = fadd float %a3, %b3
++  %v3 = insertelement <4 x float> %v2, float %c3, i32 3
++  ret <4 x float> %v3
++}
+diff -u -r -N llvm-3.3.src/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll llvm-3.3/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll
+--- llvm-3.3.src/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll	1969-12-31 18:00:00.000000000 -0600
++++ llvm-3.3/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll	2014-04-15 15:42:30.448560440 -0500
+@@ -0,0 +1,36 @@
++; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000
++
++target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64"
++target triple = "nvptx--nvidiacl"
++
++; CTLZ cannot be vectorized currently because the second argument is a scalar
++; for both the scalar and vector forms of the intrinsic. In the future it
++; should be possible to vectorize such functions.
++; Test causes an assert if LLVM tries to vectorize CTLZ.
++
++define <2 x i8> @cltz_test(<2 x i8> %x) #0 {
++entry:
++  %0 = extractelement <2 x i8> %x, i32 0
++  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
++  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
++  %1 = extractelement <2 x i8> %x, i32 1
++  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
++  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
++  ret <2 x i8> %vecinit2
++}
++
++define <2 x i8> @cltz_test2(<2 x i8> %x) #1 {
++entry:
++  %0 = extractelement <2 x i8> %x, i32 0
++  %1 = extractelement <2 x i8> %x, i32 1
++  %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
++  %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false)
++  %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0
++  %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1
++  ret <2 x i8> %vecinit2
++}
++
++declare i8 @llvm.ctlz.i8(i8, i1) #3
++
++attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
++attributes #1 = { nounwind readnone }
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 5bcdfa0c489db..90a7c3f6d807b 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -4695,6 +4695,8 @@ static void init_julia_llvm_env(Module *m)
     jl_TargetMachine->addAnalysisPasses(*FPM);
 #endif
     FPM->add(createTypeBasedAliasAnalysisPass());
+    if (jl_compileropts.opt_level>=1)
+        FPM->add(createBasicAliasAnalysisPass());
     // list of passes from vmkit
     FPM->add(createCFGSimplificationPass()); // Clean up disgusting code
     FPM->add(createPromoteMemoryToRegisterPass());// Kill useless allocas
@@ -4740,11 +4742,11 @@ static void init_julia_llvm_env(Module *m)
 #else
     FPM->add(createLoopUnrollPass());           // Unroll small loops
 #endif
-    //FPM->add(createLoopStrengthReducePass());   // (jwb added)
-
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 3 && !defined(INSTCOMBINE_BUG)
+#if LLVM33 && !LLVM35 && !defined(INSTCOMBINE_BUG)
     FPM->add(createLoopVectorizePass());        // Vectorize loops
 #endif
+    //FPM->add(createLoopStrengthReducePass());   // (jwb added)
+
 #ifndef INSTCOMBINE_BUG
     FPM->add(createInstructionCombiningPass()); // Clean up after the unroller
 #endif
@@ -4761,8 +4763,20 @@ static void init_julia_llvm_env(Module *m)
 #endif
     FPM->add(createJumpThreadingPass());         // Thread jumps
     FPM->add(createDeadStoreEliminationPass());  // Delete dead stores
+#if LLVM33 && !defined(INSTCOMBINE_BUG)
+    if (jl_compileropts.opt_level>=1)
+        FPM->add(createSLPVectorizerPass());     // Vectorize straight-line code
+#endif
 
     FPM->add(createAggressiveDCEPass());         // Delete dead instructions
+#if LLVM33 && !defined(INSTCOMBINE_BUG)
+    if (jl_compileropts.opt_level>=1)
+        FPM->add(createInstructionCombiningPass());   // Clean up after SLP loop vectorizer
+#endif
+#if LLVM35
+    FPM->add(createLoopVectorizePass());         // Vectorize loops
+    FPM->add(createInstructionCombiningPass());  // Clean up after loop vectorizer
+#endif
     //FPM->add(createCFGSimplificationPass());     // Merge & remove BBs
 
     FPM->doInitialization();
diff --git a/src/init.c b/src/init.c
index 1e7d59f2a2e90..9519582e2c661 100644
--- a/src/init.c
+++ b/src/init.c
@@ -85,7 +85,8 @@ jl_compileropts_t jl_compileropts = { NULL, // build_path
                                       JL_COMPILEROPT_CHECK_BOUNDS_DEFAULT,
                                       JL_COMPILEROPT_DUMPBITCODE_OFF,
                                       0,    // int_literals
-                                      JL_COMPILEROPT_COMPILE_DEFAULT
+                                      JL_COMPILEROPT_COMPILE_DEFAULT,
+                                      0     // opt_level
 };
 
 int jl_boot_file_loaded = 0;
diff --git a/src/julia.h b/src/julia.h
index 4f032204c1454..c349317a1eb58 100644
--- a/src/julia.h
+++ b/src/julia.h
@@ -1334,6 +1334,7 @@ typedef struct {
     int8_t dumpbitcode;
     int int_literals;
     int8_t compile_enabled;
+    int8_t opt_level;
 } jl_compileropts_t;
 
 extern DLLEXPORT jl_compileropts_t jl_compileropts;
diff --git a/ui/repl.c b/ui/repl.c
index be9c44edb55c5..31634902aacbc 100644
--- a/ui/repl.c
+++ b/ui/repl.c
@@ -85,12 +85,13 @@ static const char *opts =
     " --track-allocation={none|user|all}\n"
     "                          Count bytes allocated by each source line\n"
     " --check-bounds={yes|no}  Emit bounds checks always or never (ignoring declarations)\n"
+    " -O, --optimize           Run time-intensive code optimizations\n"
     " --int-literals={32|64}   Select integer literal size independent of platform\n"
     " --dump-bitcode={yes|no}  Dump bitcode for the system image (used with --build)\n";
 
 void parse_opts(int *argcp, char ***argvp)
 {
-    static char* shortopts = "+H:T:hJ:";
+    static char* shortopts = "+H:T:hJ:O";
     static struct option longopts[] = {
         { "home",          required_argument, 0, 'H' },
         { "tab",           required_argument, 0, 'T' },
@@ -101,6 +102,7 @@ void parse_opts(int *argcp, char ***argvp)
         { "code-coverage", optional_argument, 0, 'c' },
         { "track-allocation",required_argument, 0, 'm' },
         { "check-bounds",  required_argument, 0, 300 },
+        { "optimize",      no_argument,       0, 'O' },
         { "int-literals",  required_argument, 0, 301 },
         { "dump-bitcode",  required_argument, 0, 302 },
         { "compile",       required_argument, 0, 303 },
@@ -135,6 +137,9 @@ void parse_opts(int *argcp, char ***argvp)
         case 'h':
             printf("%s%s", usage, opts);
             exit(0);
+        case 'O':
+            jl_compileropts.opt_level = 1;
+            break;
         case 'c':
             if (optarg != NULL) {
                 if (!strcmp(optarg,"user"))