diff --git a/deps/llvm-3.3.patch b/deps/llvm-3.3.patch index 2027d6db20976..d46b6e6f26682 100644 --- a/deps/llvm-3.3.patch +++ b/deps/llvm-3.3.patch @@ -178,3 +178,420 @@ diff -u -r -N llvm-3.3.src/lib/Support/Host.cpp llvm-3.3/lib/Support/Host.cpp case 28: // Most 45 nm Intel Atom processors case 38: // 45 nm Atom Lincroft case 39: // 32 nm Atom Medfield +diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm-3.3/lib/Transforms/Vectorize/SLPVectorizer.cpp +--- llvm-3.3.src/lib/Transforms/Vectorize/SLPVectorizer.cpp 2013-04-30 16:04:51.000000000 -0500 ++++ llvm-3.3/lib/Transforms/Vectorize/SLPVectorizer.cpp 2014-04-15 16:26:12.200527154 -0500 +@@ -19,6 +19,7 @@ + #define DEBUG_TYPE SV_NAME + + #include "VecUtils.h" ++#include "llvm/ADT/SmallSet.h" + #include "llvm/Transforms/Vectorize.h" + #include "llvm/Analysis/AliasAnalysis.h" + #include "llvm/Analysis/ScalarEvolution.h" +@@ -128,7 +129,7 @@ + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + + /// \brief Try to vectorize a list of operands. +- bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R); ++ bool tryToVectorizeList(ArrayRef VL, BoUpSLP &R, InsertElementInst *IE = 0); + + /// \brief Try to vectorize a chain that may start at the operands of \V; + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); +@@ -179,7 +180,7 @@ + return tryToVectorizeList(VL, R); + } + +-bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R) { ++bool SLPVectorizer::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, InsertElementInst *IE) { + DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + + // Check that all of the parts are scalar. +@@ -191,10 +192,12 @@ + + int Cost = R.getTreeCost(VL); + int ExtrCost = R.getScalarizationCost(VL); +- DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << ++ DEBUG(dbgs()<<"SLP: Cost of list:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; +- DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); ++ DEBUG(dbgs()<<"SLP: Vectorizing list.\n"); ++ if (IE) ++ R.movePrematureInserts(VL,IE); + R.vectorizeArith(VL); + return true; + } +@@ -237,9 +240,49 @@ + return 0; + } + ++/// \brief Recognize construction of vectors like ++/// %ra = insertelement <4 x float> undef, float %s0, i32 0 ++/// %rb = insertelement <4 x float> %ra, float %s1, i32 1 ++/// %rc = insertelement <4 x float> %rb, float %s2, i32 2 ++/// %rd = insertelement <4 x float> %rc, float %s3, i32 3 ++/// ++/// Returns true if it matches. Sets \p Ops to the values inserted ++/// and \p Inserts to the insertelement instructions. ++/// ++static bool findBuildVector(InsertElementInst *IE, ++ SmallVectorImpl &Ops) { ++ if (!isa(IE->getOperand(0))) ++ return false; ++ ++ while (true) { ++ Ops.push_back(IE->getOperand(1)); ++ ++ if (IE->use_empty()) ++ return false; ++ ++ InsertElementInst *NextUse = dyn_cast(IE->use_back()); ++ if (!NextUse) ++ return true; ++ ++ // If this isn't the final use, make sure the next insertelement is the only ++ // use. It's OK if the final constructed vector is used multiple times ++ if (!IE->hasOneUse()) ++ return false; ++ ++ IE = NextUse; ++ } ++ ++ return false; ++} ++ + bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; ++ SmallSet VisitedInstrs; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { ++ // We may go through BB multiple times so skip the one we have checked. ++ if (!VisitedInstrs.insert(it)) ++ continue; ++ + if (isa(it)) continue; + + // Try to vectorize reductions that use PHINodes. +@@ -271,6 +314,21 @@ + Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); + continue; + } ++ ++ // Try to vectorize trees that start at insertelement instructions. ++ if (InsertElementInst *IE = dyn_cast(it)) { ++ SmallVector Ops; ++ if (!findBuildVector(IE, Ops)) ++ continue; ++ ++ if (tryToVectorizeList(Ops, R, IE)) { ++ Changed = true; ++ it = BB->begin(); ++ e = BB->end(); ++ } ++ ++ continue; ++ } + } + + return Changed; +diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.cpp llvm-3.3/lib/Transforms/Vectorize/VecUtils.cpp +--- llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.cpp 2013-04-21 03:05:59.000000000 -0500 ++++ llvm-3.3/lib/Transforms/Vectorize/VecUtils.cpp 2014-04-15 17:44:02.710904964 -0500 +@@ -243,7 +243,8 @@ + LaneMap.clear(); + MultiUserVals.clear(); + MustScalarize.clear(); +- ++ AllScalarized = true; ++ + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. + getTreeUses_rec(VL, 0); +@@ -275,7 +276,8 @@ + } + + // Now calculate the cost of vectorizing the tree. +- return getTreeCost_rec(VL, 0); ++ int treeCost = getTreeCost_rec(VL, 0); ++ return AllScalarized ? max_cost : treeCost; + } + + void BoUpSLP::getTreeUses_rec(ArrayRef VL, unsigned Depth) { +@@ -380,6 +382,55 @@ + } + } + ++#ifndef NDEBUG ++/// \returns The opcode if all of the Instructions in \p VL have the same ++/// opcode, or zero. ++static unsigned getSameOpcode(ArrayRef VL) { ++ Instruction *I0 = dyn_cast(VL[0]); ++ if (!I0) ++ return 0; ++ unsigned Opcode = I0->getOpcode(); ++ for (int i = 1, e = VL.size(); i < e; i++) { ++ Instruction *I = dyn_cast(VL[i]); ++ if (!I || Opcode != I->getOpcode()) ++ return 0; ++ } ++ return Opcode; ++} ++#endif /*NDEBUG*/ ++ ++/// \returns True if the ExtractElement instructions in VL can be vectorized ++/// to use the original vector. ++static bool CanReuseExtract(ArrayRef VL) { ++ assert(Instruction::ExtractElement == getSameOpcode(VL) && "Invalid opcode"); ++ // Check if all of the extracts come from the same vector and from the ++ // correct offset. ++ Value *VL0 = VL[0]; ++ ExtractElementInst *E0 = cast(VL0); ++ Value *Vec = E0->getOperand(0); ++ ++ // We have to extract from the same vector type. ++ unsigned NElts = Vec->getType()->getVectorNumElements(); ++ ++ if (NElts != VL.size()) ++ return false; ++ ++ // Check that all of the indices extract from the correct offset. ++ ConstantInt *CI = dyn_cast(E0->getOperand(1)); ++ if (!CI || CI->getZExtValue()) ++ return false; ++ ++ for (unsigned i = 1, e = VL.size(); i < e; ++i) { ++ ExtractElementInst *E = cast(VL[i]); ++ ConstantInt *CI = dyn_cast(E->getOperand(1)); ++ ++ if (!CI || CI->getZExtValue() != i || E->getOperand(0) != Vec) ++ return false; ++ } ++ ++ return true; ++} ++ + int BoUpSLP::getTreeCost_rec(ArrayRef VL, unsigned Depth) { + Type *ScalarTy = VL[0]->getType(); + +@@ -457,6 +508,20 @@ + } + + switch (Opcode) { ++ case Instruction::ExtractElement: { ++ if (CanReuseExtract(VL)) { ++ int DeadCost = 0; ++ for (unsigned i = 0, e = VL.size(); i < e; ++i) { ++ ExtractElementInst *E = cast(VL[i]); ++ if (E->hasOneUse()) ++ // Take credit for instruction that will become dead. ++ DeadCost += TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i); ++ } ++ DEBUG(dbgs() << "SLP: taking credit of " << -DeadCost << "\n"); ++ return -DeadCost; ++ } ++ return getScalarizationCost(VecTy); ++ } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: +@@ -490,6 +555,7 @@ + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + Cost += (VecCost - ScalarCost); ++ AllScalarized = false; + return Cost; + } + case Instruction::Add: +@@ -528,6 +594,7 @@ + + int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + Cost += (VecCost - ScalarCost); ++ AllScalarized = false; + return Cost; + } + case Instruction::Load: { +@@ -540,6 +607,7 @@ + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); ++ AllScalarized = false; + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { +@@ -556,6 +624,7 @@ + } + + int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); ++ AllScalarized = false; + return TotalCost; + } + default: +@@ -630,6 +699,16 @@ + } + + switch (Opcode) { ++ case Instruction::ExtractElement: { ++ Value *S; ++ if (CanReuseExtract(VL) ) { ++ S = VL0->getOperand(0); ++ } else { ++ S = Scalarize(VL, VecTy); ++ } ++ VectorizedValues[VL0] = S; ++ return S; ++ } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: +@@ -727,4 +806,32 @@ + } + } + ++void BoUpSLP::movePrematureInserts(ArrayRef VL, InsertElementInst *IE) { ++ Instruction *VL0 = cast(VL[0]); ++ int MyLastIndex = InstrIdx[VL0]; ++ for (unsigned i = 1, e = VL.size(); i < e; ++i ) ++ MyLastIndex = std::max(MyLastIndex, InstrIdx[VL[i]]); ++ BasicBlock *BB = cast(VL0)->getParent(); ++ bool moved = false; ++ DEBUG(dbgs() << "SLP: Moving premature inserts\n"); ++ Instruction* x = InstrVec[MyLastIndex]; ++ while (IE->getParent()==BB) { ++ int UserIndex = InstrIdx[IE]; ++ if (UserIndex >= MyLastIndex) { ++ // Walked past transformed region ++ break; ++ } ++ IE->removeFromParent(); ++ IE->insertAfter(x); ++ DEBUG(dbgs() << "SLP: Rescheduled: " << *IE << ".\n"); ++ moved = true; ++ x = IE; ++ IE = dyn_cast(IE->use_back()); ++ if (!IE) ++ break; ++ } ++ if (moved) ++ numberInstructions(); ++} ++ + } // end of namespace +diff -u -r -N llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.h llvm-3.3/lib/Transforms/Vectorize/VecUtils.h +--- llvm-3.3.src/lib/Transforms/Vectorize/VecUtils.h 2013-04-20 04:49:10.000000000 -0500 ++++ llvm-3.3/lib/Transforms/Vectorize/VecUtils.h 2014-04-15 17:38:41.549776444 -0500 +@@ -77,6 +77,9 @@ + /// sequences. + ValueList &getGatherSeqInstructions() {return GatherInstructions; } + ++ /// \brief Move InsertElement instructions with indices preceding LastIndex ++ /// \p IE is the root of a chain identified by findBuildVector. ++ void movePrematureInserts(ArrayRef VL, InsertElementInst *IE); + private: + /// \brief This method contains the recursive part of getTreeCost. + int getTreeCost_rec(ArrayRef VL, unsigned Depth); +@@ -126,6 +129,8 @@ + /// by multiple lanes, or by users outside the tree. + /// NOTICE: The vectorization methods also use this set. + ValueSet MustScalarize; ++ /// True if all values must be scalarized, i.e. vectorization is worthless. ++ bool AllScalarized; + + /// Contains a list of values that are used outside the current tree. This + /// set must be reset between runs. +diff -u -r -N llvm-3.3.src/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll llvm-3.3/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll +--- llvm-3.3.src/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll 1969-12-31 18:00:00.000000000 -0600 ++++ llvm-3.3/test/Transforms/SLPVectorizer/X86/insert-element-build-vector.ll 2014-04-15 14:13:12.738690168 -0500 +@@ -0,0 +1,53 @@ ++target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-n8:16:32:64-S128" ++ ++target triple = "x86_64-apple-macosx10.8.0" ++ ++; RUN: opt -S -slp-vectorizer -slp-threshold=0 < %s | FileCheck %s ++ ++; Check that cost model for vectorization takes credit for ++; instructions that are erased. ++define <4 x float> @take_credit(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: @take_credit( ++; CHECK: %1 = fadd <4 x float> %a, %b ++ %a0 = extractelement <4 x float> %a, i32 0 ++ %b0 = extractelement <4 x float> %b, i32 0 ++ %c0 = fadd float %a0, %b0 ++ %a1 = extractelement <4 x float> %a, i32 1 ++ %b1 = extractelement <4 x float> %b, i32 1 ++ %c1 = fadd float %a1, %b1 ++ %a2 = extractelement <4 x float> %a, i32 2 ++ %b2 = extractelement <4 x float> %b, i32 2 ++ %c2 = fadd float %a2, %b2 ++ %a3 = extractelement <4 x float> %a, i32 3 ++ %b3 = extractelement <4 x float> %b, i32 3 ++ %c3 = fadd float %a3, %b3 ++ %v0 = insertelement <4 x float> undef, float %c0, i32 0 ++ %v1 = insertelement <4 x float> %v0, float %c1, i32 1 ++ %v2 = insertelement <4 x float> %v1, float %c2, i32 2 ++ %v3 = insertelement <4 x float> %v2, float %c3, i32 3 ++ ret <4 x float> %v3 ++} ++ ++; Make sure that vectorization happens even if extractelement operations ++; must be rescheduled. The case here is from compiling Julia. ++define <4 x float> @reschedule_extract(<4 x float> %a, <4 x float> %b) { ++; CHECK-LABEL: @reschedule_extract( ++; CHECK: %1 = fadd <4 x float> %a, %b ++ %a0 = extractelement <4 x float> %a, i32 0 ++ %b0 = extractelement <4 x float> %b, i32 0 ++ %c0 = fadd float %a0, %b0 ++ %v0 = insertelement <4 x float> undef, float %c0, i32 0 ++ %a1 = extractelement <4 x float> %a, i32 1 ++ %b1 = extractelement <4 x float> %b, i32 1 ++ %c1 = fadd float %a1, %b1 ++ %v1 = insertelement <4 x float> %v0, float %c1, i32 1 ++ %a2 = extractelement <4 x float> %a, i32 2 ++ %b2 = extractelement <4 x float> %b, i32 2 ++ %c2 = fadd float %a2, %b2 ++ %v2 = insertelement <4 x float> %v1, float %c2, i32 2 ++ %a3 = extractelement <4 x float> %a, i32 3 ++ %b3 = extractelement <4 x float> %b, i32 3 ++ %c3 = fadd float %a3, %b3 ++ %v3 = insertelement <4 x float> %v2, float %c3, i32 3 ++ ret <4 x float> %v3 ++} +diff -u -r -N llvm-3.3.src/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll llvm-3.3/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll +--- llvm-3.3.src/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll 1969-12-31 18:00:00.000000000 -0600 ++++ llvm-3.3/test/Transforms/SLPVectorizer/X86/non-vectorizable-intrinsic.ll 2014-04-15 15:42:30.448560440 -0500 +@@ -0,0 +1,36 @@ ++; RUN: opt < %s -slp-vectorizer -o - -S -slp-threshold=-1000 ++ ++target datalayout = "e-p:32:32-i64:64-v16:16-v32:32-n16:32:64" ++target triple = "nvptx--nvidiacl" ++ ++; CTLZ cannot be vectorized currently because the second argument is a scalar ++; for both the scalar and vector forms of the intrinsic. In the future it ++; should be possible to vectorize such functions. ++; Test causes an assert if LLVM tries to vectorize CTLZ. ++ ++define <2 x i8> @cltz_test(<2 x i8> %x) #0 { ++entry: ++ %0 = extractelement <2 x i8> %x, i32 0 ++ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) ++ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0 ++ %1 = extractelement <2 x i8> %x, i32 1 ++ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) ++ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 ++ ret <2 x i8> %vecinit2 ++} ++ ++define <2 x i8> @cltz_test2(<2 x i8> %x) #1 { ++entry: ++ %0 = extractelement <2 x i8> %x, i32 0 ++ %1 = extractelement <2 x i8> %x, i32 1 ++ %call.i = call i8 @llvm.ctlz.i8(i8 %0, i1 false) ++ %call.i4 = call i8 @llvm.ctlz.i8(i8 %1, i1 false) ++ %vecinit = insertelement <2 x i8> undef, i8 %call.i, i32 0 ++ %vecinit2 = insertelement <2 x i8> %vecinit, i8 %call.i4, i32 1 ++ ret <2 x i8> %vecinit2 ++} ++ ++declare i8 @llvm.ctlz.i8(i8, i1) #3 ++ ++attributes #0 = { alwaysinline nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } ++attributes #1 = { nounwind readnone } diff --git a/src/codegen.cpp b/src/codegen.cpp index 5bcdfa0c489db..90a7c3f6d807b 100644 --- a/src/codegen.cpp +++ b/src/codegen.cpp @@ -4695,6 +4695,8 @@ static void init_julia_llvm_env(Module *m) jl_TargetMachine->addAnalysisPasses(*FPM); #endif FPM->add(createTypeBasedAliasAnalysisPass()); + if (jl_compileropts.opt_level>=1) + FPM->add(createBasicAliasAnalysisPass()); // list of passes from vmkit FPM->add(createCFGSimplificationPass()); // Clean up disgusting code FPM->add(createPromoteMemoryToRegisterPass());// Kill useless allocas @@ -4740,11 +4742,11 @@ static void init_julia_llvm_env(Module *m) #else FPM->add(createLoopUnrollPass()); // Unroll small loops #endif - //FPM->add(createLoopStrengthReducePass()); // (jwb added) - -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR >= 3 && !defined(INSTCOMBINE_BUG) +#if LLVM33 && !LLVM35 && !defined(INSTCOMBINE_BUG) FPM->add(createLoopVectorizePass()); // Vectorize loops #endif + //FPM->add(createLoopStrengthReducePass()); // (jwb added) + #ifndef INSTCOMBINE_BUG FPM->add(createInstructionCombiningPass()); // Clean up after the unroller #endif @@ -4761,8 +4763,20 @@ static void init_julia_llvm_env(Module *m) #endif FPM->add(createJumpThreadingPass()); // Thread jumps FPM->add(createDeadStoreEliminationPass()); // Delete dead stores +#if LLVM33 && !defined(INSTCOMBINE_BUG) + if (jl_compileropts.opt_level>=1) + FPM->add(createSLPVectorizerPass()); // Vectorize straight-line code +#endif FPM->add(createAggressiveDCEPass()); // Delete dead instructions +#if LLVM33 && !defined(INSTCOMBINE_BUG) + if (jl_compileropts.opt_level>=1) + FPM->add(createInstructionCombiningPass()); // Clean up after SLP loop vectorizer +#endif +#if LLVM35 + FPM->add(createLoopVectorizePass()); // Vectorize loops + FPM->add(createInstructionCombiningPass()); // Clean up after loop vectorizer +#endif //FPM->add(createCFGSimplificationPass()); // Merge & remove BBs FPM->doInitialization(); diff --git a/src/init.c b/src/init.c index 1e7d59f2a2e90..9519582e2c661 100644 --- a/src/init.c +++ b/src/init.c @@ -85,7 +85,8 @@ jl_compileropts_t jl_compileropts = { NULL, // build_path JL_COMPILEROPT_CHECK_BOUNDS_DEFAULT, JL_COMPILEROPT_DUMPBITCODE_OFF, 0, // int_literals - JL_COMPILEROPT_COMPILE_DEFAULT + JL_COMPILEROPT_COMPILE_DEFAULT, + 0 // opt_level }; int jl_boot_file_loaded = 0; diff --git a/src/julia.h b/src/julia.h index 4f032204c1454..c349317a1eb58 100644 --- a/src/julia.h +++ b/src/julia.h @@ -1334,6 +1334,7 @@ typedef struct { int8_t dumpbitcode; int int_literals; int8_t compile_enabled; + int8_t opt_level; } jl_compileropts_t; extern DLLEXPORT jl_compileropts_t jl_compileropts; diff --git a/ui/repl.c b/ui/repl.c index be9c44edb55c5..31634902aacbc 100644 --- a/ui/repl.c +++ b/ui/repl.c @@ -85,12 +85,13 @@ static const char *opts = " --track-allocation={none|user|all}\n" " Count bytes allocated by each source line\n" " --check-bounds={yes|no} Emit bounds checks always or never (ignoring declarations)\n" + " -O, --optimize Run time-intensive code optimizations\n" " --int-literals={32|64} Select integer literal size independent of platform\n" " --dump-bitcode={yes|no} Dump bitcode for the system image (used with --build)\n"; void parse_opts(int *argcp, char ***argvp) { - static char* shortopts = "+H:T:hJ:"; + static char* shortopts = "+H:T:hJ:O"; static struct option longopts[] = { { "home", required_argument, 0, 'H' }, { "tab", required_argument, 0, 'T' }, @@ -101,6 +102,7 @@ void parse_opts(int *argcp, char ***argvp) { "code-coverage", optional_argument, 0, 'c' }, { "track-allocation",required_argument, 0, 'm' }, { "check-bounds", required_argument, 0, 300 }, + { "optimize", no_argument, 0, 'O' }, { "int-literals", required_argument, 0, 301 }, { "dump-bitcode", required_argument, 0, 302 }, { "compile", required_argument, 0, 303 }, @@ -135,6 +137,9 @@ void parse_opts(int *argcp, char ***argvp) case 'h': printf("%s%s", usage, opts); exit(0); + case 'O': + jl_compileropts.opt_level = 1; + break; case 'c': if (optarg != NULL) { if (!strcmp(optarg,"user"))