From 8bce605503af86692e00f5d0ca9d2e72b6efa919 Mon Sep 17 00:00:00 2001 From: "Dongjia \"toka\" Zhang" Date: Tue, 16 Apr 2024 16:51:28 +0200 Subject: [PATCH] DDFuzz (#2056) * push * add * book * ddg * works --- libafl_cc/build.rs | 11 + libafl_cc/src/clang.rs | 5 + libafl_cc/src/ddg-instr.cc | 786 +++++++++++++++++++++++++++++++++ libafl_cc/src/ddg-utils.cc | 114 +++++ libafl_cc/src/ddg-utils.h | 120 +++++ libafl_targets/src/cmplog.c | 6 +- libafl_targets/src/coverage.c | 3 + libafl_targets/src/coverage.rs | 9 + scripts/fmt_all.sh | 6 +- 9 files changed, 1053 insertions(+), 7 deletions(-) create mode 100644 libafl_cc/src/ddg-instr.cc create mode 100644 libafl_cc/src/ddg-utils.cc create mode 100644 libafl_cc/src/ddg-utils.h diff --git a/libafl_cc/build.rs b/libafl_cc/build.rs index f88ac511cc..7ce0c269f9 100644 --- a/libafl_cc/build.rs +++ b/libafl_cc/build.rs @@ -402,6 +402,17 @@ pub const LIBAFL_CC_LLVM_VERSION: Option = None; ldflags.push(&sdk_path); }; + build_pass( + bindir_path, + out_dir, + &cxxflags, + &ldflags, + src_dir, + "ddg-instr.cc", + Some(&vec!["ddg-utils.cc"]), + false, + ); + for pass in &[ "cmplog-routines-pass.cc", "autotokens-pass.cc", diff --git a/libafl_cc/src/clang.rs b/libafl_cc/src/clang.rs index 45bc4eb786..200c8ab4bd 100644 --- a/libafl_cc/src/clang.rs +++ b/libafl_cc/src/clang.rs @@ -41,6 +41,8 @@ pub enum LLVMPasses { CmpLogInstructions, /// Instrument caller for sancov coverage Ctx, + /// Data dependency instrumentation + DDG, } impl LLVMPasses { @@ -64,6 +66,9 @@ impl LLVMPasses { LLVMPasses::Ctx => { PathBuf::from(env!("OUT_DIR")).join(format!("ctx-pass.{}", dll_extension())) } + LLVMPasses::DDG => { + PathBuf::from(env!("OUT_DIR")).join(format!("ddg-instr.{}", dll_extension())) + } } } } diff --git a/libafl_cc/src/ddg-instr.cc b/libafl_cc/src/ddg-instr.cc new file mode 100644 index 0000000000..b238ae127f --- /dev/null +++ b/libafl_cc/src/ddg-instr.cc @@ -0,0 +1,786 @@ +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Comdat.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/Analysis/PostDominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/ASanStackFrameLayout.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" + +// #include "WPA/WPAPass.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ddg-utils.h" +#include "common-llvm.h" + +#define MAX_DEPTH 3 +#define MIN_FCN_SIZE 1 +#define VAR_NAME_LEN 264 + +#define MAP_SIZE LIBAFL_EDGES_MAP_SIZE +// #define MAP_SIZE 65536 +#define ALL_BIT_SET (MAP_SIZE - 1) +// #define MAP_SIZE 255 + +// #define INTERPROCEDURAL 1 // unset if you want only intraprocedural ret +// values management BUT #define LOAD_INSTR // considers loads as +// stores + +// #define DEBUG 1 // set if you want debug prints enabled + +#define AFL_SR(s) (srandom(s)) +#define AFL_R(x) (random() % (x)) + +#ifdef DEBUG + #define DEBUG(X) \ + do { \ + X; \ + } while (false) +#else + #define DEBUG(X) ((void)0) +#endif + +using namespace llvm; +// using namespace svf; + +class DDGInstrModulePass : public PassInfoMixin { + private: + void InsertDataFlow(Value *Operand, Value *Res) { + std::map>::iterator it = + this->DataFlowTracker.begin(); + while (it != this->DataFlowTracker.end()) { + std::vector Slice = it->second; + std::vector::iterator jt; + for (jt = Slice.begin(); jt != Slice.end(); ++jt) { + if (Operand == *jt) { + this->DataFlowTracker[it->first].push_back(Res); + break; + } + } + it++; + } + } + + void RetrieveDataFlow(Value *V, std::vector *Dependencies) { + std::map>::iterator it = + this->DataFlowTracker.begin(); + while (it != this->DataFlowTracker.end()) { + std::vector Slice = it->second; + std::vector::iterator jt; + for (jt = Slice.begin(); jt != Slice.end(); ++jt) { + if (V == *jt) { + Dependencies->push_back(it->first); + break; + } + } + it++; + } + } + + bool isSourceCodeVariable(Value *Variable) { + std::map>::iterator it = + this->DataFlowTracker.find(Variable); + return it != this->DataFlowTracker.end(); + } + + bool isLLVMVariable(Value *Variable, + std::map *LLVMVariables) { + std::map::iterator it = + LLVMVariables->find(Variable); + return it != LLVMVariables->end(); + } + + void CreateDataFlow(Value *Variable) { + std::map>::iterator it = + this->DataFlowTracker.find(Variable); + if (it == this->DataFlowTracker.end()) { + this->DataFlowTracker[Variable].push_back(Variable); + } + } + + // When we have `Store A, B`, we want to know that exactly B reperensents. In + // the default case, it is a source code variable and so we're done. BUT, in + // many cases B could represent the field of a struct, or a location whithin a + // buffer. So, we need to recover what B represents to be more precise when we + // define the dependency relationship. + void RetrieveAccessedVariable(Value *Variable, std::vector *Flows, + std::map *LLVMVariables, + Value **ActualSrcVariable) { + if (isLLVMVariable(Variable, LLVMVariables)) { + // If it is an LLVM variable (mostly for struct fields), we have it + // tracked down in the LLVMVariables list, so we just need to parse the + // GEP inst + Instruction *DefiningInstruction = (*LLVMVariables)[Variable]; + // For now we only handle the GEP instructions, maybe in future + // it could be useful to implement other instructions + if (auto GEP = dyn_cast(DefiningInstruction)) { + Value *PtrOperand = GEP->getPointerOperand(); + Variable = PtrOperand; + *ActualSrcVariable = PtrOperand; + if (isSourceCodeVariable(PtrOperand)) { + // We finally could connect an LLVM variable to an actual Source code + // Variable! + for (unsigned int i = 1; i < DefiningInstruction->getNumOperands(); + i++) { // Starts from 1, since 0 is thr PtrOperand + Value *Op = DefiningInstruction->getOperand(i); + if (!isa(Op)) { RetrieveDataFlow(Op, Flows); } + } + return; + } else { + // Re-itereate the Variable analysis + RetrieveAccessedVariable(Variable, Flows, LLVMVariables, + ActualSrcVariable); + } + for (unsigned int i = 1; i < DefiningInstruction->getNumOperands(); + i++) { // Starts from 1, since 0 is thr PtrOperand + Value *Op = DefiningInstruction->getOperand(i); + if (!isa(Op)) { RetrieveDataFlow(Op, Flows); } + } + } + } else { + // If it is not a GEP-defined llvm variable, we basically use the DataFlow + // Tracker, to retrieve the dependency of this variable. The idea is that, + // if this llvm variable is not GEP-depending, it should be easier to + // retrieve what it does represent + std::vector TmpFlow; + RetrieveDataFlow(Variable, &TmpFlow); + if (TmpFlow.size() == 1) { + *ActualSrcVariable = TmpFlow[0]; + // We found a Source Code variable (Variable->getName()) + return; + } else if (TmpFlow.size() > 1) { + *ActualSrcVariable = TmpFlow[0]; + DEBUG(errs() << "[Warning] multiple flows for the same GEP access, " + "choosing the first one\n"); + } else { + return; + } + } + } + + public: + static char ID; + FunctionCallee logger; + Type *VoidTy; + std::map> DataFlowTracker; + + PreservedAnalyses run(Module &M, ModuleAnalysisManager &MAM) { + LLVMContext &C = M.getContext(); + + auto &FAM = + MAM.getResult(M).getManager(); + auto DTCallback = [&FAM](Function &F) -> DominatorTree * { + return &FAM.getResult(F); + }; + + auto PDTCallback = [&FAM](Function &F) -> PostDominatorTree * { + return &FAM.getResult(F); + }; + + auto LICallback = [&FAM](Function &F) -> LoopInfo * { + return &FAM.getResult(F); + }; + + IntegerType *Int16Ty = IntegerType::getInt16Ty(C); + IntegerType *Int8Ty = IntegerType::getInt8Ty(C); + // IntegerType *Int32Ty = IntegerType::getInt32Ty(C); + ConstantInt *Zero = ConstantInt::get(Int8Ty, 0); + ConstantInt *One = ConstantInt::get(Int8Ty, 1); + unsigned int instrumentedLocations = 0; + + std::map BlocksLocs; + std::map VisitedBlocks; + ConstantInt *Visited = ConstantInt::get(Int16Ty, 0xff); + ConstantInt *NonVisited = ConstantInt::get(Int16Ty, 0); + ConstantInt *CurLoc; + char *name = nullptr; + unsigned BBCounter = 0; + + unsigned bb_count = 0; + unsigned int cur_loc = 0; + uint32_t map_size = MAP_SIZE; + + struct timeval tv; + struct timezone tz; + unsigned int rand_seed; + + /* Setup random() so we get Actually Random(TM) outputs from AFL_R() */ + gettimeofday(&tv, &tz); + rand_seed = tv.tv_sec ^ tv.tv_usec ^ getpid(); + AFL_SR(rand_seed); + + GlobalVariable *DDGMapPtr = M.getGlobalVariable("__ddg_area_ptr"); + if (DDGMapPtr == nullptr) + DDGMapPtr = + new GlobalVariable(M, PointerType::get(Int8Ty, 0), false, + GlobalValue::ExternalLinkage, 0, "__ddg_area_ptr"); + +#ifdef INTERPROCEDURAL + // For each function we store the return Values + std::map> ReturnValues; + + for (auto &F : M) { + if (F.size() < MIN_FCN_SIZE) continue; + + for (auto &BB : F) { + for (auto &I : BB) { + if (auto RI = dyn_cast(&I)) { + Value *RetVal = RI->getReturnValue(); + if (RetVal) { ReturnValues[&F].push_back(RI); } + } + } + } + } +#endif + + for (auto &F : M) { + if (F.size() < MIN_FCN_SIZE) continue; + + std::map> + Stores; // Represents the nodes of our DataDep Graph + std::vector> + StoreEdges; // Contains the edges of the DDG + std::map> + IncomingEdges; // Map s.t. key is a BB and value is a set of BBs + // whose data flow reaches the key + std::map + LLVMVariables; // LLVM IR Variables which are used as Operands for + // the store (for instance, the ones resulting from a + // GEP) + + BasicBlock &EntryBB = F.getEntryBlock(); + Instruction *FirstInst = EntryBB.getFirstNonPHIOrDbg(); + + // First we add the function params to track the dataflow + for (Function::arg_iterator arg_it = F.arg_begin(); arg_it != F.arg_end(); + arg_it++) { + Argument *Arg = arg_it; + if (Value *ArgVariable = dyn_cast(Arg)) { + CreateDataFlow(ArgVariable); + FlowWriteInstruction *MyStore = + new FlowWriteInstruction(&EntryBB, FirstInst, declaration); + Stores[ArgVariable].push_back(MyStore); + } + } + + LoopInfo *LI = LICallback(F); + DominatorTree *DT = DTCallback(F); + PostDominatorTree *PT = PDTCallback(F); + + // We basically want to track data flow between memory instructions + // and call instructions (i.e., the arguments) + + // Here we extract the data dependence info for function F + for (auto &BB : F) { + BBCounter += 1; + for (auto &I : BB) { + // We track all variables "Alloca" derived and we add them to the + // RootNode + if (auto AI = dyn_cast(&I)) { + Value *Variable = static_cast(AI); + CreateDataFlow(Variable); + } + + if (auto LOI = dyn_cast(&I)) { + Value *Variable = LOI->getPointerOperand(); + CreateDataFlow(Variable); +#ifdef LOAD_INSTR + std::vector Flows; + RetrieveDataFlow(Variable, &Flows); + + // If `Variable` does not directly represent a Src code variable, we + // fetch what it represents (e.g., the field of a struct) + if (!isSourceCodeVariable(Variable)) { + Value *ActualSrcVariable = nullptr; + RetrieveAccessedVariable(Variable, &Flows, &LLVMVariables, + &ActualSrcVariable); + if (ActualSrcVariable) Variable = ActualSrcVariable; + } + + for (std::vector::iterator it = Flows.begin(); + it != Flows.end(); ++it) { + Value *Dependency = *it; + + // First we find the edges between the current store and the + // previous ones (i.e., when we wrote into `c` and `b` if the + // current store is `a = c + b`) + std::vector AllStoresPerVariable = + Stores[Dependency]; + unsigned ConsideredStores = 0; + bool *ReachingStores = isReachableByStore( + &AllStoresPerVariable, LOI, &DT, &LI, &ConsideredStores); + + // ReachingStores[0] refers to the last Store instruction that we + // met (i.e., the last in `AllStoresPerVariable` This is why we + // iterate the vector in a reverse way BUT the array in the + // forward + unsigned i = 0; + for (std::vector::reverse_iterator it = + AllStoresPerVariable.rbegin(); + it != AllStoresPerVariable.rend(); it++) { + if (ReachingStores[i] && (i < ConsideredStores)) { + Instruction *Src = (*it)->I; + if (Src == + LOI) // Already managed in the `reachableByStores` method + continue; + if (Src->getParent() != LOI->getParent()) { + StoreEdges.push_back(edge); + IncomingEdges[LOI->getParent()].insert(LOI->getParent()); + DEBUG(errs() << "+++++++++++\nAdding edge\n"); + DEBUG(debug_instruction(Src)); + DEBUG(debug_instruction(LOI)); + DEBUG(errs() << "-----------\n"); + } + } + i++; + } + + delete[] ReachingStores; + } + // Then we insert the new Store in our map that contains all the + // stores, so we build forward deps + FlowWriteInstruction *MyStore = + new FlowWriteInstruction(LOI->getParent(), LOI, declaration); + Stores[Variable].push_back(MyStore); +#endif + } + + if (auto GEP = dyn_cast( + &I)) { // We dedicate an list for GEPs defined llvm vars. + Value *Var = static_cast( + &I); // For other LLVM variables, we use the DataflowTracker + LLVMVariables[Var] = GEP; + } + + // We propagate the dependency info + Value *Result = static_cast(&I); + if (Result and + !isa( + I)) { // We exclude CallInst, as they're managed separately + // (Not excluding them now, would introduce a double + // dependency leading to the same value) + for (unsigned int i = 0; i < I.getNumOperands(); i++) { + Value *Op = I.getOperand(i); + if (!isa(Op)) InsertDataFlow(Op, Result); + } + } +#ifdef INTERPROCEDURAL + else if (Result and isa(I)) { + CallInst *CI = dyn_cast(&I); + Function *CalledFunction = CI->getCalledFunction(); + std::map>::iterator it = + ReturnValues.find(CalledFunction); + if (it != ReturnValues.end()) { + std::vector RetValsInstrs = it->second; + for (std::vector::iterator jt = + RetValsInstrs.begin(); + jt != RetValsInstrs.end(); jt++) { + Instruction *In = *jt; + ReturnInst *Ret = static_cast(In); + Value *RV = Ret->getReturnValue(); + CreateDataFlow(RV); + InsertDataFlow(RV, Result); // We indicate dependency between + // retval and call site + Stores[RV].push_back(new FlowWriteInstruction( + Ret->getParent(), Ret, declaration)); + } + } + } +#endif + // We create the actual DDG depending on mem accesses and Call + // instructions + if (auto ST = dyn_cast(&I)) { + Value *Variable = ST->getPointerOperand(); // Where we're writing + Value *Access = ST->getValueOperand(); // What we're writing, this + // gives us the dependencies + // The current Store is writing `Access` into `Variable` + + std::vector Flows; + RetrieveDataFlow(Access, &Flows); + + // If `Variable` does not directly represent a Src code variable, we + // fetch what it represents (e.g., the field of a struct) + if (!isSourceCodeVariable(Variable)) { + Value *ActualSrcVariable = nullptr; + RetrieveAccessedVariable(Variable, &Flows, &LLVMVariables, + &ActualSrcVariable); + if (ActualSrcVariable) Variable = ActualSrcVariable; + } + + StoreType Type = declaration; // Usually we have `a = c + b` + for (std::vector::iterator it = Flows.begin(); + it != Flows.end(); ++it) { + Value *Dependency = *it; + if (Dependency == Variable) // If we fall into `a += c + b`, we + // manage differently + Type = modification; // Probably we dont need this distinction + // anymore, but keep it for future + // experiments + + // First we find the edges between the current store and the + // previous ones (i.e., when we wrote into `c` and `b` if the + // current store is `a = c + b`) + std::vector AllStoresPerVariable = + Stores[Dependency]; + unsigned ConsideredStores = 0; + bool *ReachingStores = isReachableByStore( + &AllStoresPerVariable, ST, DT, LI, &ConsideredStores); + + // ReachingStores[0] refers to the last Store instruction that we + // met (i.e., the last in `AllStoresPerVariable` This is why we + // iterate the vector in a reverse way BUT the array in the + // forward + unsigned i = 0; + for (std::vector::reverse_iterator it = + AllStoresPerVariable.rbegin(); + it != AllStoresPerVariable.rend(); it++) { + if (ReachingStores[i] && (i < ConsideredStores)) { + Instruction *Src = (*it)->I; + if (Src == + ST) // Already managed in the `reachableByStores` method + continue; + if (isPredecessorBB(Src, + ST)) // Already managed by edge coverage + continue; +#if LLVM_VERSION_MAJOR == 9 + BasicBlock *SrcParent = Src->getParent(); + BasicBlock *STParent = ST->getParent(); + if (PT->dominates(SrcParent, STParent)) +#else + if (PT->dominates(Src, ST)) +#endif + continue; + if (Src->getParent() != ST->getParent()) { + std::tuple edge = + decltype(edge){Src->getParent(), ST->getParent()}; + StoreEdges.push_back(edge); + IncomingEdges[ST->getParent()].insert(Src->getParent()); + DEBUG(errs() << "+++++++++++\nAdding edge\n"); + DEBUG(debug_instruction(Src)); + DEBUG(debug_instruction(ST)); + DEBUG(errs() << "-----------\n"); + } + } + i++; + } + + delete[] ReachingStores; + } + // Then we insert the new Store in our map that contains all the + // stores, so we build forward deps + FlowWriteInstruction *MyStore = + new FlowWriteInstruction(ST->getParent(), ST, Type); + Stores[Variable].push_back(MyStore); + + } + // Three major cases: + // 1) a = foo(x) => a depends on the result of foo() applied + // on x and x depends on its previous values and return value 2) + // memcpy(src, dst, N) => dst depends on src and N && the triple src, + // dst, N depends on their previous value (memcpy or any other API) 3) + // foo(x, out_y, out_z) => out_x, out_y are writen within foo + // depending on x. Thus here the dependency is managed internally to + // the function when passing on it + else if (CallInst *Call = dyn_cast(&I)) { + FlowWriteInstruction *MyStore = nullptr; + Value *Variable = nullptr; + Function *FC = Call->getCalledFunction(); + // DEBUG(errs() << "Looking for dependencies when calling " << + // FC->getName() << "\n"); + int argStart = + 0; // In some cases, we dont want to track dependencies for + // each argument. For instance, for memcpy(src, dst, n), we + // can ignore previous `src` dependencies, since it is being + // written. Rather, for this specific case, we generate a + // FlowWriteInstruction object to save the fact that `src` + // internal value has been modified according to `dst` and + // `n` + + if (FC == nullptr) continue; + if (FC->isIntrinsic()) { + switch (FC->getIntrinsicID()) { + case Intrinsic::memcpy: { + Variable = Call->getArgOperand(0); + std::vector Flows; + RetrieveDataFlow(Variable, &Flows); + if (Flows.size() != 0) Variable = Flows[0]; + MyStore = new FlowWriteInstruction(Call->getParent(), Call, + declaration); + argStart = 1; + break; + } + case Intrinsic::memset: { + // memset does not produce a real dataflow + // errs() << "memset to implement\n"; + break; + } + case Intrinsic::memmove: { + Variable = Call->getArgOperand(0); + std::vector Flows; + RetrieveDataFlow(Variable, &Flows); + if (Flows.size() != 0) Variable = Flows[0]; + MyStore = new FlowWriteInstruction(Call->getParent(), Call, + declaration); + argStart = 1; + break; + } + default: { + // errs() << "Not implemented/interesting intrinsic for data + // flow\n"; + break; + } + } + } + for (unsigned int i = argStart; i < Call->arg_size(); i++) { + Value *ArgOp = Call->getArgOperand(i); + if (!isa(ArgOp)) { + std::vector Flows; + RetrieveDataFlow(ArgOp, &Flows); + + for (std::vector::iterator it = Flows.begin(); + it != Flows.end(); ++it) { + Value *Dependency = *it; + // DEBUG(errs() << "Call depending on: {" << + // Dependency->getName() << "}\n"); + std::vector AllStoresPerVariable = + Stores[Dependency]; + unsigned ConsideredStores = 0; + bool *ReachingStores = isReachableByStore( + &AllStoresPerVariable, Call, DT, LI, &ConsideredStores); + unsigned i = 0; + for (std::vector::reverse_iterator + it = AllStoresPerVariable.rbegin(); + it != AllStoresPerVariable.rend(); it++) { + if (ReachingStores[i] && (i < ConsideredStores)) { + Instruction *Src = (*it)->I; + if (Src == Call) // Already managed in the + // `reachableByStores` method + continue; + if (isPredecessorBB(Src, Call)) continue; +#if LLVM_VERSION_MAJOR == 9 + BasicBlock *SrcParent = Src->getParent(); + BasicBlock *CallParent = Call->getParent(); + if (PT->dominates(SrcParent, CallParent)) +#else + if (PT->dominates(Src, Call)) +#endif + continue; + if (Src->getParent() != Call->getParent()) { + std::tuple edge = + decltype(edge){Src->getParent(), Call->getParent()}; + StoreEdges.push_back(edge); + IncomingEdges[Call->getParent()].insert( + Src->getParent()); + DEBUG(errs() << "+++++++++++\nAdding edge\n"); + DEBUG(debug_instruction(Src)); + DEBUG(debug_instruction(Call)); + DEBUG(errs() << "-----------\n"); + } + } + i++; + } + } + } + } + if (Variable != nullptr && MyStore != nullptr) { + Stores[Variable].push_back(MyStore); + } + } else + continue; + } + } + + // Instrument the locations in the function + BasicBlock::iterator IP = EntryBB.getFirstInsertionPt(); + IRBuilder<> IRB(&(*IP)); + Value *IsCurrentBlockVisited; + + for (auto &BB : F) { + bb_count++; + name = new char[VAR_NAME_LEN]; + memset(name, 0, VAR_NAME_LEN); + snprintf(name, VAR_NAME_LEN, "my_var_%d", BBCounter++); + AllocaInst *AllocaIsCurrentlyBlockVisited = + IRB.CreateAlloca(Int16Ty, nullptr, StringRef(name)); + AllocaIsCurrentlyBlockVisited->setMetadata(M.getMDKindID("nosanitize"), + MDNode::get(C, None)); + IsCurrentBlockVisited = + static_cast(AllocaIsCurrentlyBlockVisited); + StoreInst *InitializeVisited; + if (&EntryBB == &BB) + InitializeVisited = IRB.CreateStore(Visited, IsCurrentBlockVisited); + else + InitializeVisited = + IRB.CreateStore(NonVisited, IsCurrentBlockVisited); + + if (InitializeVisited) + InitializeVisited->setMetadata(M.getMDKindID("nosanitize"), + MDNode::get(C, None)); + + VisitedBlocks[&BB] = IsCurrentBlockVisited; + + // errs() << "MAP SIZE " << std::to_string(map_size) << "\n"; + cur_loc = AFL_R(map_size); + CurLoc = ConstantInt::get(Int16Ty, cur_loc); + BlocksLocs[&BB] = CurLoc; + } + + for (auto &BB : F) { + if (&BB == &EntryBB) continue; + + IP = BB.getFirstInsertionPt(); + IRBuilder<> IRB(&(*IP)); + IsCurrentBlockVisited = VisitedBlocks[&BB]; + + StoreInst *StoreIsVisited = + IRB.CreateStore(Visited, IsCurrentBlockVisited); + StoreIsVisited->setMetadata(M.getMDKindID("nosanitize"), + MDNode::get(C, None)); + + Value *HashedLoc = nullptr; + if (IncomingEdges[&BB].size() <= 1) continue; + for (std::set::iterator it = IncomingEdges[&BB].begin(); + it != IncomingEdges[&BB].end(); ++it) { + Value *isVisited = VisitedBlocks[*it]; + ConstantInt *PotentiallyPreviousLoc = BlocksLocs[*it]; + if (!isVisited or !PotentiallyPreviousLoc) continue; + LoadInst *LoadIsVisited = + IRB.CreateLoad(isVisited->getType(), isVisited); + LoadIsVisited->setMetadata(M.getMDKindID("nosanitize"), + MDNode::get(C, None)); + + Value *PrevLocIfVisited = + IRB.CreateAnd(LoadIsVisited, PotentiallyPreviousLoc); + CurLoc = BlocksLocs[&BB]; + if (HashedLoc == nullptr) + HashedLoc = IRB.CreateXor(CurLoc, PrevLocIfVisited); + else + HashedLoc = IRB.CreateXor(HashedLoc, PrevLocIfVisited); + } + if (HashedLoc == nullptr) continue; + + HashedLoc = IRB.CreateZExt(HashedLoc, IRB.getInt32Ty()); + + LoadInst *MapPtr = + IRB.CreateLoad(PointerType::get(Int8Ty, 0), DDGMapPtr); + MapPtr->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None)); + + Value *MapPtrIdx = IRB.CreateGEP(Int8Ty, MapPtr, HashedLoc); + LoadInst *Counter = IRB.CreateLoad(Int8Ty, MapPtrIdx); + Counter->setMetadata(M.getMDKindID("nosanitize"), MDNode::get(C, None)); + + Value *Incr = IRB.CreateAdd(Counter, One); + auto cf = IRB.CreateICmpEQ(Incr, Zero); + auto carry = IRB.CreateZExt(cf, Int8Ty); + Incr = IRB.CreateAdd(Incr, carry); + + StoreInst *StoreMapPtr = IRB.CreateStore(Incr, MapPtrIdx); + StoreMapPtr->setMetadata(M.getMDKindID("nosanitize"), + MDNode::get(C, None)); + + instrumentedLocations++; + } + } + + errs() << "DDG - Instrumented " << instrumentedLocations + << " locations over a total of " << bb_count << " \t\n"; + + auto PA = PreservedAnalyses::all(); + return PA; + } +}; + +extern "C" ::llvm::PassPluginLibraryInfo LLVM_ATTRIBUTE_WEAK +llvmGetPassPluginInfo() { + return {LLVM_PLUGIN_API_VERSION, "DDGInstrPass", "v0.1", + /* lambda to insert our pass into the pass pipeline. */ + [](PassBuilder &PB) { + +#if LLVM_VERSION_MAJOR <= 13 + using OptimizationLevel = typename PassBuilder::OptimizationLevel; +#endif + PB.registerOptimizerLastEPCallback( + [](ModulePassManager &MPM, OptimizationLevel OL) { + MPM.addPass(DDGInstrModulePass()); + }); + }}; +} diff --git a/libafl_cc/src/ddg-utils.cc b/libafl_cc/src/ddg-utils.cc new file mode 100644 index 0000000000..e04fceddbf --- /dev/null +++ b/libafl_cc/src/ddg-utils.cc @@ -0,0 +1,114 @@ +#include "llvm/Analysis/CFG.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/ASanStackFrameLayout.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ddg-utils.h" + +#define BB_THRESHOLD 16 + +void debug_instruction(Instruction *I) { + DILocation *D = I->getDebugLoc(); + + if (D != NULL) { + errs() << "Line: " << D->getLine() << "\n"; + return; + } + errs() << "[DEBUG] No dbg info recovered\n"; +} + +// void debug_DDG(std::map> graph) { +// std::map>::iterator it = +// graph.begin(); while(it != graph.end()) { +// CustomDDGNode* src = it->first; +// std::vector sinks = it->second; +// +// it++; +// } +// } + +// Checks if Src is in the predecessor BB of To +bool isPredecessorBB(Instruction *Src, Instruction *To) { + BasicBlock *ToParent = To->getParent(); + BasicBlock *SrcParent = Src->getParent(); + for (auto it = pred_begin(ToParent); it != pred_end(ToParent); ++it) { + BasicBlock *predecessor = *it; + if (predecessor == SrcParent) return true; + } + return false; +} + +bool *isReachableByStore(std::vector *From, + Instruction *To, DominatorTree *DT, LoopInfo *LI, + unsigned *ConsideredStores) { + size_t NumberOfStores = From->size(); + unsigned bb_threshold = + NumberOfStores < BB_THRESHOLD ? NumberOfStores : BB_THRESHOLD; + *ConsideredStores = bb_threshold; + FlowWriteInstruction *TopNstores[bb_threshold]; + bool *ReachingStores = new bool[bb_threshold]; + SmallPtrSet ExclusionSet; + unsigned idx = 0; + for (std::vector::reverse_iterator it = + From->rbegin(); + it != From->rend(); it++) { + FlowWriteInstruction *MyStore = *it; + // TopNStores contains the last N stores, which are the ones that we check + // if are reachable. These are put in reverse order, i.e., the position `0` + // (TopNstores[0]) is the last store that we met (which is the last in the + // vector From) + TopNstores[idx] = MyStore; + ExclusionSet.insert(MyStore->BB); + idx++; + if (idx >= bb_threshold) break; + } + + // We need the ExclusionSet to be complete, before startintg with the actual + // check loop + for (int i = 0; i < bb_threshold; i++) { + Instruction *FromInstruction = TopNstores[i]->I; + if (TopNstores[i]->BB == To->getParent()) { + // If the two BBs are the same, we discard this flow. It is not + // interesting since if we reach the BB we cover it + ReachingStores[i] = false; + // continue; // RE-ENABLE THIS WHEN NO DEBUGGING IS NEEDED; + } + ExclusionSet.erase(TopNstores[i]->BB); + if (FromInstruction != To) { + bool r = + isPotentiallyReachable(FromInstruction, To, &ExclusionSet, DT, LI); + // errs() << "isPotentiallyReachable " << r << "\n"; + ReachingStores[i] = r; + } else + ReachingStores[i] = false; // Same instruction not reachable by itself + ExclusionSet.insert(TopNstores[i]->BB); + } + // ReachingStores[0] refers to the last Store instruction that we met + + return ReachingStores; +} \ No newline at end of file diff --git a/libafl_cc/src/ddg-utils.h b/libafl_cc/src/ddg-utils.h new file mode 100644 index 0000000000..b00b1548ab --- /dev/null +++ b/libafl_cc/src/ddg-utils.h @@ -0,0 +1,120 @@ + +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Analysis/MemoryBuiltins.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/Comdat.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DIBuilder.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstVisitor.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/MC/MCSectionMachO.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/ASanStackFrameLayout.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Transforms/Utils/PromoteMemToReg.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace llvm; + +enum StoreType { declaration, modification }; + +struct FlowWriteInstruction { + BasicBlock *BB; + Instruction *I; + // Value* WrittenVar; + // Value* WhatWeAreWriting; + // std::vector* WhatWeAreDepending; + StoreType Type; + + FlowWriteInstruction(BasicBlock *_BB, Instruction *_I, StoreType _T) { + this->BB = _BB; + this->I = _I; + this->Type = _T; + } + + FlowWriteInstruction(struct FlowWriteInstruction *S) { + this->BB = S->BB; + this->I = S->I; + this->Type = S->Type; + } +}; + +// Debug + +void debug_instruction(Instruction *I); +// void debug_DDG(std::map> graph); + +// Other util methods + +bool *isReachableByStore(std::vector *From, + Instruction *To, DominatorTree *DT, LoopInfo *LI, + unsigned *ConsideredStores); +bool isPredecessorBB(Instruction *Src, Instruction *To); diff --git a/libafl_targets/src/cmplog.c b/libafl_targets/src/cmplog.c index d78fc15c11..9a2b175204 100644 --- a/libafl_targets/src/cmplog.c +++ b/libafl_targets/src/cmplog.c @@ -15,9 +15,9 @@ void *__libafl_asan_region_is_poisoned(void *beg, size_t size) { return NULL; } -#pragma comment( \ - linker, \ - "/alternatename:__asan_region_is_poisoned=__libafl_asan_region_is_poisoned") + #pragma comment( \ + linker, \ + "/alternatename:__asan_region_is_poisoned=__libafl_asan_region_is_poisoned") #elif defined(__unix__) || (defined(__APPLE__) && defined(__MACH__)) diff --git a/libafl_targets/src/coverage.c b/libafl_targets/src/coverage.c index e6231c454c..1749d0f395 100644 --- a/libafl_targets/src/coverage.c +++ b/libafl_targets/src/coverage.c @@ -11,6 +11,9 @@ typedef uint32_t prev_loc_t; extern uint8_t __afl_area_ptr_local[EDGES_MAP_SIZE]; uint8_t *__afl_area_ptr = __afl_area_ptr_local; +extern uint8_t __ddg_area_ptr_local[EDGES_MAP_SIZE]; +uint8_t *__ddg_area_ptr = __ddg_area_ptr_local; + extern uint32_t __afl_acc_memop_ptr_local[ACCOUNTING_MAP_SIZE]; uint32_t *__afl_acc_memop_ptr = __afl_acc_memop_ptr_local; diff --git a/libafl_targets/src/coverage.rs b/libafl_targets/src/coverage.rs index 4d82ea7ea1..9aa48f9538 100644 --- a/libafl_targets/src/coverage.rs +++ b/libafl_targets/src/coverage.rs @@ -12,6 +12,11 @@ use crate::{ACCOUNTING_MAP_SIZE, EDGES_MAP_SIZE}; pub static mut __afl_area_ptr_local: [u8; EDGES_MAP_SIZE] = [0; EDGES_MAP_SIZE]; pub use __afl_area_ptr_local as EDGES_MAP; +/// The map for data dependency +#[no_mangle] +pub static mut __ddg_area_ptr_local: [u8; EDGES_MAP_SIZE] = [0; EDGES_MAP_SIZE]; +pub use __ddg_area_ptr_local as DDG_MAP; + /// The map for accounting mem writes. #[no_mangle] pub static mut __afl_acc_memop_ptr_local: [u32; ACCOUNTING_MAP_SIZE] = [0; ACCOUNTING_MAP_SIZE]; @@ -24,6 +29,9 @@ extern "C" { /// The area pointer points to the edges map. pub static mut __afl_area_ptr: *mut u8; + /// The area pointer points to the data flow map + pub static mut __ddg_area_ptr: *mut u8; + /// The area pointer points to the accounting mem operations map. pub static mut __afl_acc_memop_ptr: *mut u32; @@ -37,6 +45,7 @@ extern "C" { } pub use __afl_acc_memop_ptr as ACCOUNTING_MEMOP_MAP_PTR; pub use __afl_area_ptr as EDGES_MAP_PTR; +pub use __ddg_area_ptr as DDG_MAP_PTR; /// Return Tokens from the compile-time token section #[cfg(any(target_os = "linux", target_vendor = "apple"))] diff --git a/scripts/fmt_all.sh b/scripts/fmt_all.sh index 948fbdc467..545a65dc44 100755 --- a/scripts/fmt_all.sh +++ b/scripts/fmt_all.sh @@ -11,9 +11,7 @@ cargo +nightly fmt echo "[*] Formatting C(pp) files" # shellcheck disable=SC2046 -clang-format-17 -i --style=file $(find . -type f \( -name '*.cpp' -o -iname '*.hpp' -o -name '*.cc' -o -name '*.cxx' -o -name '*.cc' -o -name '*.c' -o -name '*.h' \) | grep -v '/target/' | grep -v 'libpng-1\.6\.37' | grep -v 'stb_image\.h' | grep -v 'dlmalloc\.c') - - +clang-format-18 -i --style=file $(find . -type f \( -name '*.cpp' -o -iname '*.hpp' -o -name '*.cc' -o -name '*.cxx' -o -name '*.cc' -o -name '*.c' -o -name '*.h' \) | grep -v '/target/' | grep -v 'libpng-1\.6\.37' | grep -v 'stb_image\.h' | grep -v 'dlmalloc\.c') fuzzers=$(find ./fuzzers -maxdepth 1 -type d) backtrace_fuzzers=$(find ./fuzzers/backtrace_baby_fuzzers -maxdepth 1 -type d) @@ -32,4 +30,4 @@ pushd "libafl_libfuzzer/libafl_libfuzzer_runtime" || exit 1 cargo +nightly fmt --all popd || exit 1 -echo "[*] Done :)" \ No newline at end of file +echo "[*] Done :)"