From 556b61234edaec30a7634769ddd65e4ab5c7ed78 Mon Sep 17 00:00:00 2001
From: taranbis
Date: Wed, 2 Sep 2020 21:05:21 +0200
Subject: [PATCH] perf: add implementation for storing call stacks as a prefix tree

- Implementations for storing call-stack information only need to
  implement four functions: make_trace(), get_current_element(),
  insert_function_element() and remove_function_element();
- PrefixTree_StackDepot is the first version that adapts a prefix tree
  to store call-stack information. It uses hash maps instead of
  constant-size arrays;
- PrefixTreeDepot.h is the cache-efficient, optimized version of the
  prefix tree implementation.
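A minimal sketch of that four-function interface (illustration only; the
names follow this patch, and the signatures are taken from the TreeDepot
class added below — concrete depots are not required to inherit from it):

    #include <cstddef>
    #include <deque>
    #include <utility>

    class INode;  // tree node type, defined in PrefixTreeDepot.h

    class CallStackDepot {  // hypothetical name, for illustration
     public:
      INode* get_current_element();                  // node of the running function
      void insert_function_element(std::size_t pc);  // called on func_enter
      void remove_function_element();                // called on func_exit
      std::deque<std::size_t> make_trace(            // rebuild one call stack
          const std::pair<std::size_t, INode*>& data) const;
    };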
---
 standalone/CMakeLists.txt                     |   1 -
 standalone/binarydecoder/BinaryDecoder.cpp    |  10 +-
 standalone/detectors/fasttrack/CMakeLists.txt |   2 +
 .../detectors/fasttrack/include/MemoryPool.h  |  61 ++++
 .../fasttrack/include/PoolAllocator.h         | 199 +++++++++++++
 .../fasttrack/include/PrefixTreeDepot.h       | 278 ++++++++++++++++++
 .../fasttrack/include/PrefixTree_StackDepot.h |  95 ++++++
 .../detectors/fasttrack/include/fasttrack.h   |   9 +-
 .../detectors/fasttrack/include/stacktrace.h  |   8 -
 .../detectors/fasttrack/include/threadstate.h |   4 +-
 .../detectors/fasttrack/include/vectorclock.h |  11 +
 .../detectors/fasttrack/src/MemoryPool.cpp    |  31 ++
 .../fasttrack/test/fasttrack_test.cpp         |   6 +-
 standalone/ft_benchmark/ft_benchmark.cpp      |  16 +-
 standalone/helper/CMakeLists.txt              |  16 -
 standalone/helper/src/main.cpp                |  46 ---
 16 files changed, 705 insertions(+), 88 deletions(-)
 create mode 100644 standalone/detectors/fasttrack/include/MemoryPool.h
 create mode 100644 standalone/detectors/fasttrack/include/PoolAllocator.h
 create mode 100644 standalone/detectors/fasttrack/include/PrefixTreeDepot.h
 create mode 100644 standalone/detectors/fasttrack/include/PrefixTree_StackDepot.h
 create mode 100644 standalone/detectors/fasttrack/src/MemoryPool.cpp
 delete mode 100644 standalone/helper/CMakeLists.txt
 delete mode 100644 standalone/helper/src/main.cpp

diff --git a/standalone/CMakeLists.txt b/standalone/CMakeLists.txt
index f913da6..da7aeae 100644
--- a/standalone/CMakeLists.txt
+++ b/standalone/CMakeLists.txt
@@ -21,5 +21,4 @@ endif()
 
 add_subdirectory("detectors/fasttrack")
 add_subdirectory("binarydecoder")
-add_subdirectory("helper")
 add_subdirectory("ft_benchmark")
diff --git a/standalone/binarydecoder/BinaryDecoder.cpp b/standalone/binarydecoder/BinaryDecoder.cpp
index 0369bd2..518f712 100644
--- a/standalone/binarydecoder/BinaryDecoder.cpp
+++ b/standalone/binarydecoder/BinaryDecoder.cpp
@@ -19,7 +19,6 @@
 #include "DetectorOutput.h"
 
 int main(int argc, char** argv) {
-  // std::string detec = "drace.detector.tsan.dll";
   std::string detec = "drace.detector.fasttrack.standalone.dll";
   std::string file = "trace.bin";
 
@@ -46,14 +45,19 @@ int main(int argc, char** argv) {
     std::vector<ipc::event::BufferEntry> buffer(
         (size_t)(size / sizeof(ipc::event::BufferEntry)));
 
-    __debugbreak();
+    /**
+     * \note debug breaks are placed to measure only the performance of the
+     * detection algorithm (on a single thread). They are placed right
+     * before the first operation of the detector and right after the last one
+     */
+    // __debugbreak();
     DetectorOutput output(detec.c_str());
     if (in_file.read((char*)(buffer.data()), size).good()) {
       for (auto it = buffer.begin(); it != buffer.end(); ++it) {
         ipc::event::BufferEntry tmp = *it;
         output.makeOutput(&tmp);
       }
-      __debugbreak();
+      // __debugbreak();
     }
   } catch (const std::exception& e) {
     std::cerr << "Could not load detector: " << e.what() << std::endl;
diff --git a/standalone/detectors/fasttrack/CMakeLists.txt b/standalone/detectors/fasttrack/CMakeLists.txt
index 965b607..f9c79a4 100644
--- a/standalone/detectors/fasttrack/CMakeLists.txt
+++ b/standalone/detectors/fasttrack/CMakeLists.txt
@@ -11,12 +11,14 @@ set(FT_SOURCES
     "src/stacktrace"
     "src/varstate"
+    "src/MemoryPool"
     "src/threadstate")
 
 set(FT_TEST_SOURCES
     "test/fasttrack_test"
     "src/stacktrace"
     "src/varstate"
+    "src/MemoryPool"
     "src/threadstate")
 
 include(GenerateExportHeader)
diff --git a/standalone/detectors/fasttrack/include/MemoryPool.h b/standalone/detectors/fasttrack/include/MemoryPool.h
new file mode 100644
index 0000000..d40ba50
--- /dev/null
+++ b/standalone/detectors/fasttrack/include/MemoryPool.h
@@ -0,0 +1,61 @@
+#ifndef MEMORY_POOL_HEADER_H
+#define MEMORY_POOL_HEADER_H 1
+#pragma once
+
+/*
+ * DRace, a dynamic data race detector
+ *
+ * Copyright 2020 Siemens AG
+ *
+ * Authors:
+ *   Mihai Robescu
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <cstddef>
+#include <iostream>
+
+struct Chunk {
+  Chunk *next;  // pointer to the next Chunk, when the chunk is free
+};
+
+// For each allocator there will be a separate instantiation of the memory pool
+class MemoryPool {
+ private:
+  /// pointer to the first free address
+  Chunk *free_pointer = nullptr;
+
+  /// number of chunks in a block
+  size_t num_chunks = 0;
+
+  /// chunk size equivalent to sizeof(T) from template specialization
+  size_t chunk_size = 0;
+
+  /// block size
+  size_t block_size = 0;
+
+  /// holds how many chunks were allocated until now
+  size_t chunks_allocated = 0;
+
+ public:
+  MemoryPool() = default;
+  MemoryPool(size_t chunkSize) : chunk_size(chunkSize) {}
+  MemoryPool(size_t chunkSize, size_t numChunks)
+      : chunk_size(chunkSize), num_chunks(numChunks) {
+    block_size = chunk_size * num_chunks;
+    free_pointer = get_more_memory();
+  }
+  Chunk *allocate() { return do_allocation(); }
+  void deallocate(void *ptr) { do_deallocation(ptr); }
+  void print_used_memory() {
+    std::cout << "Memory Allocated: " << chunks_allocated * chunk_size
+              << std::endl;
+  }
+
+ private:
+  Chunk *do_allocation();
+  Chunk *get_more_memory();  // allocate 1 block of chunks
+  void do_deallocation(void *ptr);
+};
+#endif  // !MEMORY_POOL_HEADER_H
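A minimal usage sketch for MemoryPool (the chunk size and chunk count are
arbitrary; chunks must be at least sizeof(Chunk*) bytes, since a free chunk
stores the free list's next-pointer in-place):

    #include "MemoryPool.h"

    int main() {
      MemoryPool pool(16, 1024);   // 16-byte chunks, 1024 chunks per block
      Chunk* c = pool.allocate();  // pops the head of the free list
      // ... placement-new an object of at most 16 bytes into c ...
      pool.deallocate(c);          // pushes the chunk back onto the free list
      pool.print_used_memory();    // prints 0 after the deallocation
    }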
diff --git a/standalone/detectors/fasttrack/include/PoolAllocator.h b/standalone/detectors/fasttrack/include/PoolAllocator.h
new file mode 100644
index 0000000..d9c48c3
--- /dev/null
+++ b/standalone/detectors/fasttrack/include/PoolAllocator.h
@@ -0,0 +1,199 @@
+#ifndef POOL_ALLOCATOR_HEADER_H
+#define POOL_ALLOCATOR_HEADER_H 1
+#pragma once
+
+/*
+ * DRace, a dynamic data race detector
+ *
+ * Copyright 2020 Siemens AG
+ *
+ * Authors:
+ *   Mihai Robescu
+ *   Felix Moessbauer
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <atomic>
+#include <cstddef>
+#include <limits>
+
+#include <memory>
+#include "MemoryPool.h"
+
+/**
+ *------------------------------------------------------------------------------
+ *
+ * Header File implementing a pool allocator. It can also be used
+ * in combination with the Segregator class, as in the DRaceAllocator
+ * alias below.
+ *
+ * There is also a thread-safe version; it is currently only
+ * experimental
+ *
+ *------------------------------------------------------------------------------
+ */
+
+template <typename T, size_t threshold, class SmallAllocator,
+          class LargeAllocator>
+class Segregator {
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using void_pointer = void*;
+  using const_void_pointer = const void*;
+
+  T* allocate(size_t n) {
+    size_t size = n * sizeof(T);
+    if (size < threshold) {
+      // return new (reinterpret_cast<void*>(SmallAllocator::allocate())) T();
+      return reinterpret_cast<T*>(SmallAllocator::allocate());
+    } else {
+      return reinterpret_cast<T*>(LargeAllocator::allocate());
+    }
+  }
+  void deallocate(T* p, std::size_t n) noexcept {
+    size_t size = n * sizeof(T);
+    if (size < threshold) {
+      return SmallAllocator::deallocate(reinterpret_cast<void*>(p));
+    } else {
+      return LargeAllocator::deallocate(reinterpret_cast<void*>(p));
+    }
+  }
+  template <class U>
+  struct rebind {
+    using other = Segregator<U, threshold, SmallAllocator, LargeAllocator>;
+  };
+};
+
+template <typename T, size_t numChunks>
+class PoolAllocator {
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using void_pointer = void*;
+  using const_void_pointer = const void*;
+
+  PoolAllocator() = default;
+  ~PoolAllocator() = default;
+
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+
+  static pointer allocate() {
+    return reinterpret_cast<pointer>(mem_pool.allocate());
+  }
+
+  static void deallocate(void* ptr) { mem_pool.deallocate(ptr); }
+
+  void usedMemory() { mem_pool.print_used_memory(); }
+
+  template <class U>
+  PoolAllocator(const PoolAllocator<U, numChunks>& other) {}
+
+  template <class U>
+  struct rebind {
+    using other = PoolAllocator<U, numChunks>;
+  };
+
+ private:
+  static MemoryPool mem_pool;
+};
+template <typename T, size_t numChunks>
+MemoryPool PoolAllocator<T, numChunks>::mem_pool(sizeof(T), numChunks);
+
+template <size_t size, size_t numChunks = 1024>  // default chunk count; any
+                                                 // reasonable value works here
+class SizePoolAllocator {
+ public:
+  SizePoolAllocator() = default;
+  ~SizePoolAllocator() = default;
+  using size_type = size_t;
+
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+
+  static void* allocate() {
+    return reinterpret_cast<void*>(mem_pool.allocate());
+  }
+
+  static void deallocate(void* ptr) { mem_pool.deallocate(ptr); }
+
+  static void usedMemory() { mem_pool.print_used_memory(); }
+
+ private:
+  static MemoryPool mem_pool;
+};
+template <size_t size, size_t numChunks>
+MemoryPool SizePoolAllocator<size, numChunks>::mem_pool(size, numChunks);
+
+template <typename T>
+using DRaceAllocator = Segregator<
+    T, 5, SizePoolAllocator<4>,
+    Segregator<
+        T, 9, SizePoolAllocator<8>,
+        Segregator<
+            T, 17, SizePoolAllocator<16>,
+            Segregator<T, 33, SizePoolAllocator<32>,
+                       Segregator<T, 65, SizePoolAllocator<64>,
+                                  Segregator<T, 129, SizePoolAllocator<128>,
+                                             std::allocator<T>>>>>>>;
+
+template <typename T, size_t numChunks>
+class ThreadSafePoolAllocator {
+ private:
+  std::atomic<Chunk*> free_pointer{nullptr};    // pointer to the first free
+  size_t num_chunks = numChunks;                // number of chunks in a block
+  size_t chunk_size = sizeof(T);                // chunk size equivalent
+  size_t block_size = num_chunks * chunk_size;  // block size
+  size_t chunks_allocated = 0;  // how many chunks were allocated until now
+
+ public:
+  using value_type = T;
+  using size_type = size_t;
+  using difference_type = ptrdiff_t;
+  using pointer = T*;
+  using const_pointer = const T*;
+  using void_pointer = void*;
+  using const_void_pointer = const void*;
+
+  ThreadSafePoolAllocator() = default;
+  ~ThreadSafePoolAllocator() = default;
+
+  size_type max_size() const { return std::numeric_limits<size_type>::max(); }
+
+  pointer allocate() {
+    if (free_pointer.load(std::memory_order_acquire) == nullptr) {
+      free_pointer.store(get_more_memory(), std::memory_order_release);
+    }
+    // the free list is now guaranteed to be non-empty. Note that this
+    // load/store pair is not one atomic update, which is why this class
+    // is still experimental (see header comment)
+    Chunk* allocated = free_pointer.load(std::memory_order_acquire);
+    free_pointer.store(allocated->next, std::memory_order_release);
+    chunks_allocated++;
+    return reinterpret_cast<pointer>(allocated);
+  }
+
+  Chunk* get_more_memory() {
+    Chunk* start = reinterpret_cast<Chunk*>(operator new(block_size));
+    Chunk* it = start;
+    for (size_t i = 0; i < num_chunks - 1; ++i) {
+      it->next =
+          reinterpret_cast<Chunk*>(reinterpret_cast<char*>(it) + chunk_size);
+      it = it->next;
+    }
+    it->next = nullptr;
+    return start;
+  }
+
+  void deallocate(void* ptr) {
+    Chunk* c = reinterpret_cast<Chunk*>(ptr);
+    c->next = free_pointer.load(std::memory_order_acquire);
+    free_pointer.store(c, std::memory_order_release);
+    chunks_allocated--;
+  }
+};
+
+#endif  //! POOL_ALLOCATOR_HEADER_H
\ No newline at end of file
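A usage sketch for the fixed-size pool allocator (the payload type and
chunk count are made up). allocate() only hands out raw memory, so the
caller constructs and destroys the object explicitly:

    #include <new>

    #include "PoolAllocator.h"

    struct Sample {  // hypothetical payload; >= 8 bytes, see the Chunk note
      double a, b;
    };

    int main() {
      // all PoolAllocator<Sample, 1024> instances share one static MemoryPool
      Sample* s = PoolAllocator<Sample, 1024>::allocate();
      new (s) Sample{1.0, 2.0};  // construct in place
      s->~Sample();
      PoolAllocator<Sample, 1024>::deallocate(s);
    }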
diff --git a/standalone/detectors/fasttrack/include/PrefixTreeDepot.h b/standalone/detectors/fasttrack/include/PrefixTreeDepot.h
new file mode 100644
index 0000000..cecff0c
--- /dev/null
+++ b/standalone/detectors/fasttrack/include/PrefixTreeDepot.h
@@ -0,0 +1,278 @@
+
+#ifndef TREEDEPOT_HEADER_H
+#define TREEDEPOT_HEADER_H 1
+#pragma once
+
+/*
+ * DRace, a dynamic data race detector
+ *
+ * Copyright 2020 Siemens AG
+ *
+ * Authors:
+ *   Mihai Robescu
+ *   Felix Moessbauer
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <array>
+#include <deque>
+#include <mutex>
+#include <new>
+#include <stdexcept>
+#include <utility>
+
+#include "PoolAllocator.h"
+
+/**
+ *------------------------------------------------------------------------------
+ *
+ * Header File that implements a prefix tree data structure to
+ * store call stack elements. This one is optimized to be
+ * cache-efficient, making each node a multiple of the size of the
+ * cache line
+ *
+ *------------------------------------------------------------------------------
+ */
+
+class INode {
+ public:
+  size_t pc = -1;           // 8 bytes
+  INode* parent = nullptr;  // 8 bytes
+
+  virtual INode* fast_check(size_t pc) const {
+    throw std::runtime_error("Not implemented");
+    return nullptr;
+  }
+
+  virtual size_t size() const {
+    throw std::runtime_error("Not implemented");
+    return -1;
+  }
+
+  virtual ~INode() {}
+
+  virtual bool add_child_node(INode* next, size_t pc) {
+    throw std::runtime_error("Not implemented");
+    return true;
+  }
+
+  virtual void change_child_node(INode* tmp, INode* _curr_elem) {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void change_parent_node(INode* tmp) {
+    throw std::runtime_error("Not implemented");
+  }
+
+  virtual void copy_children_to(INode* dst) const {
+    throw std::runtime_error("Not implemented");
+  }
+};
+
+template <size_t N>
+class Node : public INode {
+ public:
+  std::array<size_t, N> child_values;  // N * 8 bytes
+  std::array<INode*, N> child_nodes;   // N * 8 bytes
+
+  ~Node() = default;
+  Node& operator=(const Node& other) = default;
+  Node(const Node& other) = delete;
+
+  explicit Node() {
+    pc = -1;
+    parent = nullptr;
+    for (size_t i = 0; i < N; ++i) {
+      child_values[i] = -1;
+    }
+    for (size_t i = 0; i < N; ++i) {
+      child_nodes[i] = nullptr;
+    }
+  }
+
+  size_t size() const final { return N; }
+
+  INode* fast_check(size_t pc) const final {
+    for (size_t i = 0; i < N; ++i) {
+      if (child_values[i] == pc) {
+        return child_nodes[i];
+      }
+    }
+    return nullptr;
+  }
+
+  bool add_child_node(INode* next, size_t pc) final {
+    for (size_t i = 0; i < N; ++i) {
+      if (child_values[i] == -1) {
+        child_values[i] = pc;
+        child_nodes[i] = next;
+        return true;
+      }
+    }
+    return false;
+  }
+
+  void change_child_node(INode* tmp, INode* _curr_elem) final {
+    // replace the new pointer in the parent's child list
+    for (size_t i = 0; i < N; ++i) {
+      if (child_nodes[i] == _curr_elem) {
+        child_nodes[i] = tmp;
+        return;
+      }
+    }
+  }
+  void change_parent_node(INode* tmp) final {
+    for (size_t i = 0; i < N; ++i) {
+      if (child_nodes[i]) {  // guard against empty slots
+        child_nodes[i]->parent = tmp;
+      }
+    }
+  }
+
+  void copy_children_to(INode* dst) const final {
+    // re-register all children with the replacement node; the base-class
+    // assignment operator cannot copy arrays of a different size N
+    for (size_t i = 0; i < N; ++i) {
+      if (child_nodes[i]) {
+        dst->add_child_node(child_nodes[i], child_values[i]);
+      }
+    }
+  }
+};
+
+template <typename T>
+class SelectAllocator {
+ public:
+  static constexpr int threshold1 = 2;
+  using Allocator1 = PoolAllocator<Node<threshold1>, 8192>;
+  static constexpr int threshold2 = 6;
+  using Allocator2 = PoolAllocator<Node<threshold2>, 4096>;
+  static constexpr int threshold3 = 10;
+  using Allocator3 = PoolAllocator<Node<threshold3>, 64>;
+  static constexpr int threshold4 = 38;
+  using Allocator4 = PoolAllocator<Node<threshold4>, 64>;
+  static constexpr int threshold5 = 198;
+  using Allocator5 = PoolAllocator<Node<threshold5>, 32>;
+  static constexpr int threshold6 = 1000;
+  using LargeAllocator = std::allocator<Node<threshold6>>;
+
+  Allocator1 al1;
+  Allocator2 al2;
+  Allocator3 al3;
+  Allocator4 al4;
+  Allocator5 al5;
+  LargeAllocator alL;
+
+  T* allocate(size_t size) {
+    if (size < threshold1) {
+      return new (reinterpret_cast<void*>(al1.allocate())) Node<threshold1>();
+    } else if (size < threshold2) {
+      return new (reinterpret_cast<void*>(al2.allocate())) Node<threshold2>();
+    } else if (size < threshold3) {
+      return new (reinterpret_cast<void*>(al3.allocate())) Node<threshold3>();
+    } else if (size < threshold4) {
+      return new (reinterpret_cast<void*>(al4.allocate())) Node<threshold4>();
+    } else if (size < threshold5) {
+      return new (reinterpret_cast<void*>(al5.allocate())) Node<threshold5>();
+    } else {  // allocate just 1;
+      Node<threshold6>* new_t =
+          std::allocator_traits<LargeAllocator>::allocate(alL, 1);
+      std::allocator_traits<LargeAllocator>::construct(alL, new_t);
+      return new_t;
+    }
+  }
+
+  void deallocate(INode* ptr, size_t size) {
+    if (size < threshold1) {
+      // Node<threshold1>* tmp = dynamic_cast<Node<threshold1>*>(ptr);
+      // tmp->~Node();  // doesn't work when calling the destructor directly
+      al1.deallocate(ptr);
+    } else if (size < threshold2) {
+      al2.deallocate(ptr);
+    } else if (size < threshold3) {
+      al3.deallocate(ptr);
+    } else if (size < threshold4) {
+      al4.deallocate(ptr);
+    } else if (size < threshold5) {
+      al5.deallocate(ptr);
+    } else {  // deallocate just 1;
+      Node<threshold6>* tmp = dynamic_cast<Node<threshold6>*>(ptr);
+      std::allocator_traits<LargeAllocator>::destroy(alL, tmp);
+      std::allocator_traits<LargeAllocator>::deallocate(alL, tmp, 1);
+    }
+  }
+};
+
+using Allocator = SelectAllocator<INode>;
+extern ipc::spinlock read_write_lock;
+
+class TreeDepot {
+  INode* _curr_elem = nullptr;
+  Allocator al;
+
+ public:
+  INode* get_current_element() {
+    // std::lock_guard<ipc::spinlock> lg(read_write_lock);
+    return _curr_elem;
+  }
+
+  void insert_function_element(size_t pc) {
+    std::lock_guard<ipc::spinlock> lg(read_write_lock);
+
+    if (_curr_elem == nullptr) {
+      // allocate the root node with a bigger capacity, as the root
+      // typically has many children
+      _curr_elem = al.allocate(5);
+      _curr_elem->parent = nullptr;
+      _curr_elem->pc = pc;
+      return;
+    }
+
+    if (pc == _curr_elem->pc) return;  // done for recursive functions;
+    // no need to use more nodes for same function
+
+    INode* next = _curr_elem->fast_check(pc);
+    if (next) {
+      _curr_elem = next;
+      return;
+    } else {  // it is not the current node or any of the child nodes
+      next = al.allocate(1);
+      next->pc = pc;
+      next->parent = _curr_elem;
+      if (_curr_elem->add_child_node(next, pc)) {
+        _curr_elem = next;
+        return;
+      }
+    }
+    // If we got here, the current node is full and has to be replaced
+    // by a node of the next bigger size;
+    INode* tmp = al.allocate(_curr_elem->size());
+    *tmp = *_curr_elem;  // copies pc and parent; the base-class assignment
+                         // cannot copy the child arrays of a differently
+                         // sized Node, so transfer the children explicitly
+    _curr_elem->copy_children_to(tmp);
+    INode* parent = _curr_elem->parent;
+    if (parent) {
+      parent->change_child_node(tmp, _curr_elem);
+    }
+    // replace so that children of current node point to the new value
+    _curr_elem->change_parent_node(tmp);
+
+    // TODO: MUST replace it too in ThreadState::read_write
+    // Allocator::deallocate(_curr_elem, _curr_elem->size() - 1);
+
+    _curr_elem = tmp;
+    next->parent = _curr_elem;
+
+    // we already know that here we can go to the end.
+    _curr_elem->add_child_node(next, pc);
+    _curr_elem = next;
+  }
+
+  void remove_function_element() {
+    std::lock_guard<ipc::spinlock> lg(read_write_lock);
+
+    if (_curr_elem == nullptr) return;  // func_exit before func_enter
+
+    if (_curr_elem->parent == nullptr) {  // exiting the root function
+      // Allocator::deallocate(_curr_elem, _curr_elem->size() - 1);
+      // not deallocating anymore because the node might still be in use
+      _curr_elem = nullptr;
+      return;
+    }
+    _curr_elem = _curr_elem->parent;
+  }
+
+  std::deque<size_t> make_trace(const std::pair<size_t, INode*>& data) const {
+    std::lock_guard<ipc::spinlock> lg(read_write_lock);
+
+    std::deque<size_t> this_stack;
+    this_stack.emplace_front(data.first);
+
+    INode* iter = data.second;
+    while (iter != nullptr) {
+      this_stack.emplace_front(iter->pc);
+      iter = iter->parent;
+    }
+    return this_stack;
+  }
+};
+
+#endif
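A sketch of how TreeDepot is driven by function-entry/exit events and later
queried for a trace (the program counters are made up; the embedding
program must define the read_write_lock that the header declares extern):

    #include "PrefixTreeDepot.h"

    ipc::spinlock read_write_lock;  // definition required by TreeDepot

    int main() {
      TreeDepot depot;
      depot.insert_function_element(0x100);  // func_enter: main()
      depot.insert_function_element(0x200);  // func_enter: main() -> foo()
      INode* at_access = depot.get_current_element();  // stored per access
      depot.remove_function_element();       // func_exit: foo() returns

      // reconstructs {0x100, 0x200, 0xdead} for a race report
      auto trace = depot.make_trace({0xdead, at_access});
    }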
diff --git a/standalone/detectors/fasttrack/include/PrefixTree_StackDepot.h b/standalone/detectors/fasttrack/include/PrefixTree_StackDepot.h
new file mode 100644
index 0000000..65bc66d
--- /dev/null
+++ b/standalone/detectors/fasttrack/include/PrefixTree_StackDepot.h
@@ -0,0 +1,95 @@
+#ifndef PREFIXTREE_STACKDEPOT_HEADER_H
+#define PREFIXTREE_STACKDEPOT_HEADER_H 1
+#pragma once
+
+/*
+ * DRace, a dynamic data race detector
+ *
+ * Copyright 2020 Siemens AG
+ *
+ * Authors:
+ *   Mihai Robescu
+ *   Felix Moessbauer
+ *
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <cstddef>
+#include <deque>
+#include <utility>
+
+#include "PoolAllocator.h"
+#include "parallel_hashmap/phmap.h"
+
+/**
+ *------------------------------------------------------------------------------
+ *
+ * Header File that provides the initial implementation of a Prefix
+ * Tree for storing call stacks. Check "PrefixTreeDepot.h" for a
+ * cache-efficient implementation
+ *
+ *------------------------------------------------------------------------------
+ */
+
+struct TrieNode {
+  size_t pc = -1;
+  TrieNode* parent = nullptr;
+  phmap::node_hash_map<size_t, TrieNode> _childNodes;
+
+  TrieNode() = default;
+};
+
+class TrieStackDepot {
+  TrieNode* _curr_elem = nullptr;
+
+ public:
+  TrieNode* get_current_element() { return _curr_elem; }
+
+  void InsertFunction(size_t pc) {
+    if (_curr_elem == nullptr) {
+      // using new is slow => PoolAllocator in "PrefixTreeDepot.h"
+      _curr_elem = new TrieNode();
+      _curr_elem->parent = nullptr;
+      _curr_elem->pc = pc;
+    }
+    if (pc == _curr_elem->pc) return;  // done for recursive functions;
+    // no need to use more nodes for same function
+
+    auto it = _curr_elem->_childNodes.find((size_t)pc);
+    if (it == _curr_elem->_childNodes.end()) {
+      it = _curr_elem->_childNodes.emplace_hint(it, pc, TrieNode());
+    }
+
+    TrieNode* next = &(it->second);
+    next->parent = _curr_elem;
+    _curr_elem = next;
+    _curr_elem->pc = pc;
+  }
+
+  void ExitFunction() {
+    if (_curr_elem == nullptr) return;  // func_exit before func_enter
+
+    if (_curr_elem->parent == nullptr) {  // exiting the root function
+      // delete _curr_elem; !! MEMORY IS NEVER FREED DOING THIS
+      // as there can still be elements pointing to it;
+      // TODO: remove all elements pointing to it as well
+      _curr_elem = nullptr;
+      return;
+    }
+    _curr_elem = _curr_elem->parent;
+  }
+
+  std::deque<size_t> make_trace(
+      const std::pair<size_t, TrieNode*>& data) const {
+    std::deque<size_t> this_stack;
+    this_stack.emplace_front(data.first);
+
+    TrieNode* iter = data.second;
+    while (iter != nullptr) {
+      this_stack.emplace_front(iter->pc);
+      iter = iter->parent;
+    }
+    return this_stack;
+  }
+};
+#endif
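The hash-map-based depot is driven the same way; note that its method names
(InsertFunction/ExitFunction) do not yet match the four-function naming
used in the commit message. A sketch with made-up program counters:

    #include "PrefixTree_StackDepot.h"

    int main() {
      TrieStackDepot depot;
      depot.InsertFunction(0x100);  // func_enter: main()
      depot.InsertFunction(0x200);  // func_enter: main() -> foo()
      TrieNode* at_access = depot.get_current_element();
      depot.ExitFunction();         // func_exit: foo() returns

      // reconstructs {0x100, 0x200, 0xdead}
      auto trace = depot.make_trace({0xdead, at_access});
    }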
diff --git a/standalone/detectors/fasttrack/include/fasttrack.h b/standalone/detectors/fasttrack/include/fasttrack.h
index 9957f54..db5e7fc 100644
--- a/standalone/detectors/fasttrack/include/fasttrack.h
+++ b/standalone/detectors/fasttrack/include/fasttrack.h
@@ -800,7 +800,8 @@ class Fasttrack : public Detector {
       if (VectorClock<>::make_thread_num(it->second.get_read_epoch()) ==
               th_num &&
           VectorClock<>::make_thread_num(it->second.get_read_epoch()) ==
-              th_num) {  // the memory address was accessed only by this thread
+              th_num) {  // the memory address was accessed only by this
+                         // thread
         auto tmp = it;
         it++;
         vars.erase(tmp);
@@ -925,7 +926,8 @@ class Fasttrack : public Detector {
     }
   }
 
-  /// removes memory addresses by choosing every 3 the one with the lowest clock
+  /// removes memory addresses by choosing every 3 the one with the lowest
+  /// clock
   void remove_memory_addresses_by_lowest_clock() {
     if (log_flag) {
      log_count.remove_memory_addresses_by_lowest_clock_calls++;
@@ -937,7 +939,8 @@ class Fasttrack : public Detector {
     auto remove_it = it;
 
     while (it != vars.end()) {
-      // gather data from 3 variables and remove the one with the lowest clock.
+      // gather data from 3 variables and remove the one with the lowest
+      // clock.
       if (it->second.get_write_clock() < min_clock) {
         remove_it = it;
         min_clock = it->second.get_write_clock();
diff --git a/standalone/detectors/fasttrack/include/stacktrace.h b/standalone/detectors/fasttrack/include/stacktrace.h
index 58a9ff4..342213f 100644
--- a/standalone/detectors/fasttrack/include/stacktrace.h
+++ b/standalone/detectors/fasttrack/include/stacktrace.h
@@ -48,14 +48,6 @@ class StackTrace {
   uint16_t pop_count = 0;
 
   mutable ipc::spinlock lock;
-
-  /**
-   * \note locking was moved to ThreadState
-   * \note Locking is necessary if and only if elements are removed from the
-   * tree. As long as no elements are removed locking is not necessary: mutable
-   * ipc::spinlock lock;
-   */
-
   /**
    * \brief cleanup unreferenced nodes in callstack tree
    * \warning very expensive
diff --git a/standalone/detectors/fasttrack/include/threadstate.h b/standalone/detectors/fasttrack/include/threadstate.h
index 8a7eda1..04d271d 100644
--- a/standalone/detectors/fasttrack/include/threadstate.h
+++ b/standalone/detectors/fasttrack/include/threadstate.h
@@ -16,6 +16,8 @@
 #include <atomic>
 #include <memory>
 
+#include "PrefixTreeDepot.h"
+#include "PrefixTree_StackDepot.h"
 #include "stacktrace.h"
 #include "vectorclock.h"
 #include "xvector.h"
@@ -114,7 +116,7 @@ class ThreadState : public VectorClock<> {
 
   /**
    * \brief returns a stack trace of a memory location for handing it over to
-   * drace \note theadsafe
+   * drace \note threadsafe
    */
   std::deque<size_t> return_stack_trace(std::size_t address) const;
 };
diff --git a/standalone/detectors/fasttrack/include/vectorclock.h b/standalone/detectors/fasttrack/include/vectorclock.h
index e43c998..a7d9f73 100644
--- a/standalone/detectors/fasttrack/include/vectorclock.h
+++ b/standalone/detectors/fasttrack/include/vectorclock.h
@@ -12,6 +12,7 @@
  *
  * SPDX-License-Identifier: MIT
  */
+
 #include "parallel_hashmap/phmap.h"
 
 /**
@@ -168,6 +169,16 @@ class VectorClock {
     }
     return id;
   }
+
+ private:
+  static bool _thread_no_initialization;
+  static bool ThreadNoInitialization() {
+    for (int i = MAX_TH_NUM; i >= 1;
+         --i) {  //!! thread numbers start from 1 because of is_rw_sh_race
+      thread_nums.emplace(i);
+    }
+    return true;  // the value only signals that the set was populated
+  }
 };
 VectorClock<>::ThreadNum VectorClock<>::thread_no = 1;
 phmap::flat_hash_map<VectorClock<>::ThreadNum, VectorClock<>::TID>
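The _thread_no_initialization flag suggests the usual run-once idiom: bind
the result of the init function to a static so it executes before main().
A self-contained sketch of that idiom (this wiring is an assumption, it is
not part of the patch; the bound 8 stands in for MAX_TH_NUM):

    #include <set>

    static std::set<int> thread_nums;

    static bool init_thread_nums(int max_th_num) {
      // thread numbers start from 1, mirroring ThreadNoInitialization()
      for (int i = max_th_num; i >= 1; --i) thread_nums.emplace(i);
      return true;  // the value only marks that initialization ran
    }

    // evaluated exactly once, during static initialization:
    static bool thread_nums_ready = init_thread_nums(8);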
diff --git a/standalone/detectors/fasttrack/src/MemoryPool.cpp b/standalone/detectors/fasttrack/src/MemoryPool.cpp
new file mode 100644
index 0000000..43e3ad2
--- /dev/null
+++ b/standalone/detectors/fasttrack/src/MemoryPool.cpp
@@ -0,0 +1,31 @@
+#include "MemoryPool.h"
+
+Chunk* MemoryPool::do_allocation() {
+  if (free_pointer == nullptr) {
+    free_pointer = get_more_memory();
+  }
+  // the free list is now guaranteed to be non-empty
+  Chunk* allocated = free_pointer;
+  free_pointer = free_pointer->next;
+  chunks_allocated++;
+  return allocated;
+}
+
+Chunk* MemoryPool::get_more_memory() {
+  Chunk* start = reinterpret_cast<Chunk*>(operator new(block_size));
+  Chunk* it = start;
+  for (size_t i = 0; i < num_chunks - 1; ++i) {
+    it->next =
+        reinterpret_cast<Chunk*>(reinterpret_cast<char*>(it) + chunk_size);
+    it = it->next;
+  }
+  it->next = nullptr;
+  return start;
+}
+
+void MemoryPool::do_deallocation(void* ptr) {
+  Chunk* c = reinterpret_cast<Chunk*>(ptr);
+  c->next = free_pointer;
+  free_pointer = c;
+  chunks_allocated--;
+}
\ No newline at end of file
diff --git a/standalone/detectors/fasttrack/test/fasttrack_test.cpp b/standalone/detectors/fasttrack/test/fasttrack_test.cpp
index 59b0ad2..3cac688 100644
--- a/standalone/detectors/fasttrack/test/fasttrack_test.cpp
+++ b/standalone/detectors/fasttrack/test/fasttrack_test.cpp
@@ -367,7 +367,7 @@ TEST(FasttrackTest, Drop_State_Indicate_Shared_Read_Write_Race) {
   const char* argv_mock[] = {"ft_test", "--size", "2"};
   void* tls[3];  // storage for TLS data
 
-  ft->init(1, argv_mock, rc_clb, nullptr);
+  ft->init(3, argv_mock, rc_clb, nullptr);
   ft->fork(0, 1, &tls[0]);  // t0
   ft->fork(0, 2, &tls[1]);  // t1
   ft->fork(0, 3, &tls[2]);  // t2
@@ -404,7 +404,7 @@ TEST(FasttrackTest, Write_Write_Race) {
   void* tls[3];  // storage for TLS data
   void* mtx[2] = {(void*)0x123ull, (void*)0x1234ull};
 
-  ft->init(1, argv_mock, rc_clb, nullptr);
+  ft->init(3, argv_mock, rc_clb, nullptr);
   ft->fork(0, 1, &tls[0]);  // t0
   ft->fork(0, 2, &tls[1]);  // t1
   ft->fork(0, 3, &tls[2]);  // t2
@@ -548,4 +548,4 @@ TEST(FasttrackTest, Fasttrack_Race_And_StackTrace) {
   ft->func_enter(tls[1], (void*)0x70ull);
   // here, we expect the race. Handled in callback
   ft->finalize();
-}
\ No newline at end of file
+}
diff --git a/standalone/ft_benchmark/ft_benchmark.cpp b/standalone/ft_benchmark/ft_benchmark.cpp
index dbb9033..a78b81e 100644
--- a/standalone/ft_benchmark/ft_benchmark.cpp
+++ b/standalone/ft_benchmark/ft_benchmark.cpp
@@ -1,9 +1,10 @@
 /*
  * DRace, a dynamic data race detector
  *
- * Copyright 2018 Siemens AG
+ * Copyright 2020 Siemens AG
  *
  * Authors:
+ *   Mihai Robescu
  *   Felix Moessbauer
  *
  * SPDX-License-Identifier: MIT
@@ -19,11 +20,12 @@
 #include <random>
 #include <set>
+
 std::mutex mx;
 static std::set<size_t> random_reads;
 static std::set<size_t> random_writes;
 static std::random_device rd{};
-std::mt19937 gen{ 0 };
+std::mt19937 gen{0};
 
 void generate_block(int i, std::vector<std::vector<size_t>>* blocks) {
@@ -52,7 +54,7 @@ void read_from_block(std::vector<std::vector<size_t>>* blocks) {
   } catch (const std::exception& e) {
     std::cout << e.what() << std::endl;
   } catch (...) {
-    std::cout << "Something!" << std::endl;
+    std::cout << "Failure!" << std::endl;
   }
 }
 
@@ -73,15 +75,15 @@ void write_to_block(std::vector<std::vector<size_t>>* blocks) {
   } catch (const std::exception& e) {
     std::cout << e.what() << std::endl;
   } catch (...) {
-    std::cout << "Something!" << std::endl;
+    std::cout << "Failure!" << std::endl;
   }
 }
 
 int CountPossibleDataRaces();
 
 /**
- * Test tool to check for memory corruption and layout.
- * To also check the race reporting, we try to enforce data-races
+ * \brief benchmarking program used to test the performance of the FastTrack2
+ * algorithm implementation
  */
 int main(int argc, char** argv) {
   std::vector<std::vector<size_t>> blocks;
@@ -120,7 +122,7 @@ int main(int argc, char** argv) {
   std::cout << "No. of possible data races: " << std::setw(3)
             << no_of_data_races << std::endl;
 
-  //std::cin.get();
+  // std::cin.get();
 }
 
 int CountPossibleDataRaces() {
diff --git a/standalone/helper/CMakeLists.txt b/standalone/helper/CMakeLists.txt
deleted file mode 100644
index a0ffd06..0000000
--- a/standalone/helper/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-
-set(HELPER_SOURCES "src/main.cpp")
-
-add_executable("helper" ${HELPER_SOURCES})
-
-set_target_properties("helper" PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON)
-
-
-if(NOT TARGET Threads::Threads)
-    find_package(Threads REQUIRED)
-endif()
-
-target_include_directories("helper" PUBLIC "include")
-
-
-target_link_libraries("helper" PUBLIC Threads::Threads)
diff --git a/standalone/helper/src/main.cpp b/standalone/helper/src/main.cpp
deleted file mode 100644
index 8b7c63d..0000000
--- a/standalone/helper/src/main.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-#include <iostream>
-#include <mutex>
-#include <thread>
-#include <vector>
-
-std::mutex mtx;
-
-#define STR(x) #x
-
-template <typename T>
-void increment(T& ptr, int NUM) {
-  for (int j = 0; j < NUM; j++) {
-    //mtx.lock();
-    ptr++;
-    //mtx.unlock();
-  }
-}
-
-int main() {
-  std::vector<std::thread> threads;
-  size_t size = 2;
-  threads.reserve(size);
-
-  int NUM = 5000;
-  int i = 0;
-
-  for (size_t j = 0; j < size; ++j) {
-    threads.emplace_back(std::thread(increment<int>, std::ref(i), NUM));
-  }
-
-  for (int j = 0; j < NUM; j++) {
-    mtx.lock();
-    i++;
-    mtx.unlock();
-  }
-
-  for (size_t i = 0; i < size; ++i) {
-    threads[i].join();
-  }
-
-  std::cout << STR(i) << " = " << i << std::endl;
-
-  std::cin >> i;
-
-  return 0;
-}