diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
index 3bf470a..846b8d0 100644
--- a/.github/workflows/go.yml
+++ b/.github/workflows/go.yml
@@ -12,7 +12,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        go: ['1.16', '1.17', '1.18', '1.19', '1.20', '1.21', '1.22', 'tip']
+        go: ['1.18', '1.19', '1.20', '1.21', '1.22', 'tip']
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -21,11 +21,7 @@ jobs:
         uses: actions/setup-go@v4
         with:
           go-version: ${{ matrix.go }}
-      - name: Downgrade go.mod for go 1.17 and go 1.16
-        if:  matrix.go == '1.17' || matrix.go == '1.16'
-        run: make go/mod_16_for_testing
       - name: Run go/mod
-        if:  matrix.go != '1.17' && matrix.go != '1.16'
         run: make go/mod && git diff --exit-code
       - name: Install Go stable
         if: matrix.go == 'tip'
diff --git a/Makefile b/Makefile
index abbaf43..7c2167d 100644
--- a/Makefile
+++ b/Makefile
@@ -15,8 +15,3 @@ go/mod:
 	cd godeltaprof/  && GO111MODULE=on go mod download
 	cd godeltaprof/ && GO111MODULE=on go mod tidy
 
-.PHONY: go/mod_16_for_testing
-go/mod_16_for_testing:
-	rm -rf godeltaprof/compat/go.mod godeltaprof/compat/go.sum godeltaprof/go.mod godeltaprof/go.sum go.work otelpyroscope/
-	cat go.mod_go16_test.txt > go.mod
-	go mod tidy
diff --git a/go.mod b/go.mod
index c6ab79c..4fe966d 100644
--- a/go.mod
+++ b/go.mod
@@ -6,4 +6,4 @@ replace github.com/grafana/pyroscope-go/godeltaprof => ./godeltaprof
 
 require github.com/grafana/pyroscope-go/godeltaprof v0.1.6
 
-require github.com/klauspost/compress v1.17.3 // indirect
+require github.com/klauspost/compress v1.17.8 // indirect
diff --git a/go.mod_go16_test.txt b/go.mod_go16_test.txt
deleted file mode 100644
index 0f0f0dc..0000000
--- a/go.mod_go16_test.txt
+++ /dev/null
@@ -1,12 +0,0 @@
-module github.com/grafana/pyroscope-go
-
-go 1.16
-
-require (
-    github.com/davecgh/go-spew v1.1.1 // indirect
-    github.com/google/pprof v0.0.0-20231127191134-f3a68a39ae15
-    github.com/stretchr/testify v1.7.0
-    golang.org/x/mod v0.14.0 // indirect
-    golang.org/x/tools v0.12.0
-    gopkg.in/yaml.v3 v3.0.1 // indirect
-)
diff --git a/go.sum b/go.sum
index 0d8e8f5..734f917 100644
--- a/go.sum
+++ b/go.sum
@@ -1,2 +1,2 @@
-github.com/klauspost/compress v1.17.3 h1:qkRjuerhUU1EmXLYGkSH6EZL+vPSxIrYjLNAK4slzwA=
-github.com/klauspost/compress v1.17.3/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
+github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU=
+github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
diff --git a/go.work.sum b/go.work.sum
index e663de3..2922f3a 100644
--- a/go.work.sum
+++ b/go.work.sum
@@ -6,17 +6,12 @@ github.com/chromedp/sysutil v1.0.0 h1:+ZxhTpfpZlmchB58ih/LBHX52ky7w2VhQVKQMucy3I
 github.com/chromedp/sysutil v1.0.0/go.mod h1:kgWmDdq8fTzXYcKIBqIYvRRTnYb9aNS9moAV0xufSww=
 github.com/chzyer/readline v1.5.1 h1:upd/6fQk4src78LMRzh5vItIt361/o4uq553V8B5sGI=
 github.com/chzyer/readline v1.5.1/go.mod h1:Eh+b79XXUwfKfcPLepksvw2tcLE/Ct21YObkaSkeBlk=
-github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0=
-github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/gobwas/httphead v0.1.0 h1:exrUm0f4YX0L7EBwZHuCF4GDp8aJfVeBrlLQrs6NqWU=
 github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM=
 github.com/gobwas/pool v0.2.1 h1:xfeeEhW7pwmX8nuLVlqbzVc7udMDrwetjEv+TZIz1og=
 github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw=
 github.com/gobwas/ws v1.2.1 h1:F2aeBZrm2NDsc7vbovKrWSogd4wvfAxg0FQ89/iqOTk=
 github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY=
-github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
-github.com/google/go-cmp v0.6.0/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
 github.com/ianlancetaylor/demangle v0.0.0-20230524184225-eabc099b10ab h1:BA4a7pe6ZTd9F8kXETBoijjFJ/ntaa//1wiH9BZu4zU=
 github.com/ianlancetaylor/demangle v0.0.0-20230524184225-eabc099b10ab/go.mod h1:gx7rwoVhcfuVKG5uya9Hs3Sxj7EIvldVofAWIUtGouw=
 github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
@@ -27,7 +22,6 @@ github.com/stretchr/objx v0.5.0 h1:1zr/of2m5FGMsad5YfcqgdqdWrIhu+EBEJRhR1U7z/c=
 github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
 github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
-golang.org/x/crypto v0.0.0-20210921155107-089bfa567519 h1:7I4JAnoQBe7ZtJcBaYHi5UtiO8tQHbUSXxL+pnGRANg=
 golang.org/x/net v0.19.0 h1:zTwKpTd2XuCqf8huc7Fo2iSy+4RHPd10s4KzeTnVr1c=
 golang.org/x/net v0.19.0/go.mod h1:CfAk/cbD4CthTvqiEl8NpboMuiuOYsAr/7NOjZJtv1U=
 golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
diff --git a/godeltaprof/compat/compression_test.go b/godeltaprof/compat/compression_test.go
index e9305e4..b5923e3 100644
--- a/godeltaprof/compat/compression_test.go
+++ b/godeltaprof/compat/compression_test.go
@@ -2,33 +2,27 @@ package compat
 
 import (
 	"io"
-	"math/rand"
 	"runtime"
 	"testing"
-
-	"github.com/grafana/pyroscope-go/godeltaprof/internal/pprof"
 )
 
 func BenchmarkHeapCompression(b *testing.B) {
-	opt := &pprof.ProfileBuilderOptions{
-		GenericsFrames: true,
-		LazyMapping:    true,
-	}
-	dh := new(pprof.DeltaHeapProfiler)
-	fs := generateMemProfileRecords(512, 32, 239)
-	rng := rand.NewSource(239)
-	objSize := fs[0].AllocBytes / fs[0].AllocObjects
-	nMutations := int(uint(rng.Int63())) % len(fs)
+	h := newHeapTestHelper()
+	fs := h.generateMemProfileRecords(512, 32)
+	h.rng.Seed(239)
+	nmutations := int(h.rng.Int63() % int64(len(fs)))
+
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		_ = WriteHeapProto(dh, opt, io.Discard, fs, int64(runtime.MemProfileRate))
-		for j := 0; j < nMutations; j++ {
-			idx := int(uint(rng.Int63())) % len(fs)
-			fs[idx].AllocObjects += 1
-			fs[idx].AllocBytes += objSize
-			fs[idx].FreeObjects += 1
-			fs[idx].FreeBytes += objSize
+		if i == 1000 {
+			v := h.rng.Int63()
+			if v != 7817861117094116717 {
+				b.Errorf("unexpected random value: %d. "+
+					"The bench should be deterministic for better comparision.", v)
+			}
 		}
+		_ = WriteHeapProto(h.dp, h.opt, io.Discard, fs, int64(runtime.MemProfileRate))
+		h.mutate(nmutations, fs)
 	}
 }
 
@@ -43,38 +37,25 @@ func BenchmarkMutexCompression(b *testing.B) {
 			runtime.SetMutexProfileFraction(5)
 			defer runtime.SetMutexProfileFraction(prevMutexProfileFraction)
 
-			opt := &pprof.ProfileBuilderOptions{
-				GenericsFrames: true,
-				LazyMapping:    true,
-			}
-			dh := new(pprof.DeltaMutexProfiler)
-			fs := generateBlockProfileRecords(512, 32, 239)
-			rng := rand.NewSource(239)
-			nMutations := int(uint(rng.Int63())) % len(fs)
-			oneBlockCycles := fs[0].Cycles / fs[0].Count
+			h := newMutexTestHelper()
+			h.scaler = scaler
+			fs := h.generateBlockProfileRecords(512, 32)
+			h.rng.Seed(239)
+			nmutations := int(h.rng.Int63() % int64(len(fs)))
 			b.ResetTimer()
 
 			for i := 0; i < b.N; i++ {
-				_ = PrintCountCycleProfile(dh, opt, io.Discard, scaler, fs)
-				for j := 0; j < nMutations; j++ {
-					idx := int(uint(rng.Int63())) % len(fs)
-					fs[idx].Count += 1
-					fs[idx].Cycles += oneBlockCycles
+				if i == 1000 {
+					v := h.rng.Int63()
+					if v != 7817861117094116717 {
+						b.Errorf("unexpected random value: %d. "+
+							"The bench should be deterministic for better comparision.", v)
+					}
 				}
+				_ = PrintCountCycleProfile(h.dp, h.opt, io.Discard, scaler, fs)
+				h.mutate(nmutations, fs)
 			}
 		})
 
 	}
 }
-
-func WriteHeapProto(dp *pprof.DeltaHeapProfiler, opt *pprof.ProfileBuilderOptions, w io.Writer, p []runtime.MemProfileRecord, rate int64) error {
-	stc := pprof.HeapProfileConfig(rate)
-	b := pprof.NewProfileBuilder(w, opt, stc)
-	return dp.WriteHeapProto(b, p, rate)
-}
-
-func PrintCountCycleProfile(d *pprof.DeltaMutexProfiler, opt *pprof.ProfileBuilderOptions, w io.Writer, scaler pprof.MutexProfileScaler, records []runtime.BlockProfileRecord) error {
-	stc := pprof.MutexProfileConfig()
-	b := pprof.NewProfileBuilder(w, opt, stc)
-	return d.PrintCountCycleProfile(b, scaler, records)
-}
diff --git a/godeltaprof/compat/delta_test.go b/godeltaprof/compat/delta_test.go
index 1dded69..2822ef4 100644
--- a/godeltaprof/compat/delta_test.go
+++ b/godeltaprof/compat/delta_test.go
@@ -1,13 +1,14 @@
 package compat
 
 import (
-	"bytes"
-	"math/rand"
-	"runtime"
-	"testing"
-
+	"fmt"
+	gprofile "github.com/google/pprof/profile"
 	"github.com/grafana/pyroscope-go/godeltaprof/internal/pprof"
 	"github.com/stretchr/testify/assert"
+	"reflect"
+	"runtime"
+	"strings"
+	"testing"
 )
 
 var (
@@ -15,14 +16,51 @@ var (
 	stack0Marker string
 	stack1       = [32]uintptr{}
 	stack1Marker string
+	stack2       = [32]uintptr{}
+	stack2Marker string
+	stack3       = [32]uintptr{}
+	stack4       = [32]uintptr{}
 )
 
 func init() {
-	fs := getFunctionPointers()
-	stack0 = [32]uintptr{fs[0], fs[1]}
-	stack1 = [32]uintptr{fs[2], fs[3]}
-	stack0Marker = runtime.FuncForPC(fs[1]).Name() + ";" + runtime.FuncForPC(fs[0]).Name()
-	stack1Marker = runtime.FuncForPC(fs[3]).Name() + ";" + runtime.FuncForPC(fs[2]).Name()
+	stack0 = [32]uintptr{
+		reflect.ValueOf(assert.Truef).Pointer(),
+		reflect.ValueOf(assert.CallerInfo).Pointer(),
+	}
+	stack1 = [32]uintptr{
+		reflect.ValueOf(assert.Condition).Pointer(),
+		reflect.ValueOf(assert.Conditionf).Pointer(),
+	}
+	stack2 = [32]uintptr{
+		reflect.ValueOf(runtime.GC).Pointer(),
+		reflect.ValueOf(runtime.FuncForPC).Pointer(),
+		reflect.ValueOf(TestDeltaBlockProfile).Pointer(),
+		reflect.ValueOf(TestDeltaHeap).Pointer(),
+	}
+	stack3 = [32]uintptr{ // equal , but difference in runtime
+		reflect.ValueOf(runtime.GC).Pointer() + 1,
+		reflect.ValueOf(runtime.FuncForPC).Pointer(),
+		reflect.ValueOf(TestDeltaBlockProfile).Pointer(),
+		reflect.ValueOf(TestDeltaHeap).Pointer(),
+	}
+
+	stack4 = [32]uintptr{ // equal , but difference in non runtime frame
+		reflect.ValueOf(runtime.GC).Pointer(),
+		reflect.ValueOf(runtime.FuncForPC).Pointer(),
+		reflect.ValueOf(TestDeltaBlockProfile).Pointer() + 1,
+		reflect.ValueOf(TestDeltaHeap).Pointer(),
+	}
+	marker := func(stk []uintptr) string {
+		res := []string{}
+		for i := range stk {
+			f := stk[len(stk)-1-i]
+			res = append(res, runtime.FuncForPC(f).Name())
+		}
+		return strings.Join(res, ";")
+	}
+	stack0Marker = marker(stack0[:2])
+	stack1Marker = marker(stack1[:2])
+	stack2Marker = marker(stack2[2:4])
 }
 
 func TestDeltaHeap(t *testing.T) {
@@ -40,56 +78,41 @@ func TestDeltaHeap(t *testing.T) {
 	const testMemProfileRate = 524288
 	const testObjectSize = 327680
 
-	dh := new(pprof.DeltaHeapProfiler)
-	opt := new(pprof.ProfileBuilderOptions)
-	dump := func(r ...runtime.MemProfileRecord) *bytes.Buffer {
-		buf := bytes.NewBuffer(nil)
-		err := WriteHeapProto(dh, opt, buf, r, testMemProfileRate)
-		assert.NoError(t, err)
-		return buf
-	}
-	r := func(AllocObjects, AllocBytes, FreeObjects, FreeBytes int64, s [32]uintptr) runtime.MemProfileRecord {
-		return runtime.MemProfileRecord{
-			AllocObjects: AllocObjects,
-			AllocBytes:   AllocBytes,
-			FreeBytes:    FreeBytes,
-			FreeObjects:  FreeObjects,
-			Stack0:       s,
-		}
-	}
+	h := newHeapTestHelper()
+	h.rate = testMemProfileRate
 
-	p1 := dump(
-		r(0, 0, 0, 0, stack0),
-		r(0, 0, 0, 0, stack1),
+	p1 := h.dump(
+		h.r(0, 0, 0, 0, stack0),
+		h.r(0, 0, 0, 0, stack1),
 	)
 	expectEmptyProfile(t, p1)
 
-	p2 := dump(
-		r(5, 5*testObjectSize, 0, 0, stack0),
-		r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
+	p2 := h.dump(
+		h.r(5, 5*testObjectSize, 0, 0, stack0),
+		h.r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
 	)
 	expectStackFrames(t, p2, stack0Marker, 10, 3525422, 10, 3525422)
 	expectStackFrames(t, p2, stack1Marker, 6, 2115253, 0, 0)
 
 	for i := 0; i < 3; i++ {
 		// if we write same data, stack0 is in use, stack1 should not be present
-		p3 := dump(
-			r(5, 5*testObjectSize, 0, 0, stack0),
-			r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
+		p3 := h.dump(
+			h.r(5, 5*testObjectSize, 0, 0, stack0),
+			h.r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
 		)
 		expectStackFrames(t, p3, stack0Marker, 0, 0, 10, 3525422)
 		expectNoStackFrames(t, p3, stack1Marker)
 	}
 
-	p4 := dump(
-		r(5, 5*testObjectSize, 5, 5*testObjectSize, stack0),
-		r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
+	p4 := h.dump(
+		h.r(5, 5*testObjectSize, 5, 5*testObjectSize, stack0),
+		h.r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
 	)
 	expectEmptyProfile(t, p4)
 
-	p5 := dump(
-		r(8, 8*testObjectSize, 5, 5*testObjectSize, stack0),
-		r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
+	p5 := h.dump(
+		h.r(8, 8*testObjectSize, 5, 5*testObjectSize, stack0),
+		h.r(3, 3*testObjectSize, 3, 3*testObjectSize, stack1),
 	)
 	// note, this value depends on scale order, it currently tests the current implementation, but we may change it
 	// to alloc objects to be scale(8) - scale(5) = 17-10 = 7
@@ -98,8 +121,6 @@ func TestDeltaHeap(t *testing.T) {
 }
 
 func TestDeltaBlockProfile(t *testing.T) {
-	cpuGHz := float64(pprof.Runtime_cyclesPerSecond()) / 1e9
-
 	for i, scaler := range mutexProfileScalers {
 		name := "ScalerMutexProfile"
 		if i == 1 {
@@ -110,56 +131,35 @@ func TestDeltaBlockProfile(t *testing.T) {
 			runtime.SetMutexProfileFraction(5)
 			defer runtime.SetMutexProfileFraction(prevMutexProfileFraction)
 
-			dh := new(pprof.DeltaMutexProfiler)
-			opt := new(pprof.ProfileBuilderOptions)
-
-			scale := func(rcount, rcycles int64) (int64, int64) {
-				count, nanosec := pprof.ScaleMutexProfile(scaler, rcount, float64(rcycles)/cpuGHz)
-				inanosec := int64(nanosec)
-				return count, inanosec
-			}
-			dump := func(r ...runtime.BlockProfileRecord) *bytes.Buffer {
-				buf := bytes.NewBuffer(nil)
-				err := PrintCountCycleProfile(dh, opt, buf, scaler, r)
-				assert.NoError(t, err)
-				return buf
-			}
-			r := func(count, cycles int64, s [32]uintptr) runtime.BlockProfileRecord {
-				return runtime.BlockProfileRecord{
-					Count:  count,
-					Cycles: cycles,
-					StackRecord: runtime.StackRecord{
-						Stack0: s,
-					},
-				}
-			}
+			h := newMutexTestHelper()
+			h.scaler = scaler
 
-			p1 := dump(
-				r(0, 0, stack0),
-				r(0, 0, stack1),
+			p1 := h.dump(
+				h.r(0, 0, stack0),
+				h.r(0, 0, stack1),
 			)
 			expectEmptyProfile(t, p1)
 
 			const cycles = 42
-			p2 := dump(
-				r(239, 239*cycles, stack0),
-				r(0, 0, stack1),
+			p2 := h.dump(
+				h.r(239, 239*cycles, stack0),
+				h.r(0, 0, stack1),
 			)
-			count0, nanos0 := scale(239, 239*cycles)
+			count0, nanos0 := h.scale(239, 239*cycles)
 			expectStackFrames(t, p2, stack0Marker, count0, nanos0)
 			expectNoStackFrames(t, p2, stack1Marker)
 
 			for j := 0; j < 2; j++ {
-				p3 := dump(
-					r(239, 239*cycles, stack0),
-					r(0, 0, stack1),
+				p3 := h.dump(
+					h.r(239, 239*cycles, stack0),
+					h.r(0, 0, stack1),
 				)
 				expectEmptyProfile(t, p3)
 			}
 
-			count1, nanos1 := scale(240, 240*cycles)
-			p4 := dump(
-				r(240, 240*cycles, stack0),
+			count1, nanos1 := h.scale(240, 240*cycles)
+			p4 := h.dump(
+				h.r(240, 240*cycles, stack0),
 			)
 			expectStackFrames(t, p4, stack0Marker, count1-count0, nanos1-nanos0)
 			expectNoStackFrames(t, p4, stack1Marker)
@@ -168,22 +168,23 @@ func TestDeltaBlockProfile(t *testing.T) {
 }
 
 func BenchmarkHeapDelta(b *testing.B) {
-	dh := new(pprof.DeltaHeapProfiler)
-	fs := generateMemProfileRecords(512, 32, 239)
-	rng := rand.NewSource(239)
-	objSize := fs[0].AllocBytes / fs[0].AllocObjects
-	nMutations := int(uint(rng.Int63())) % len(fs)
+	h := newHeapTestHelper()
+	fs := h.generateMemProfileRecords(512, 32)
 	builder := &noopBuilder{}
+	h.rng.Seed(239)
+	nmutations := int(h.rng.Int63() % int64(len(fs)))
+
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		_ = dh.WriteHeapProto(builder, fs, int64(runtime.MemProfileRate))
-		for j := 0; j < nMutations; j++ {
-			idx := int(uint(rng.Int63())) % len(fs)
-			fs[idx].AllocObjects += 1
-			fs[idx].AllocBytes += objSize
-			fs[idx].FreeObjects += 1
-			fs[idx].FreeBytes += objSize
+		if i == 1000 {
+			v := h.rng.Int63()
+			if v != 7817861117094116717 {
+				b.Errorf("unexpected random value: %d. "+
+					"The bench should be deterministic for better comparision.", v)
+			}
 		}
+		_ = h.dp.WriteHeapProto(builder, fs, int64(runtime.MemProfileRate))
+		h.mutate(nmutations, fs)
 	}
 }
 
@@ -198,37 +199,98 @@ func BenchmarkMutexDelta(b *testing.B) {
 			runtime.SetMutexProfileFraction(5)
 			defer runtime.SetMutexProfileFraction(prevMutexProfileFraction)
 
-			dh := new(pprof.DeltaMutexProfiler)
-			fs := generateBlockProfileRecords(512, 32, 239)
-			rng := rand.NewSource(239)
-			nMutations := int(uint(rng.Int63())) % len(fs)
-			oneBlockCycles := fs[0].Cycles / fs[0].Count
+			h := newMutexTestHelper()
+			h.scaler = scaler
+			fs := h.generateBlockProfileRecords(512, 32)
 			builder := &noopBuilder{}
+			h.rng.Seed(239)
+			nmutations := int(h.rng.Int63() % int64(len(fs)))
 			b.ResetTimer()
 
 			for i := 0; i < b.N; i++ {
-				_ = dh.PrintCountCycleProfile(builder, scaler, fs)
-				for j := 0; j < nMutations; j++ {
-					idx := int(uint(rng.Int63())) % len(fs)
-					fs[idx].Count += 1
-					fs[idx].Cycles += oneBlockCycles
+				if i == 1000 {
+					v := h.rng.Int63()
+					if v != 7817861117094116717 {
+						b.Errorf("unexpected random value: %d. "+
+							"The bench should be deterministic for better comparision.", v)
+					}
 				}
+				_ = h.dp.PrintCountCycleProfile(builder, scaler, fs)
+				h.mutate(nmutations, fs)
 			}
 		})
 
 	}
 }
 
-type noopBuilder struct {
-}
+func TestMutexDuplicates(t *testing.T) {
+	prev := runtime.SetMutexProfileFraction(-1)
+	runtime.SetMutexProfileFraction(1)
+	defer runtime.SetMutexProfileFraction(prev)
 
-func (b *noopBuilder) LocsForStack(_ []uintptr) []uint64 {
-	return nil
-}
-func (b *noopBuilder) Sample(_ []int64, _ []uint64, _ int64) {
+	h := newMutexTestHelper()
+	const cycles = 42
+	p := h.dump(
+		h.r(239, 239*cycles, stack0),
+		h.r(42, 42*cycles, stack1),
+		h.r(7, 7*cycles, stack0),
+	)
 
+	expectStackFrames(t, p, stack0Marker, h.scale2(239+7, (239+7)*cycles)...)
+	expectStackFrames(t, p, stack1Marker, h.scale2(42, (42)*cycles)...)
+
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack0Marker), 1, h.scale2(239+7, (239+7)*cycles)...)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack1Marker), 1, h.scale2(42, 42*cycles)...)
+
+	p = h.dump(
+		h.r(239, 239*cycles, stack0),
+		h.r(42, 42*cycles, stack1),
+		h.r(7, 7*cycles, stack0),
+	)
+	expectEmptyProfile(t, p)
 }
 
-func (b *noopBuilder) Build() {
+func TestHeapDuplicates(t *testing.T) {
+	const testMemProfileRate = 524288
+	h := newHeapTestHelper()
+	h.rate = testMemProfileRate
+	const blockSize = 1024
+	const blockSize2 = 2048
+	p := h.dump(
+		h.r(239, 239*blockSize, 239, 239*blockSize, stack0),
+		h.r(3, 3*blockSize2, 3, 3*blockSize2, stack0),
+		h.r(42, 42*blockSize, 42, 42*blockSize, stack1),
+		h.r(7, 7*blockSize, 7, 7*blockSize, stack0),
+		h.r(3, 3*blockSize, 3, 3*blockSize, stack2),
+		h.r(5, 5*blockSize, 5, 5*blockSize, stack3),
+		h.r(11, 11*blockSize, 11, 11*blockSize, stack4),
+	)
+	pp, err := gprofile.ParseData(p.Bytes())
+	assert.NoError(t, err)
 
+	scale := func(c, b int) []int64 {
+		c1, b1 := pprof.ScaleHeapSample(int64(c), int64(b), testMemProfileRate)
+		return []int64{c1, b1, 0, 0}
+	}
+	//expectStackFrames(t, p, stack0Marker, scale(239+7, (239+7)*blockSize)...)
+	//expectStackFrames(t, p, stack1Marker, scale(42, 42*blockSize)...)
+
+	//printProfile(t, p)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack0Marker), 1, scale(239+7, (239+7)*blockSize)...)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack1Marker), 1, scale(42, 42*blockSize)...)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack2Marker), 1, scale(3, 3*blockSize)...)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack2Marker), 1, scale(5, 5*blockSize)...)
+	expectPPROFLocations(t, p, fmt.Sprintf("^%s$", stack2Marker), 1, scale(11, 11*blockSize)...)
+	assert.Equal(t, 6, len(pp.Sample))
+
+	p = h.dump(
+		h.r(239, 239*blockSize, 239, 239*blockSize, stack0),
+		h.r(3, 3*blockSize2, 3, 3*blockSize2, stack0),
+		h.r(42, 42*blockSize, 42, 42*blockSize, stack1),
+		h.r(7, 7*blockSize, 7, 7*blockSize, stack0),
+		h.r(3, 3*blockSize, 3, 3*blockSize, stack2),
+		h.r(5, 5*blockSize, 5, 5*blockSize, stack3),
+		h.r(11, 11*blockSize, 11, 11*blockSize, stack4),
+	)
+	expectEmptyProfile(t, p)
 }
diff --git a/godeltaprof/compat/reject_order_test.go b/godeltaprof/compat/reject_order_test.go
index 9bdd658..6266b78 100644
--- a/godeltaprof/compat/reject_order_test.go
+++ b/godeltaprof/compat/reject_order_test.go
@@ -13,11 +13,10 @@ import (
 )
 
 func TestHeapReject(t *testing.T) {
-	dh := new(pprof.DeltaHeapProfiler)
-	opt := new(pprof.ProfileBuilderOptions)
-	fs := generateMemProfileRecords(512, 32, 239)
+	h := newHeapTestHelper()
+	fs := h.generateMemProfileRecords(512, 32)
 	p1 := bytes.NewBuffer(nil)
-	err := WriteHeapProto(dh, opt, p1, fs, int64(runtime.MemProfileRate))
+	err := WriteHeapProto(h.dp, h.opt, p1, fs, int64(runtime.MemProfileRate))
 	assert.NoError(t, err)
 	p1Size := p1.Len()
 	profile, err := gprofile.Parse(p1)
@@ -28,7 +27,7 @@ func TestHeapReject(t *testing.T) {
 	t.Log("p1 size", p1Size)
 
 	p2 := bytes.NewBuffer(nil)
-	err = WriteHeapProto(dh, opt, p2, fs, int64(runtime.MemProfileRate))
+	err = WriteHeapProto(h.dp, h.opt, p2, fs, int64(runtime.MemProfileRate))
 	assert.NoError(t, err)
 	p2Size := p2.Len()
 	assert.Less(t, p2Size, 1000)
@@ -41,15 +40,11 @@ func TestHeapReject(t *testing.T) {
 }
 
 func BenchmarkHeapRejectOrder(b *testing.B) {
-	opt := &pprof.ProfileBuilderOptions{
-		GenericsFrames: false,
-		LazyMapping:    true,
-	}
-	dh := &pprof.DeltaHeapProfiler{}
-	fs := generateMemProfileRecords(512, 32, 239)
+	h := newHeapTestHelper()
+	fs := h.generateMemProfileRecords(512, 32)
 	b.ResetTimer()
 	for i := 0; i < b.N; i++ {
-		WriteHeapProto(dh, opt, io.Discard, fs, int64(runtime.MemProfileRate))
+		WriteHeapProto(h.dp, h.opt, io.Discard, fs, int64(runtime.MemProfileRate))
 	}
 }
 
@@ -69,11 +64,11 @@ func TestMutexReject(t *testing.T) {
 			runtime.SetMutexProfileFraction(5)
 			defer runtime.SetMutexProfileFraction(prevMutexProfileFraction)
 
-			dh := new(pprof.DeltaMutexProfiler)
-			opt := new(pprof.ProfileBuilderOptions)
-			fs := generateBlockProfileRecords(512, 32, 239)
+			h := newMutexTestHelper()
+			h.scaler = scaler
+			fs := h.generateBlockProfileRecords(512, 32)
 			p1 := bytes.NewBuffer(nil)
-			err := PrintCountCycleProfile(dh, opt, p1, scaler, fs)
+			err := PrintCountCycleProfile(h.dp, h.opt, p1, scaler, fs)
 			assert.NoError(t, err)
 			p1Size := p1.Len()
 			profile, err := gprofile.Parse(p1)
@@ -84,7 +79,7 @@ func TestMutexReject(t *testing.T) {
 			t.Log("p1 size", p1Size)
 
 			p2 := bytes.NewBuffer(nil)
-			err = PrintCountCycleProfile(dh, opt, p2, scaler, fs)
+			err = PrintCountCycleProfile(h.dp, h.opt, p2, scaler, fs)
 			assert.NoError(t, err)
 			p2Size := p2.Len()
 			assert.Less(t, p2Size, 1000)
@@ -108,16 +103,13 @@ func BenchmarkMutexRejectOrder(b *testing.B) {
 			prevMutexProfileFraction := runtime.SetMutexProfileFraction(-1)
 			runtime.SetMutexProfileFraction(5)
 			defer runtime.SetMutexProfileFraction(prevMutexProfileFraction)
-			opt := &pprof.ProfileBuilderOptions{
-				GenericsFrames: false,
-				LazyMapping:    true,
-			}
-			dh := &pprof.DeltaMutexProfiler{}
-			fs := generateBlockProfileRecords(512, 32, 239)
+			h := newMutexTestHelper()
+			h.scaler = scaler
+			fs := h.generateBlockProfileRecords(512, 32)
 			b.ResetTimer()
 
 			for i := 0; i < b.N; i++ {
-				PrintCountCycleProfile(dh, opt, io.Discard, scaler, fs)
+				PrintCountCycleProfile(h.dp, h.opt, io.Discard, scaler, fs)
 			}
 		})
 
diff --git a/godeltaprof/compat/stackcollapse.go b/godeltaprof/compat/stackcollapse.go
index 7051ab5..8f95c17 100644
--- a/godeltaprof/compat/stackcollapse.go
+++ b/godeltaprof/compat/stackcollapse.go
@@ -2,7 +2,9 @@ package compat
 
 import (
 	"bytes"
+	"fmt"
 	"io"
+	"reflect"
 	"regexp"
 	"sort"
 	"strings"
@@ -26,6 +28,16 @@ func expectEmptyProfile(t *testing.T, buffer io.Reader) {
 	assert.Empty(t, ls)
 }
 
+func printProfile(t *testing.T, p *bytes.Buffer) {
+	profile, err := gprofile.Parse(bytes.NewBuffer(p.Bytes()))
+	require.NoError(t, err)
+	t.Log("==================")
+	for _, sample := range profile.Sample {
+		s := pprofSampleStackToString(sample)
+		t.Logf("%v %v %v\n", s, sample.Value, sample.NumLabel)
+	}
+}
+
 func expectNoStackFrames(t *testing.T, buffer *bytes.Buffer, sfPattern string) {
 	profile, err := gprofile.ParseData(buffer.Bytes())
 	require.NoError(t, err)
@@ -34,6 +46,7 @@ func expectNoStackFrames(t *testing.T, buffer *bytes.Buffer, sfPattern string) {
 }
 
 func expectStackFrames(t *testing.T, buffer *bytes.Buffer, sfPattern string, values ...int64) {
+	fmt.Printf("expectStackFrames: %s %+v\n", sfPattern, values)
 	profile, err := gprofile.ParseData(buffer.Bytes())
 	require.NoError(t, err)
 	line := findStack(t, stackCollapseProfile(t, profile), sfPattern)
@@ -45,6 +58,32 @@ func expectStackFrames(t *testing.T, buffer *bytes.Buffer, sfPattern string, val
 	}
 }
 
+func expectPPROFLocations(t *testing.T, buffer *bytes.Buffer, samplePattern string, expectedCount int, expectedValues ...int64) {
+	profile, err := gprofile.ParseData(buffer.Bytes())
+	require.NoError(t, err)
+	cnt := 0
+	samples := grepSamples(profile, samplePattern)
+	for _, s := range samples {
+		if reflect.DeepEqual(s.Value, expectedValues) {
+			cnt++
+		}
+	}
+	assert.Equalf(t, expectedCount, cnt, "expected samples re: %s\n   values: %v\n    count:%d\n    all samples:%+v\n", samplePattern, expectedValues, expectedCount, samples)
+}
+
+func grepSamples(profile *gprofile.Profile, samplePattern string) []*gprofile.Sample {
+	rr := regexp.MustCompile(samplePattern)
+	var samples []*gprofile.Sample
+	for i := range profile.Sample {
+		ss := pprofSampleStackToString(profile.Sample[i])
+		if !rr.MatchString(ss) {
+			continue
+		}
+		samples = append(samples, profile.Sample[i])
+	}
+	return samples
+}
+
 func findStack(t *testing.T, res []stack, re string) *stack {
 	rr := regexp.MustCompile(re)
 	for i, re := range res {
@@ -58,23 +97,9 @@ func findStack(t *testing.T, res []stack, re string) *stack {
 func stackCollapseProfile(t testing.TB, p *gprofile.Profile) []stack {
 	var ret []stack
 	for _, s := range p.Sample {
-		var funcs []string
-		for i := range s.Location {
-
-			loc := s.Location[i]
-			for _, line := range loc.Line {
-				f := line.Function
-				//funcs = append(funcs, fmt.Sprintf("%s:%d", f.Name, line.Line))
-				funcs = append(funcs, f.Name)
-			}
-		}
-		for i := 0; i < len(funcs)/2; i++ {
-			j := len(funcs) - i - 1
-			funcs[i], funcs[j] = funcs[j], funcs[i]
-		}
-
+		funcs, strSample := pprofSampleStackToStrings(s)
 		ret = append(ret, stack{
-			line:  strings.Join(funcs, ";"),
+			line:  strSample,
 			funcs: funcs,
 			value: s.Value,
 		})
@@ -97,11 +122,36 @@ func stackCollapseProfile(t testing.TB, p *gprofile.Profile) []stack {
 		unique = append(unique, s)
 
 	}
-	t.Log("============= stackCollapseProfile ================")
-	for _, s := range unique {
-		t.Log(s.line, s.value)
-	}
-	t.Log("===================================================")
+	//t.Log("============= stackCollapseProfile ================")
+	//for _, s := range unique {
+	//	t.Log(s.line, s.value)
+	//}
+	//t.Log("===================================================")
 
 	return unique
 }
+
+func pprofSampleStackToString(s *gprofile.Sample) string {
+	_, v := pprofSampleStackToStrings(s)
+	return v
+}
+
+func pprofSampleStackToStrings(s *gprofile.Sample) ([]string, string) {
+	var funcs []string
+	for i := range s.Location {
+
+		loc := s.Location[i]
+		for _, line := range loc.Line {
+			f := line.Function
+			//funcs = append(funcs, fmt.Sprintf("%s:%d", f.Name, line.Line))
+			funcs = append(funcs, f.Name)
+		}
+	}
+	for i := 0; i < len(funcs)/2; i++ {
+		j := len(funcs) - i - 1
+		funcs[i], funcs[j] = funcs[j], funcs[i]
+	}
+
+	strSample := strings.Join(funcs, ";")
+	return funcs, strSample
+}
diff --git a/godeltaprof/compat/stub_go23_test.go b/godeltaprof/compat/stub_go23_test.go
index bf495a9..6c2d227 100644
--- a/godeltaprof/compat/stub_go23_test.go
+++ b/godeltaprof/compat/stub_go23_test.go
@@ -36,6 +36,9 @@ func TestRuntimeCyclesPerSecond(t *testing.T) {
 	checkSignature(t, "runtime",
 		"pprof_cyclesPerSecond",
 		"func runtime.pprof_cyclesPerSecond() int64")
+	checkSignature(t, "runtime/pprof",
+		"pprof_cyclesPerSecond",
+		"func runtime/pprof.pprof_cyclesPerSecond() int64")
 	checkSignature(t, "github.com/grafana/pyroscope-go/godeltaprof/internal/pprof",
 		"runtime_cyclesPerSecond",
 		"func github.com/grafana/pyroscope-go/godeltaprof/internal/pprof.runtime_cyclesPerSecond() int64")
diff --git a/godeltaprof/compat/testdata.go b/godeltaprof/compat/testdata.go
index 509ba47..be95c9e 100644
--- a/godeltaprof/compat/testdata.go
+++ b/godeltaprof/compat/testdata.go
@@ -1,16 +1,13 @@
 package compat
 
 import (
-	"go/types"
+	"bytes"
+	"github.com/grafana/pyroscope-go/godeltaprof/internal/pprof"
+	"github.com/stretchr/testify/assert"
+	"io"
 	"math/rand"
 	"reflect"
 	"runtime"
-	"strings"
-	"testing"
-
-	"github.com/stretchr/testify/assert"
-	"github.com/stretchr/testify/require"
-	"golang.org/x/tools/go/packages"
 )
 
 func getFunctionPointers() []uintptr {
@@ -160,12 +157,12 @@ func getFunctionPointers() []uintptr {
 	}
 }
 
-func generateMemProfileRecords(n, depth, seed int) []runtime.MemProfileRecord {
+func (h *heapTestHelper) generateMemProfileRecords(n, depth int) []runtime.MemProfileRecord {
 	var records []runtime.MemProfileRecord
-	rng := rand.NewSource(int64(seed))
+
 	fs := getFunctionPointers()
 	for i := 0; i < n; i++ {
-		nobj := int(uint64(rng.Int63())) % 1000000
+		nobj := int(uint64(h.rng.Int63())) % 1000000
 		r := runtime.MemProfileRecord{
 			AllocObjects: int64(nobj),
 			AllocBytes:   int64(nobj * 1024),
@@ -173,53 +170,182 @@ func generateMemProfileRecords(n, depth, seed int) []runtime.MemProfileRecord {
 			FreeBytes:    int64(nobj * 1024),
 		}
 		for j := 0; j < depth; j++ {
-			r.Stack0[j] = fs[int(uint64(rng.Int63()))%len(fs)]
+			r.Stack0[j] = fs[int(uint64(h.rng.Int63()))%len(fs)]
 		}
 		records = append(records, r)
 	}
 	return records
 }
 
-func generateBlockProfileRecords(n, depth, seed int) []runtime.BlockProfileRecord {
+func (h *mutexTestHelper) generateBlockProfileRecords(n, depth int) []runtime.BlockProfileRecord {
 	var records []runtime.BlockProfileRecord
-	rng := rand.NewSource(int64(seed))
 	fs := getFunctionPointers()
 	for i := 0; i < n; i++ {
-		nobj := int(uint64(rng.Int63())) % 1000000
+		nobj := int(uint64(h.rng.Int63())) % 1000000
 		r := runtime.BlockProfileRecord{
 			Count:  int64(nobj),
 			Cycles: int64(nobj * 10),
 		}
 		for j := 0; j < depth; j++ {
-			r.Stack0[j] = fs[int(uint64(rng.Int63()))%len(fs)]
+			r.Stack0[j] = fs[int(uint64(h.rng.Int63()))%len(fs)]
 		}
 		records = append(records, r)
 	}
 	return records
 }
 
-func getFunctions(t testing.TB, pkg string) []*types.Func {
-	var res []*types.Func
-	cfg := &packages.Config{
-		Mode:  packages.NeedImports | packages.NeedExportFile | packages.NeedTypes | packages.NeedSyntax,
-		Tests: true,
+type mutexTestHelper struct {
+	dp     *pprof.DeltaMutexProfiler
+	opt    *pprof.ProfileBuilderOptions
+	scaler pprof.MutexProfileScaler
+	rng    rand.Source
+}
+
+func newMutexTestHelper() *mutexTestHelper {
+	res := &mutexTestHelper{
+		dp: &pprof.DeltaMutexProfiler{},
+		opt: &pprof.ProfileBuilderOptions{
+			GenericsFrames: true,
+			LazyMapping:    true,
+		},
+		scaler: pprof.ScalerMutexProfile,
+		rng:    rand.NewSource(239),
 	}
-	pkgs, err := packages.Load(cfg, pkg)
-	require.NoError(t, err)
-	for _, p := range pkgs {
-		if strings.Contains(p.ID, ".test") {
-			continue
-		}
-		for _, name := range p.Types.Scope().Names() {
-			f := p.Types.Scope().Lookup(name)
-
-			if f != nil {
-				ff, ok := f.(*types.Func)
-				if ok {
-					res = append(res, ff)
-				}
-			}
-		}
+	return res
+
+}
+
+func (h *mutexTestHelper) scale(rcount, rcycles int64) (int64, int64) {
+	cpuGHz := float64(pprof.Runtime_cyclesPerSecond()) / 1e9
+	count, nanosec := pprof.ScaleMutexProfile(h.scaler, rcount, float64(rcycles)/cpuGHz)
+	inanosec := int64(nanosec)
+	return count, inanosec
+}
+
+func (h *mutexTestHelper) scale2(rcount, rcycles int64) []int64 {
+	c, n := h.scale(rcount, rcycles)
+	return []int64{c, n}
+}
+
+func (h *mutexTestHelper) dump(r ...runtime.BlockProfileRecord) *bytes.Buffer {
+	buf := bytes.NewBuffer(nil)
+	err := PrintCountCycleProfile(h.dp, h.opt, buf, h.scaler, r)
+	if err != nil { // never happens
+		panic(err)
+	}
+	return buf
+}
+
+func (h *mutexTestHelper) r(count, cycles int64, s [32]uintptr) runtime.BlockProfileRecord {
+	return runtime.BlockProfileRecord{
+		Count:  count,
+		Cycles: cycles,
+		StackRecord: runtime.StackRecord{
+			Stack0: s,
+		},
+	}
+}
+
+func (h *mutexTestHelper) mutate(nmutations int, fs []runtime.BlockProfileRecord) {
+	oneBlockCycles := fs[0].Cycles / fs[0].Count
+	for j := 0; j < nmutations; j++ {
+		idx := int(uint(h.rng.Int63())) % len(fs)
+		fs[idx].Count += 1
+		fs[idx].Cycles += oneBlockCycles
+	}
+}
+
+type heapTestHelper struct {
+	dp   *pprof.DeltaHeapProfiler
+	opt  *pprof.ProfileBuilderOptions
+	rate int64
+	rng  rand.Source
+}
+
+func newHeapTestHelper() *heapTestHelper {
+	res := &heapTestHelper{
+		dp: &pprof.DeltaHeapProfiler{},
+		opt: &pprof.ProfileBuilderOptions{
+			GenericsFrames: true,
+			LazyMapping:    true,
+		},
+		rng:  rand.NewSource(239),
+		rate: int64(runtime.MemProfileRate),
 	}
 	return res
 }
+
+func (h *heapTestHelper) dump(r ...runtime.MemProfileRecord) *bytes.Buffer {
+	buf := bytes.NewBuffer(nil)
+	err := WriteHeapProto(h.dp, h.opt, buf, r, h.rate)
+	if err != nil { // never happens
+		panic(err)
+	}
+	return buf
+}
+
+func (h *heapTestHelper) r(AllocObjects, AllocBytes, FreeObjects, FreeBytes int64, s [32]uintptr) runtime.MemProfileRecord {
+	return runtime.MemProfileRecord{
+		AllocObjects: AllocObjects,
+		AllocBytes:   AllocBytes,
+		FreeBytes:    FreeBytes,
+		FreeObjects:  FreeObjects,
+		Stack0:       s,
+	}
+}
+
+func (h *heapTestHelper) mutate(nmutations int, fs []runtime.MemProfileRecord) {
+	objSize := fs[0].AllocBytes / fs[0].AllocObjects
+	for j := 0; j < nmutations; j++ {
+		idx := int(uint(h.rng.Int63())) % len(fs)
+		fs[idx].AllocObjects += 1
+		fs[idx].AllocBytes += objSize
+		fs[idx].FreeObjects += 1
+		fs[idx].FreeBytes += objSize
+	}
+}
+
+func WriteHeapProto(dp *pprof.DeltaHeapProfiler, opt *pprof.ProfileBuilderOptions, w io.Writer, p []runtime.MemProfileRecord, rate int64) error {
+	stc := pprof.HeapProfileConfig(rate)
+	b := pprof.NewProfileBuilder(w, opt, stc)
+	return dp.WriteHeapProto(b, p, rate)
+}
+
+func PrintCountCycleProfile(d *pprof.DeltaMutexProfiler, opt *pprof.ProfileBuilderOptions, w io.Writer, scaler pprof.MutexProfileScaler, records []runtime.BlockProfileRecord) error {
+	stc := pprof.MutexProfileConfig()
+	b := pprof.NewProfileBuilder(w, opt, stc)
+	return d.PrintCountCycleProfile(b, scaler, records)
+}
+
+func dumpMemProfileRecords() []runtime.MemProfileRecord {
+	var p []runtime.MemProfileRecord
+	n, ok := runtime.MemProfile(nil, true)
+	for {
+		// Allocate room for a slightly bigger profile,
+		// in case a few more entries have been added
+		// since the call to MemProfile.
+		p = make([]runtime.MemProfileRecord, n+50)
+		n, ok = runtime.MemProfile(p, true)
+		if ok {
+			p = p[0:n]
+			break
+		}
+		// Profile grew; try again.
+	}
+	return p
+}
+
+type noopBuilder struct {
+}
+
+func (b *noopBuilder) LocsForStack(_ []uintptr) []uint64 {
+	return nil
+}
+
+func (b *noopBuilder) Sample(_ []int64, _ []uint64, _ int64) {
+
+}
+
+func (b *noopBuilder) Build() {
+
+}
diff --git a/godeltaprof/go.mod b/godeltaprof/go.mod
index 0fbc602..e36d42c 100644
--- a/godeltaprof/go.mod
+++ b/godeltaprof/go.mod
@@ -1,5 +1,5 @@
 module github.com/grafana/pyroscope-go/godeltaprof
 
-go 1.16
+go 1.18
 
-require github.com/klauspost/compress v1.17.3
+require github.com/klauspost/compress v1.17.8
diff --git a/godeltaprof/go.sum b/godeltaprof/go.sum
index 0d8e8f5..734f917 100644
--- a/godeltaprof/go.sum
+++ b/godeltaprof/go.sum
@@ -1,2 +1,2 @@
-github.com/klauspost/compress v1.17.3 h1:qkRjuerhUU1EmXLYGkSH6EZL+vPSxIrYjLNAK4slzwA=
-github.com/klauspost/compress v1.17.3/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
+github.com/klauspost/compress v1.17.8 h1:YcnTYrq7MikUT7k0Yb5eceMmALQPYBW/Xltxn0NAMnU=
+github.com/klauspost/compress v1.17.8/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
diff --git a/godeltaprof/internal/pprof/delta_heap.go b/godeltaprof/internal/pprof/delta_heap.go
index 0392382..883d020 100644
--- a/godeltaprof/internal/pprof/delta_heap.go
+++ b/godeltaprof/internal/pprof/delta_heap.go
@@ -6,16 +6,27 @@ import (
 	"strings"
 )
 
+type heapPrevValue struct {
+	allocObjects int64
+}
+
+type heapAccValue struct {
+	allocObjects int64
+	inuseObjects int64
+}
+
 type DeltaHeapProfiler struct {
-	m profMap
+	m profMap[heapPrevValue, heapAccValue]
+	//todo consider adding an option to remove block size label and merge allocations of different size
 }
 
 // WriteHeapProto writes the current heap profile in protobuf format to w.
 func (d *DeltaHeapProfiler) WriteHeapProto(b ProfileBuilder, p []runtime.MemProfileRecord, rate int64) error {
 	values := []int64{0, 0, 0, 0}
 	var locs []uint64
-	for _, r := range p {
-		// do the delta
+	// deduplicate: accumulate allocObjects and inuseObjects in entry.acc for equal stacks
+	for i := range p {
+		r := &p[i]
 		if r.AllocBytes == 0 && r.AllocObjects == 0 && r.FreeObjects == 0 && r.FreeBytes == 0 {
 			// it is a fresh bucket and it will be published after next 1-2 gc cycles
 			continue
@@ -25,17 +36,42 @@ func (d *DeltaHeapProfiler) WriteHeapProto(b ProfileBuilder, p []runtime.MemProf
 			blockSize = r.AllocBytes / r.AllocObjects
 		}
 		entry := d.m.Lookup(r.Stack(), uintptr(blockSize))
+		entry.acc.allocObjects += r.AllocObjects
+		entry.acc.inuseObjects += r.InUseObjects()
+	}
+	// do the delta using the accumulated values and previous values
+	for i := range p {
+		r := &p[i]
+		if r.AllocBytes == 0 && r.AllocObjects == 0 && r.FreeObjects == 0 && r.FreeBytes == 0 {
+			// it is a fresh bucket and it will be published after next 1-2 gc cycles
+			continue
+		}
+		var blockSize int64
+		if r.AllocObjects > 0 {
+			blockSize = r.AllocBytes / r.AllocObjects
+		}
+		entry := d.m.Lookup(r.Stack(), uintptr(blockSize))
+		if entry.acc == (heapAccValue{}) {
+			continue
+		}
 
-		if (r.AllocObjects - entry.count.v1) < 0 {
+		allocObjects := entry.acc.allocObjects - entry.prev.allocObjects
+		if allocObjects < 0 {
 			continue
 		}
-		AllocObjects := r.AllocObjects - entry.count.v1
-		AllocBytes := r.AllocBytes - entry.count.v2
-		entry.count.v1 = r.AllocObjects
-		entry.count.v2 = r.AllocBytes
 
-		values[0], values[1] = scaleHeapSample(AllocObjects, AllocBytes, rate)
-		values[2], values[3] = scaleHeapSample(r.InUseObjects(), r.InUseBytes(), rate)
+		// allocBytes, inuseBytes is calculated as multiplication of number of objects by blockSize
+		// This is done to reduce the size of the map entry (i.e. heapAccValue for deduplication and
+		// heapPrevValue for keeping the delta).
+
+		allocBytes := allocObjects * blockSize
+		entry.prev.allocObjects = entry.acc.allocObjects
+		inuseBytes := entry.acc.inuseObjects * blockSize
+
+		values[0], values[1] = ScaleHeapSample(allocObjects, allocBytes, rate)
+		values[2], values[3] = ScaleHeapSample(entry.acc.inuseObjects, inuseBytes, rate)
+
+		entry.acc = heapAccValue{}
 
 		if values[0] == 0 && values[1] == 0 && values[2] == 0 && values[3] == 0 {
 			continue
@@ -70,7 +106,7 @@ func (d *DeltaHeapProfiler) WriteHeapProto(b ProfileBuilder, p []runtime.MemProf
 	return nil
 }
 
-// scaleHeapSample adjusts the data from a heap Sample to
+// ScaleHeapSample adjusts the data from a heap Sample to
 // account for its probability of appearing in the collected
 // data. heap profiles are a sampling of the memory allocations
 // requests in a program. We estimate the unsampled value by dividing
@@ -79,7 +115,7 @@ func (d *DeltaHeapProfiler) WriteHeapProto(b ProfileBuilder, p []runtime.MemProf
 // which samples to collect, based on the desired average collection
 // rate R. The probability of a sample of size S to appear in that
 // profile is 1-exp(-S/R).
-func scaleHeapSample(count, size, rate int64) (int64, int64) {
+func ScaleHeapSample(count, size, rate int64) (int64, int64) {
 	if count == 0 || size == 0 {
 		return 0, 0
 	}
diff --git a/godeltaprof/internal/pprof/delta_mutex.go b/godeltaprof/internal/pprof/delta_mutex.go
index 0b2500a..5c177e3 100644
--- a/godeltaprof/internal/pprof/delta_mutex.go
+++ b/godeltaprof/internal/pprof/delta_mutex.go
@@ -4,8 +4,18 @@ import (
 	"runtime"
 )
 
+type mutexPrevValue struct {
+	count    int64
+	inanosec int64
+}
+
+type mutexAccValue struct {
+	count  int64
+	cycles int64
+}
+
 type DeltaMutexProfiler struct {
-	m profMap
+	m profMap[mutexPrevValue, mutexAccValue]
 }
 
 // PrintCountCycleProfile outputs block profile records (for block or mutex profiles)
@@ -19,16 +29,33 @@ func (d *DeltaMutexProfiler) PrintCountCycleProfile(b ProfileBuilder, scaler Mut
 
 	values := []int64{0, 0}
 	var locs []uint64
-	for _, r := range records {
-		count, nanosec := ScaleMutexProfile(scaler, r.Count, float64(r.Cycles)/cpuGHz)
+	// deduplicate: accumulate count and cycles in entry.acc for equal stacks
+	for i := range records {
+		r := &records[i]
+		entry := d.m.Lookup(r.Stack(), 0)
+		entry.acc.count += r.Count // accumulate unscaled
+		entry.acc.cycles += r.Cycles
+	}
+
+	// do the delta using the accumulated values and previous values
+	for i := range records {
+		r := &records[i]
+		stk := r.Stack()
+		entry := d.m.Lookup(stk, 0)
+		accCount := entry.acc.count
+		accCycles := entry.acc.cycles
+		if accCount == 0 && accCycles == 0 {
+			continue
+		}
+		entry.acc = mutexAccValue{}
+		count, nanosec := ScaleMutexProfile(scaler, accCount, float64(accCycles)/cpuGHz)
 		inanosec := int64(nanosec)
 
 		// do the delta
-		entry := d.m.Lookup(r.Stack(), 0)
-		values[0] = count - entry.count.v1
-		values[1] = inanosec - entry.count.v2
-		entry.count.v1 = count
-		entry.count.v2 = inanosec
+		values[0] = count - entry.prev.count
+		values[1] = inanosec - entry.prev.inanosec
+		entry.prev.count = count
+		entry.prev.inanosec = inanosec
 
 		if values[0] < 0 || values[1] < 0 {
 			continue
@@ -39,7 +66,7 @@ func (d *DeltaMutexProfiler) PrintCountCycleProfile(b ProfileBuilder, scaler Mut
 
 		// For count profiles, all stack addresses are
 		// return PCs, which is what appendLocsForStack expects.
-		locs = b.LocsForStack(r.Stack())
+		locs = b.LocsForStack(stk)
 		b.Sample(values, locs, 0)
 	}
 	b.Build()
diff --git a/godeltaprof/internal/pprof/map.go b/godeltaprof/internal/pprof/map.go
index 188001e..dcb6e56 100644
--- a/godeltaprof/internal/pprof/map.go
+++ b/godeltaprof/internal/pprof/map.go
@@ -8,30 +8,23 @@ import "unsafe"
 
 // A profMap is a map from (stack, tag) to mapEntry.
 // It grows without bound, but that's assumed to be OK.
-type profMap struct {
-	hash    map[uintptr]*profMapEntry
-	all     *profMapEntry
-	last    *profMapEntry
-	free    []profMapEntry
+type profMap[PREV any, ACC any] struct {
+	hash    map[uintptr]*profMapEntry[PREV, ACC]
+	free    []profMapEntry[PREV, ACC]
 	freeStk []uintptr
 }
 
-type count struct {
-	// alloc_objects, alloc_bytes for heap
-	// mutex_count, mutex_duration for mutex
-	v1, v2 int64
-}
-
 // A profMapEntry is a single entry in the profMap.
-type profMapEntry struct {
-	nextHash *profMapEntry // next in hash list
-	nextAll  *profMapEntry // next in list of all entries
+// todo use unsafe.Pointer + len for stk ?
+type profMapEntry[PREV any, ACC any] struct {
+	nextHash *profMapEntry[PREV, ACC] // next in hash list
 	stk      []uintptr
 	tag      uintptr
-	count    count
+	prev     PREV
+	acc      ACC
 }
 
-func (m *profMap) Lookup(stk []uintptr, tag uintptr) *profMapEntry {
+func (m *profMap[PREV, ACC]) Lookup(stk []uintptr, tag uintptr) *profMapEntry[PREV, ACC] {
 	// Compute hash of (stk, tag).
 	h := uintptr(0)
 	for _, x := range stk {
@@ -42,7 +35,7 @@ func (m *profMap) Lookup(stk []uintptr, tag uintptr) *profMapEntry {
 	h += uintptr(tag) * 41
 
 	// Find entry if present.
-	var last *profMapEntry
+	var last *profMapEntry[PREV, ACC]
 Search:
 	for e := m.hash[h]; e != nil; last, e = e, e.nextHash {
 		if len(e.stk) != len(stk) || e.tag != tag {
@@ -64,7 +57,7 @@ Search:
 
 	// Add new entry.
 	if len(m.free) < 1 {
-		m.free = make([]profMapEntry, 128)
+		m.free = make([]profMapEntry[PREV, ACC], 128)
 	}
 	e := &m.free[0]
 	m.free = m.free[1:]
@@ -82,15 +75,8 @@ Search:
 		e.stk[j] = uintptr(stk[j])
 	}
 	if m.hash == nil {
-		m.hash = make(map[uintptr]*profMapEntry)
+		m.hash = make(map[uintptr]*profMapEntry[PREV, ACC])
 	}
 	m.hash[h] = e
-	if m.all == nil {
-		m.all = e
-		m.last = e
-	} else {
-		m.last.nextAll = e
-		m.last = e
-	}
 	return e
 }
diff --git a/godeltaprof/internal/pprof/stub_go23.go b/godeltaprof/internal/pprof/stub_go23.go
index 37ccbe8..d587cad 100644
--- a/godeltaprof/internal/pprof/stub_go23.go
+++ b/godeltaprof/internal/pprof/stub_go23.go
@@ -23,5 +23,5 @@ func runtime_FrameSymbolName(f *runtime.Frame) string
 //go:linkname runtime_expandFinalInlineFrame runtime/pprof.runtime_expandFinalInlineFrame
 func runtime_expandFinalInlineFrame(stk []uintptr) []uintptr
 
-//go:linkname runtime_cyclesPerSecond runtime.pprof_cyclesPerSecond
+//go:linkname runtime_cyclesPerSecond runtime/pprof.runtime_cyclesPerSecond
 func runtime_cyclesPerSecond() int64