diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml
index 3c36907414..c3e25da37d 100644
--- a/.buildkite/pipeline.yml
+++ b/.buildkite/pipeline.yml
@@ -384,7 +384,6 @@ steps:
           build.message !~ /\[skip special\]/
         timeout_in_minutes: 45
 
-  # we want to benchmark every commit on the master branch, even if it failed CI
   - wait: ~
     continue_on_failure: true
 
@@ -412,78 +411,36 @@ steps:
       build.message !~ /\[skip docs\]/
     timeout_in_minutes: 15
 
-  - group: ":racehorse: Benchmarks"
-    steps:
-      # benchmarks outside of the master branch don't submit their results,
-      # so they can run on any system in the juliagpu queue.
-      - label: "Benchmarks (dry run)"
-        plugins:
-          - JuliaCI/julia#v1:
-              version: "1.11"
-        command: |
-          julia --project -e '
-            using Pkg
-
-            println("--- :julia: Instantiating project")
-            Pkg.resolve()
-            Pkg.instantiate()
-            Pkg.activate("perf")
-            Pkg.resolve()
-            Pkg.instantiate()
-            push!(LOAD_PATH, @__DIR__)
-
-            println("+++ :julia: Benchmarking")
-            include("perf/runbenchmarks.jl")'
-        agents:
-          queue: "juliagpu"
-          cuda: "*"
-        if: |
-          build.message =~ /\[only benchmarks\]/ ||
-          build.message !~ /\[only/ && !build.pull_request.draft &&
-          build.message !~ /\[skip benchmarks\]/
-        timeout_in_minutes: 30
-
-      # if we will submit results, use the benchmark queue so that we will
-      # be running on the same system each time
-      - label: "Benchmarks on Julia {{matrix.julia}}"
-        plugins:
-          - JuliaCI/julia#v1:
-              version: "{{matrix.julia}}"
-        env:
-          BENCHMARKS: "true"
-          CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
-          CODESPEED_BRANCH: "$BUILDKITE_BRANCH"
-          CODESPEED_COMMIT: "$BUILDKITE_COMMIT"
-          CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}"
-        command: |
-          julia --project -e '
-            using Pkg
-            ENV["CODESPEED_ENVIRONMENT"] = ENV["BUILDKITE_AGENT_NAME"]
+  - label: ":racehorse: Benchmarks"
+    plugins:
+      - JuliaCI/julia#v1:
+          version: "1.11"
+    env:
+      BENCHMARKS: "true"
+      CODESPEED_PROJECT: "$BUILDKITE_PIPELINE_NAME"
+      CODESPEED_BRANCH: "$BUILDKITE_BRANCH"
+      CODESPEED_COMMIT: "$BUILDKITE_COMMIT"
+      CODESPEED_EXECUTABLE: "Julia {{matrix.julia}}"
+    command: |
+      julia --project -e '
+        using Pkg
 
-            println("--- :julia: Instantiating project")
-            Pkg.resolve()
-            Pkg.instantiate()
-            Pkg.activate("perf")
-            Pkg.resolve()
-            Pkg.instantiate()
-            push!(LOAD_PATH, @__DIR__)
+        println("--- :julia: Instantiating project")
+        Pkg.develop([PackageSpec(path=pwd())])
 
-            println("+++ :julia: Benchmarking")
-            include("perf/runbenchmarks.jl")'
-        agents:
-          queue: "benchmark"
-          gpu: "rtx2070"
-          cuda: "*"
-        if: |
-          build.branch =~ /^master$$/ && build.message =~ /\[only benchmarks\]/ ||
-          build.branch =~ /^master$$/ && build.message !~ /\[only/ &&
-          build.message !~ /\[skip benchmarks\]/
-        matrix:
-          setup:
-            julia:
-              - "1.11"
-              - "1.11"
-        timeout_in_minutes: 30
+        println("+++ :julia: Benchmarking")
+        include("perf/runbenchmarks.jl")'
+    artifact_paths:
+      - "benchmarkresults.json"
+    agents:
+      queue: "benchmark"
+      gpu: "rtx2070"
+      cuda: "*"
+    if: |
+      build.message =~ /\[only benchmarks\]/ ||
+      build.message !~ /\[only/ && !build.pull_request.draft &&
+      build.message !~ /\[skip benchmarks\]/
+    timeout_in_minutes: 30
 
 env:
   JULIA_PKG_SERVER_REGISTRY_PREFERENCE: "eager" # OK to downloading JLLs from GitHub
diff --git a/.github/workflows/Benchmark.yml b/.github/workflows/Benchmark.yml
new file mode 100644
index 0000000000..9eb855c52b
--- /dev/null
+++ b/.github/workflows/Benchmark.yml
@@ -0,0 +1,66 @@
+name: Benchmarks
+permissions:
+  statuses: read # find Buildkite URL from PR status
+  contents: write # update benchmark contents in gh-pages branch
+  pull-requests: write # comment on PR with benchmark results
+  deployments: write # deploy GitHub pages website
+
+on:
+  pull_request_target:
+    branches:
+      - master
+    paths:
+      - "src/**/*"
+      - "lib/**/*"
+      - "ext/**/*"
+      - "perf/**/*"
+      - ".buildkite/**/*"
+      - "Project.toml"
+  push:
+    branches:
+      - master
+    paths:
+      - "src/**/*"
+      - "lib/**/*"
+      - "ext/**/*"
+      - "perf/**/*"
+      - ".buildkite/**/*"
+      - "Project.toml"
+
+jobs:
+  benchmark:
+    if: |
+      contains(github.event.head_commit.message, '[only benchmarks]') ||
+      !contains(github.event.head_commit.message, '[only') &&
+      !contains(github.event.head_commit.message, '[skip benchmarks]') &&
+      !github.event.pull_request.draft
+    runs-on: ubuntu-latest
+    steps:
+      - name: Download Buildkite Artifacts
+        id: download
+        uses: EnricoMi/download-buildkite-artifact-action@v1
+        with:
+          buildkite_token: ${{ secrets.BUILDKITE_TOKEN }}
+          ignore_build_states: blocked,canceled,skipped,not_run
+          ignore_job_states: timed_out,failed
+          output_path: artifacts
+
+      - name: Locate Benchmarks Artifact
+        id: locate
+        if: ${{ steps.download.outputs.download-state == 'success' }}
+        run: echo "path=$(find artifacts -type f -name benchmarkresults.json 2>/dev/null)" >> $GITHUB_OUTPUT
+
+      - name: Upload Benchmark Results
+        if: ${{ steps.locate.outputs.path != '' }}
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          name: CUDA.jl Benchmarks
+          tool: "julia"
+          output-file-path: ${{ steps.locate.outputs.path }}
+          benchmark-data-dir-path: "bench"
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          comment-always: ${{ github.event_name == 'pull_request_target' }}
+          summary-always: true
+          alert-threshold: "125%"
+          fail-on-alert: false
+          auto-push: ${{ github.event_name != 'pull_request_target' }}
diff --git a/README.md b/README.md
index 66cc75ff69..3676c61aa2 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 *CUDA programming in Julia*
 
-[![][doi-img]][doi-url] [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url] [![][codespeed-trend-img]][codespeed-trend-url] [![][codespeed-chart-img]][codespeed-chart-url]
+[![][doi-img]][doi-url] [![][docs-stable-img]][docs-stable-url] [![][docs-dev-img]][docs-dev-url] [![][buildkite-img]][buildkite-url] [![][codecov-img]][codecov-url] [![][benchmark-img]][benchmark-url]
 
 [doi-img]: https://zenodo.org/badge/doi/10.1109/TPDS.2018.2872064.svg
 [doi-url]: https://ieeexplore.ieee.org/abstract/document/8471188
@@ -19,11 +19,8 @@
 [codecov-img]: https://codecov.io/gh/JuliaGPU/CUDA.jl/branch/master/graph/badge.svg
 [codecov-url]: https://codecov.io/gh/JuliaGPU/CUDA.jl
 
-[codespeed-chart-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen
-[codespeed-chart-url]: https://speed.juliagpu.org/timeline/#/?exe=9,11&env=1&base=none&ben=grid&revs=50
-
-[codespeed-trend-img]: https://img.shields.io/badge/benchmarks-Trend-yellowgreen
-[codespeed-trend-url]: https://speed.juliagpu.org/changes/?exe=9&env=1&tre=50
+[benchmark-img]: https://img.shields.io/badge/benchmarks-Chart-yellowgreen
+[benchmark-url]: https://cuda.juliagpu.org/bench/
 
 The CUDA.jl package is the main programming interface for working with NVIDIA CUDA GPUs
 using Julia. It features a user-friendly array abstraction, a compiler for writing CUDA
diff --git a/perf/runbenchmarks.jl b/perf/runbenchmarks.jl
index 5969dcd9b7..8d9f9d3a9c 100644
--- a/perf/runbenchmarks.jl
+++ b/perf/runbenchmarks.jl
@@ -7,13 +7,6 @@ using BenchmarkTools
 using StableRNGs
 rng = StableRNG(123)
 
-# we only submit results when running on the master branch
-real_run = get(ENV, "CODESPEED_BRANCH", nothing) == "master"
-if real_run
-    # to find untuned benchmarks
-    BenchmarkTools.DEFAULT_PARAMETERS.evals = 0
-end
-
 # convenience macro to create a benchmark that requires synchronizing the GPU
 macro async_benchmarkable(ex...)
     quote
@@ -28,21 +21,17 @@ latency_results = include("latency.jl")
 
 SUITE = BenchmarkGroup()
 
-# NOTE: don't use spaces in benchmark names (tobami/codespeed#256)
-
 include("cuda.jl")
 include("kernel.jl")
 include("array.jl")
 
-if real_run
-    @info "Preparing main benchmarks"
-    warmup(SUITE; verbose=false)
-    tune!(SUITE)
+@info "Preparing main benchmarks"
+warmup(SUITE; verbose=false)
+tune!(SUITE)
 
-    # reclaim memory that might have been used by the tuning process
-    GC.gc(true)
-    CUDA.reclaim()
-end
+# reclaim memory that might have been used by the tuning process
+GC.gc(true)
+CUDA.reclaim()
 
 # benchmark groups that aren't part of the suite
 addgroup!(SUITE, "integration")
@@ -60,69 +49,5 @@ integration_results["cudadevrt"] = include("cudadevrt.jl")
 results["latency"] = latency_results
 results["integration"] = integration_results
 
-println(results)
-
-
-## comparison
-
 # write out the results
-BenchmarkTools.save(joinpath(@__DIR__, "results.json"), results)
-
-# compare against previous results
-# TODO: store these results so that we can compare when benchmarking PRs
-reference_path = joinpath(@__DIR__, "reference.json")
-if ispath(reference_path)
-    reference = BenchmarkTools.load(reference_path)[1]
-    comparison = judge(minimum(results), minimum(reference))
-
-    println("Improvements:")
-    println(improvements(comparison))
-
-    println("Regressions:")
-    println(regressions(comparison))
-end
-
-
-## submission
-
-using JSON, HTTP
-
-if real_run
-    @info "Submitting to Codespeed..."
-
-    basedata = Dict(
-        "branch" => ENV["CODESPEED_BRANCH"],
-        "commitid" => ENV["CODESPEED_COMMIT"],
-        "project" => ENV["CODESPEED_PROJECT"],
-        "environment" => ENV["CODESPEED_ENVIRONMENT"],
-        "executable" => ENV["CODESPEED_EXECUTABLE"]
-    )
-
-    # convert nested groups of benchmark to flat dictionaries of results
-    flat_results = []
-    function flatten(results, prefix="")
-        for (key,value) in results
-            if value isa BenchmarkGroup
-                flatten(value, "$prefix$key/")
-            else
-                @assert value isa BenchmarkTools.Trial
-
-                # codespeed reports maxima, but those are often very noisy.
-                # get rid of measurements that unnecessarily skew the distribution.
-                rmskew!(value)
-
-                push!(flat_results,
-                    Dict(basedata...,
-                        "benchmark" => "$prefix$key",
-                        "result_value" => median(value).time / 1e9,
-                        "min" => minimum(value).time / 1e9,
-                        "max" => maximum(value).time / 1e9))
-            end
-        end
-    end
-    flatten(results)
-
-    HTTP.post("$(ENV["CODESPEED_SERVER"])/result/add/json/",
-              ["Content-Type" => "application/x-www-form-urlencoded"],
-              HTTP.URIs.escapeuri(Dict("json" => JSON.json(flat_results))))
-end
+BenchmarkTools.save("benchmarkresults.json", median(results))
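
With this change, the Buildkite benchmark job saves the per-benchmark medians to benchmarkresults.json (the last line of perf/runbenchmarks.jl) and uploads that file as an artifact; the new Benchmark.yml workflow then downloads the artifact and publishes it through benchmark-action/github-action-benchmark. The same file can also be compared against an older run locally with BenchmarkTools. A minimal sketch, assuming a previously saved baseline named reference.json (a hypothetical file, not produced by this change):

    using BenchmarkTools

    # load the medians written by perf/runbenchmarks.jl,
    # plus an older baseline saved under a hypothetical name
    current  = BenchmarkTools.load("benchmarkresults.json")[1]
    baseline = BenchmarkTools.load("reference.json")[1]

    # judge the current medians against the baseline and print notable changes
    comparison = judge(current, baseline)
    println("Regressions:")
    println(regressions(comparison))
    println("Improvements:")
    println(improvements(comparison))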