diff --git a/doc/roots/cubic_roots.qbk b/doc/roots/cubic_roots.qbk
new file mode 100644
index 0000000000..89ae69fa61
--- /dev/null
+++ b/doc/roots/cubic_roots.qbk
@@ -0,0 +1,110 @@
+[/
+Copyright (c) 2021 Nick Thompson
+Use, modification and distribution are subject to the
+Boost Software License, Version 1.0. (See accompanying file
+LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+]
+
+[section:cubic_roots Roots of Cubic Polynomials]
+
+[heading Synopsis]
+
+```
+#include <boost/math/roots/cubic_roots.hpp>
+
+namespace boost::math::tools {
+
+// Solves ax³ + bx² + cx + d = 0.
+std::array<Real,3> cubic_roots(Real a, Real b, Real c, Real d);
+
+// Computes the empirical residual p(r) (first element) and expected residual ε|rṗ(r)| (second element) for a root.
+// Recall that for a numerically computed root r of a function p, |p(r)| ≤ ε|rṗ(r)|.
+template<typename Real>
+std::array<Real, 2> cubic_root_residual(Real a, Real b, Real c, Real d, Real root);
+}
+```
+
+[heading Background]
+
+The `cubic_roots` function extracts all real roots of a cubic polynomial ax³ + bx² + cx + d.
+The result is a `std::array<Real, 3>`, which has length three, irrespective of whether there are 3 real roots.
+There is always 1 real root, and hence (barring overflow or other exceptional circumstances), the first element of the
+`std::array` is always populated.
+If there is only one real root of the polynomial, then the second and third elements are set to `nans`.
+The roots are returned in nondecreasing order.
+
+Be careful with double roots.
+First, if you have a real double root, it is numerically indistinguishable from a complex conjugate pair of roots,
+where the complex part is tiny.
+Second, the condition number of rootfinding is infinite at a double root,
+so even changes as subtle as different instruction generation can change the outcome.
+We have some heuristics in place to detect double roots, but these should be regarded with suspicion.
+
+
+[heading Example]
+
+```
+#include <iostream>
+#include <sstream>
+#include <boost/math/tools/cubic_roots.hpp>
+
+using boost::math::tools::cubic_roots;
+using boost::math::tools::cubic_root_residual;
+
+template<typename Real>
+std::string print_roots(std::array<Real, 3> const & roots) {
+    std::ostringstream out;
+    out << "{" << roots[0] << ", " << roots[1] << ", " << roots[2] << "}";
+    return out.str();
+}
+
+int main() {
+    // Solves x³ - 6x² + 11x - 6 = (x-1)(x-2)(x-3).
+    auto roots = cubic_roots(1.0, -6.0, 11.0, -6.0);
+    std::cout << "The roots of x³ - 6x² + 11x - 6 are " << print_roots(roots) << ".\n";
+
+    // Double root; YMMV:
+    // (x+1)²(x-2) = x³ - 3x - 2:
+    roots = cubic_roots(1.0, 0.0, -3.0, -2.0);
+    std::cout << "The roots of x³ - 3x - 2 are " << print_roots(roots) << ".\n";
+
+    // Single root: (x-i)(x+i)(x-3) = x³ - 3x² + x - 3:
+    roots = cubic_roots(1.0, -3.0, 1.0, -3.0);
+    std::cout << "The real roots of x³ - 3x² + x - 3 are " << print_roots(roots) << ".\n";
+
+    // I don't know the roots of x³ + 6.28x² + 2.3x + 3.6;
+    // how can I see if they've been computed correctly?
+    roots = cubic_roots(1.0, 6.28, 2.3, 3.6);
+    std::cout << "The real root of x³ +6.28x² + 2.3x + 3.6 is " << roots[0] << ".\n";
+    auto res = cubic_root_residual(1.0, 6.28, 2.3, 3.6, roots[0]);
+    std::cout << "The residual is " << res[0] << ", and the expected residual is " << res[1] << ". ";
+    if (abs(res[0]) <= res[1]) {
+        std::cout << "This is an expected accuracy.\n";
+    } else {
+        std::cerr << "The residual is unexpectedly large.\n";
+    }
+}
+```
+
+This prints:
+
+```
+The roots of  x³ - 6x² + 11x - 6 are {1, 2, 3}.
+The roots of  x³ - 3x - 2 are {-1, -1, 2}.
+The real roots of x³ - 3x² + x - 3 are {3, nan, nan}.
+The real root of x³ +6.28x² + 2.3x + 3.6 is -5.99656.
+The residual is -1.56586e-14, and the expected residual is 4.64155e-14. This is an expected accuracy.
+```
+
+[heading Performance and Accuracy]
+
+On an Intel laptop chip running at 2700 MHz, we observe 3 roots taking ~90ns to compute.
+If the polynomial only possesses a single real root, it takes ~50ns.
+See `reporting/performance/cubic_roots_performance.cpp`.
+
+The forward error cannot be effectively bounded.
+However, for an approximate root x returned by this routine, the residuals should be constrained by |p(x)| ≤ ε|xṗ(x)|.
+
+
+[endsect]
+[/section:cubic_roots]
diff --git a/doc/roots/roots_overview.qbk b/doc/roots/roots_overview.qbk
index 5f1490d668..770432af5d 100644
--- a/doc/roots/roots_overview.qbk
+++ b/doc/roots/roots_overview.qbk
@@ -17,6 +17,7 @@ There are several fully-worked __root_finding_examples, including:
 
 [include roots_without_derivatives.qbk]
 [include roots.qbk]
+[include cubic_roots.qbk]
 [include root_finding_examples.qbk]
 [include minima.qbk]
 [include root_comparison.qbk]
diff --git a/include/boost/math/tools/cubic_roots.hpp b/include/boost/math/tools/cubic_roots.hpp
index 2736fc4ba1..e3070dbebf 100644
--- a/include/boost/math/tools/cubic_roots.hpp
+++ b/include/boost/math/tools/cubic_roots.hpp
@@ -1,4 +1,4 @@
-//  (C) Copyright Nick Thompson 2019.
+//  (C) Copyright Nick Thompson 2021.
 //  Use, modification and distribution are subject to the
 //  Boost Software License, Version 1.0. (See accompanying file
 //  LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -30,7 +30,7 @@ std::array<Real, 3> cubic_roots(Real a, Real b, Real c, Real d) {
                                  std::numeric_limits<Real>::quiet_NaN(),
                                  std::numeric_limits<Real>::quiet_NaN()};
     if (a == 0) {
-        // bx^2 + cx + d = 0:
+        // bx² + cx + d = 0:
         if (b == 0) {
             // cx + d = 0:
             if (c == 0) {
@@ -65,7 +65,7 @@ std::array<Real, 3> cubic_roots(Real a, Real b, Real c, Real d) {
     Real Q = (p*p - 3*q)/9;
     Real R = (2*p*p*p - 9*p*q + 27*r)/54;
     if (R*R < Q*Q*Q) {
-        //std::cout << "In the R^2 < Q^3 branch.\n";
+        //std::cout << "In the R² < Q³ branch.\n";
         Real rtQ = sqrt(Q);
         Real theta = acos(R/(Q*rtQ))/3;
         Real st = sin(theta);
@@ -73,45 +73,33 @@ std::array<Real, 3> cubic_roots(Real a, Real b, Real c, Real d) {
         roots[0] = -2*rtQ*ct - p/3;
         roots[1] = -rtQ*(-ct + sqrt(Real(3))*st) - p/3;
         roots[2] = rtQ*(ct + sqrt(Real(3))*st) - p/3;
-        // This formula is not super accurate.
-        // Do a cleanup iteration.
-        for (auto & r : roots) {
-            // Horner's method.
-            // Here I'll take John Gustaffson's opinion that the fma is a *distinct* operation from a*x +b:
-            // Make sure to compile these fmas into a single instruction!
-            Real f = fma(a, r, b);
-            f = fma(f,r,c);
-            f = fma(f,r,d);
-            Real df = fma(3*a, r, 2*b);
-            df = fma(df, r, c);
-            if (df != 0) {
-                // No standard library feature for fused-divide add!
-                r -= f/df;
-            }
+    } else {
+        // In Numerical Recipes, Chapter 5, Section 6, it is claimed that we only have one real root
+        // if R² >= Q³. But this isn't true; we can even see this from equation 5.6.18.
+        // The condition for having three real roots is that A = B.
+        // It *is* the case that if we're in this branch, and we have 3 real roots, two are a double root.
+        // Take (x+1)²(x-2) = x³ - 3x -2 as an example. This clearly has a double root at x = -1,
+        // and it gets sent into this branch.
+        Real arg = R*R - Q*Q*Q;
+        Real A = -detail::sgn(R)*cbrt(abs(R) + sqrt(arg));
+        Real B = 0;
+        if (A != 0) {
+            B = Q/A;
+        }
+        roots[0] = A + B - p/3;
+        // Yes, we're comparing floats for equality:
+        // Any perturbation pushes the roots into the complex plane; out of the bailiwick of this routine.
+        if (A == B || arg == 0) {
+            roots[1] = -A - p/3;
+            roots[2] = -A - p/3;
         }
-        std::sort(roots.begin(), roots.end());
-        return roots;
-    }
-    // In Numerical Recipes, Chapter 5, Section 6, it is claimed that we only have one real root
-    // if R^2 >= Q^3. But this isn't true; we can even see this from equation 5.6.18.
-    // The condition for having three real roots is that A = B.
-    // It *is* the case that if we're in this branch, and we have 3 real roots, two are a double root.
-    // Take (x+1)^2(x-2) = x^3 - 3x -2 as an example. This clearly has a double root at x = -1,
-    // and it gets sent into this branch.
-    Real arg = R*R - Q*Q*Q;
-    Real A = -detail::sgn(R)*cbrt(abs(R) + sqrt(arg));
-    Real B = 0;
-    if (A != 0) {
-        B = Q/A;
-    }
-    roots[0] = A + B - p/3;
-    // Yes, we're comparing floats for equality:
-    // Any perturbation pushes the roots into the complex plane; out of the bailiwick of this routine.
-    if (A == B || arg == 0) {
-        roots[1] = -A - p/3;
-        roots[2] = -A - p/3;
     }
+    // Root polishing:
     for (auto & r : roots) {
+        // Horner's method.
+        // Here I'll take John Gustaffson's opinion that the fma is a *distinct* operation from a*x +b:
+        // Make sure to compile these fmas into a single instruction and not a function call!
+        // (I'm looking at you Windows.)
         Real f = fma(a, r, b);
         f = fma(f,r,c);
         f = fma(f,r,d);
@@ -126,5 +114,25 @@ std::array<Real, 3> cubic_roots(Real a, Real b, Real c, Real d) {
     return roots;
 }
 
+// Computes the empirical residual p(r) (first element) and expected residual ε|rṗ(r)| (second element) for a root.
+// Recall that for a numerically computed root r of a function p, |p(r)| ≤ ε|rṗ(r)|.
+template<typename Real>
+std::array<Real, 2> cubic_root_residual(Real a, Real b, Real c, Real d, Real root) {
+    using std::fma;
+    using std::abs;
+    std::array<Real, 2> out;
+    Real residual = fma(a, root, b);
+    residual = fma(residual,root,c);
+    residual = fma(residual,root,d);
+
+    out[0] = residual;
+
+    Real expected_residual = fma(3*a, root, 2*b);
+    expected_residual = fma(expected_residual, root, c);
+    expected_residual = abs(root*expected_residual)*std::numeric_limits<Real>::epsilon();
+    out[1] = expected_residual;
+    return out;
+}
+
 }
 #endif
diff --git a/reporting/performance/cubic_roots_performance.cpp b/reporting/performance/cubic_roots_performance.cpp
index 5eb9a993f6..3550677dda 100644
--- a/reporting/performance/cubic_roots_performance.cpp
+++ b/reporting/performance/cubic_roots_performance.cpp
@@ -16,8 +16,9 @@ template<class Real>
 void CubicRoots(benchmark::State& state)
 {
     std::random_device rd;
-    //auto seed = rd();
-    uint32_t seed = 416683252;
+    auto seed = rd();
+    // This seed generates 3 real roots:
+    //uint32_t seed = 416683252;
     std::mt19937_64 mt(seed);
     std::uniform_real_distribution<Real> unif(-10, 10);
 
@@ -30,12 +31,10 @@ void CubicRoots(benchmark::State& state)
         auto roots = cubic_roots(a,b,c,d);
         benchmark::DoNotOptimize(roots[0]);
     }
-    std::cout << "Just solved " << a << "x^3 + " << b << "x^2 + " << c << "x + " << d << "\n";
-    std::cout << "This was generated by seed " << seed << "\n";
 }
 
-//BENCHMARK_TEMPLATE(CubicRoots, float);
+BENCHMARK_TEMPLATE(CubicRoots, float);
 BENCHMARK_TEMPLATE(CubicRoots, double);
-//BENCHMARK_TEMPLATE(CubicRoots, long double);
+BENCHMARK_TEMPLATE(CubicRoots, long double);
 
 BENCHMARK_MAIN();
diff --git a/test/Jamfile.v2 b/test/Jamfile.v2
index dc6682abaf..bc1859fbbb 100644
--- a/test/Jamfile.v2
+++ b/test/Jamfile.v2
@@ -990,6 +990,7 @@ test-suite misc :
    [ run signal_statistics_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
    [ run anderson_darling_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
    [ run ljung_box_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
+   [ run cubic_roots_test.cpp : : : [ requires cxx17_if_constexpr cxx17_std_apply ] ]
    [ run test_t_test.cpp : : : [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : <define>BOOST_MATH_TEST_FLOAT128 <linkflags>"-Bstatic -lquadmath -Bdynamic" ] [ requires cxx11_hdr_forward_list cxx11_hdr_atomic cxx11_hdr_thread cxx11_hdr_tuple cxx11_hdr_future cxx11_sfinae_expr ]  ]
    [ run test_z_test.cpp : : : [ check-target-builds ../config//has_float128 "GCC libquadmath and __float128 support" : <define>BOOST_MATH_TEST_FLOAT128 <linkflags>"-Bstatic -lquadmath -Bdynamic" ] [ requires cxx11_hdr_forward_list cxx11_hdr_atomic cxx11_hdr_thread cxx11_hdr_tuple cxx11_hdr_future cxx11_sfinae_expr ]  ]
    [ run bivariate_statistics_test.cpp : : : [ requires cxx11_hdr_forward_list cxx11_hdr_atomic cxx11_hdr_thread cxx11_hdr_tuple cxx11_hdr_future cxx11_sfinae_expr ]  ]
diff --git a/test/cubic_roots_test.cpp b/test/cubic_roots_test.cpp
index b87dbeb3d8..48e0d4d2c7 100644
--- a/test/cubic_roots_test.cpp
+++ b/test/cubic_roots_test.cpp
@@ -98,13 +98,13 @@ void test_zero_coefficients()
 
         auto roots = cubic_roots(a, b, c, d);
         // I could check the condition number here, but this is fine right?
-        if(!CHECK_ULP_CLOSE(r[0], roots[0], 3)) {
+        if(!CHECK_ULP_CLOSE(r[0], roots[0], 25)) {
             std::cerr << "  Polynomial x^3 + " << b << "x^2 + " << c << "x + " << d << " has roots {";
             std::cerr << r[0] << ", " << r[1] << ", " << r[2] << "}, but the computed roots are {";
             std::cerr << roots[0] << ", " << roots[1] << ", " << roots[2] << "}\n";
         }
-        CHECK_ULP_CLOSE(r[1], roots[1], 3);
-        CHECK_ULP_CLOSE(r[2], roots[2], 3);
+        CHECK_ULP_CLOSE(r[1], roots[1], 25);
+        CHECK_ULP_CLOSE(r[2], roots[2], 25);
     }
 }