diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index ce6cf5c21e5eb2..65c8ecaaa1858a 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -87,6 +87,19 @@ jobs: name: ruff path: target/debug/ruff + cargo-fuzz: + runs-on: ubuntu-latest + name: "cargo fuzz" + steps: + - uses: actions/checkout@v3 + - name: "Install Rust toolchain" + run: rustup show + - uses: Swatinem/rust-cache@v2 + - name: "Install cargo-binstall" + uses: taiki-e/install-action@cargo-binstall + - run: cargo binstall cargo-fuzz -y + - run: cargo fuzz build -s none + cargo-test-wasm: runs-on: ubuntu-latest name: "cargo test (wasm)" diff --git a/crates/ruff/src/lib.rs b/crates/ruff/src/lib.rs index f1a9fe1be68daf..fc73e6e40223a6 100644 --- a/crates/ruff/src/lib.rs +++ b/crates/ruff/src/lib.rs @@ -35,5 +35,5 @@ mod rule_selector; pub mod rules; pub mod settings; -#[cfg(test)] -mod test; +#[cfg(any(test, fuzzing))] +pub mod test; diff --git a/crates/ruff/src/test.rs b/crates/ruff/src/test.rs index fba3c3b389836d..60a4eb9739fef9 100644 --- a/crates/ruff/src/test.rs +++ b/crates/ruff/src/test.rs @@ -1,8 +1,9 @@ -#![cfg(test)] +#![cfg(any(test, fuzzing))] //! Helper functions for the tests of rule implementations. use std::path::Path; +#[cfg(not(fuzzing))] use anyhow::Result; use itertools::Itertools; use ruff_textwrap::dedent; @@ -21,11 +22,13 @@ use crate::registry::AsRule; use crate::rules::pycodestyle::rules::syntax_error; use crate::settings::{flags, Settings}; +#[cfg(not(fuzzing))] pub(crate) fn test_resource_path(path: impl AsRef) -> std::path::PathBuf { Path::new("./resources/test/").join(path) } /// Run [`check_path`] on a file in the `resources/test/fixtures` directory. 
+#[cfg(not(fuzzing))] pub(crate) fn test_path(path: impl AsRef, settings: &Settings) -> Result> { let path = test_resource_path("fixtures").join(path); let contents = std::fs::read_to_string(&path)?; @@ -33,17 +36,27 @@ pub(crate) fn test_path(path: impl AsRef, settings: &Settings) -> Result Vec { +pub fn test_snippet(contents: &str, settings: &Settings) -> Vec { let path = Path::new(""); let contents = dedent(contents); test_contents(&contents, path, settings) } +thread_local! { + static MAX_ITERATIONS: std::cell::Cell = std::cell::Cell::new(20); +} + +pub fn set_max_iterations(max: usize) { + MAX_ITERATIONS.with(|iterations| iterations.set(max)); +} + +pub(crate) fn max_iterations() -> usize { + MAX_ITERATIONS.with(std::cell::Cell::get) +} + /// A convenient wrapper around [`check_path`], that additionally /// asserts that autofixes converge after a fixed number of iterations. fn test_contents(contents: &str, path: &Path, settings: &Settings) -> Vec { - static MAX_ITERATIONS: usize = 20; - let tokens: Vec = ruff_rustpython::tokenize(contents); let locator = Locator::new(contents); let stylist = Stylist::from_tokens(&tokens, &locator); @@ -83,14 +96,16 @@ fn test_contents(contents: &str, path: &Path, settings: &Settings) -> Vec", + "Addison Crump ", +] +publish = false +edition = "2021" + +[features] +default = ["libfuzzer"] +full-idempotency = [] +libafl = ["libafl_libfuzzer"] +libafl_merge = ["libafl", "libafl_libfuzzer/merge"] +libfuzzer = ["libfuzzer-sys/link_libfuzzer"] + +[package.metadata] +cargo-fuzz = true + +[dependencies] +arbitrary = { version = "1.3.0", features = ["derive"] } +libafl_libfuzzer = { git = "https://github.com/AFLplusplus/LibAFL.git", branch = "libfuzzer", optional = true } +libfuzzer-sys = { git = "https://github.com/rust-fuzz/libfuzzer", default-features = false } +ruff = { path = "../crates/ruff" } +ruff_python_ast = { path = "../crates/ruff_python_ast" } +ruff_python_formatter = { path = "../crates/ruff_python_formatter" } +similar = 
{ version = "2.2.1" } + +# Prevent this from interfering with workspaces +[workspace] +members = ["."] + +[[bin]] +name = "ruff_parse_simple" +path = "fuzz_targets/ruff_parse_simple.rs" + +[[bin]] +name = "ruff_fix_validity" +path = "fuzz_targets/ruff_fix_validity.rs" + +[[bin]] +name = "ruff_parse_idempotency" +path = "fuzz_targets/ruff_parse_idempotency.rs" + +[profile.release] +opt-level = 3 +debug = true + +[profile.dev] +opt-level = 3 +debug = true + +[profile.test] +opt-level = 3 +debug = true diff --git a/fuzz/README.md b/fuzz/README.md new file mode 100644 index 00000000000000..42907fd9f18e42 --- /dev/null +++ b/fuzz/README.md @@ -0,0 +1,105 @@ +# ruff-fuzz + +Fuzzers and associated utilities for automatic testing of Ruff. + +## Usage + +To use the fuzzers provided in this directory, start by invoking: + +```bash +./fuzz/init-fuzzer.sh +``` + +This will install [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz) and optionally download a +[dataset](https://zenodo.org/record/3628784) which improves the efficacy of the testing. +The dataset may take several hours to download and clean, so if you're just looking to try out the +fuzzers, skip the dataset download, though be warned that some features simply cannot be tested +without it (it is very unlikely for the fuzzer to generate valid Python code from "thin air"). + +Once you have initialised the fuzzers, you can then execute any fuzzer with: + +```bash +cargo fuzz run -s none name_of_fuzzer -- -timeout=1 +``` + +You can view the names of the available fuzzers with `cargo fuzz list`. +For specific details about how each fuzzer works, please read this document in its entirety. + +**IMPORTANT: You should run `./fuzz/reinit-fuzzer.sh` after adding more file-based testcases.** This will +allow the testing of new features that you've added unit tests for. + +### Debugging a crash + +Once you've found a crash, you'll need to debug it. 
+The easiest first step in this process is to minimise the input such that the crash is still +triggered with a smaller input. +`cargo-fuzz` supports this out of the box with: + +```bash +cargo fuzz tmin -s none name_of_fuzzer artifacts/name_of_fuzzer/crash-... +``` + +From here, you will need to analyse the input and potentially the behaviour of the program. +The debugging process from here is unfortunately less well-defined, so you will need to apply some +expertise here. +Happy hunting! + +## A brief introduction to fuzzers + +Fuzzing, or fuzz testing, is the process of providing generated data to a program under test. +The most common variety of fuzzers are mutational fuzzers; given a set of existing inputs (a +"corpus"), it will attempt to slightly change (or "mutate") these inputs into new inputs that cover +parts of the code that haven't yet been observed. +Using this strategy, we can quite efficiently generate testcases which cover significant portions of +the program, both with expected and unexpected data. +[This is really quite effective for finding bugs.](https://github.com/rust-fuzz/trophy-case) + +The fuzzers here use [`cargo-fuzz`](https://github.com/rust-fuzz/cargo-fuzz), a utility which allows +Rust to integrate with [libFuzzer](https://llvm.org/docs/LibFuzzer.html), the fuzzer library built +into LLVM. +Each source file present in [`fuzz_targets`](fuzz_targets) is a harness, which is, in effect, a unit +test which can handle different inputs. +When an input is provided to a harness, the harness processes this data and libFuzzer observes the +code coverage and any special values used in comparisons over the course of the run. +Special values are preserved for future mutations and inputs which cover new regions of code are +added to the corpus. + +## Each fuzzer harness in detail + +Each fuzzer harness in [`fuzz_targets`](fuzz_targets) targets a different aspect of Ruff and tests +them in different ways. 
While there is implementation-specific documentation in the source code +itself, each harness is briefly described below. + +### `ruff_parse_simple` + +This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing +and unparsing of a particular input (what would normally be a source code file) does not crash. +While this is unlikely to find any issues on its own, it executes very quickly and covers a large +and diverse code region that may speed up the generation of inputs and therefore make a more +valuable corpus quickly. +It is particularly useful if you skip the dataset generation. + +### `ruff_parse_idempotency` + +This fuzz harness checks that Ruff's parser is idempotent in order to check that it is not +incorrectly parsing or unparsing an input. +It can be built in two modes: default (where it is only checked that the parser does not enter an +unstable state) or full idempotency (the parser is checked to ensure that it will _always_ produce +the same output after the first unparsing). +Full idempotency mode can be used by enabling the `full-idempotency` feature when running the +fuzzer, but this may be too strict of a restriction for initial testing. + +### `ruff_fix_validity` + +This fuzz harness checks that fixes applied by Ruff do not introduce new errors using the existing +[`ruff::test::test_snippet`](../crates/ruff/src/test.rs) testing utility. +It currently is only configured to use default settings, but may be extended in future versions to +test non-default linter settings. + +## Experimental settings + +You can optionally use `--no-default-features --features libafl` to use the libafl fuzzer instead of +libfuzzer. +This fuzzer has experimental support, but can vastly improve fuzzer performance. +If you are not already familiar with [LibAFL](https://github.com/AFLplusplus/LibAFL), this mode is +not currently recommended. 
diff --git a/fuzz/corpus/ruff_parse_idempotency b/fuzz/corpus/ruff_parse_idempotency
new file mode 120000
index 00000000000000..61e7ad4b4cd6fa
--- /dev/null
+++ b/fuzz/corpus/ruff_parse_idempotency
@@ -0,0 +1 @@
+ruff_parse_simple
\ No newline at end of file
diff --git a/fuzz/corpus/ruff_parse_simple b/fuzz/corpus/ruff_parse_simple
new file mode 120000
index 00000000000000..018c02efec25c4
--- /dev/null
+++ b/fuzz/corpus/ruff_parse_simple
@@ -0,0 +1 @@
+ruff_fix_validity/
\ No newline at end of file
diff --git a/fuzz/fuzz_targets/ruff_fix_validity.rs b/fuzz/fuzz_targets/ruff_fix_validity.rs
new file mode 100644
index 00000000000000..d21959ef009419
--- /dev/null
+++ b/fuzz/fuzz_targets/ruff_fix_validity.rs
@@ -0,0 +1,36 @@
+//! Fuzzer harness which actively tries to find testcases that cause Ruff to introduce errors into
+//! the resulting file.
+
+#![no_main]
+
+#[cfg(feature = "libafl")]
+extern crate libafl_libfuzzer;
+
+use libfuzzer_sys::{fuzz_target, Corpus};
+use ruff::settings::Settings;
+use std::sync::OnceLock;
+
+/// Default linter settings, built on first use and shared by every fuzz iteration.
+static SETTINGS: OnceLock<Settings> = OnceLock::new();
+
+/// Feeds `case` to Ruff's `test_snippet` helper using the default settings.
+///
+/// Returns `Corpus::Reject` for inputs that are not valid UTF-8 and `Corpus::Keep` otherwise.
+fn do_fuzz(case: &[u8]) -> Corpus {
+    // throw away inputs which aren't utf-8
+    let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
+
+    // the settings are immutable to test_snippet, so we avoid re-initialising here
+    let settings = SETTINGS.get_or_init(Settings::default);
+
+    // unlike in the test framework, where the number of iterations is well-defined, we are only
+    // looking for situations where a fix is bad; thus, we set the iterations to "infinite"
+    // (the limit lives in a thread-local, so it is re-applied on whichever thread runs us)
+    ruff::test::set_max_iterations(usize::MAX);
+
+    let _ = ruff::test::test_snippet(code, settings);
+
+    Corpus::Keep
+}
+
+fuzz_target!(|case: &[u8]| -> Corpus { do_fuzz(case) });
diff --git a/fuzz/fuzz_targets/ruff_parse_idempotency.rs b/fuzz/fuzz_targets/ruff_parse_idempotency.rs
new file mode 100644
index 00000000000000..c09eee16c073b6
--- /dev/null
+++ b/fuzz/fuzz_targets/ruff_parse_idempotency.rs
@@ -0,0 +1,79 @@
+//! Fuzzer harness which searches for situations where the parser does not parse or unparse a
+//! particular source snippet consistently.
+
+#![no_main]
+
+#[cfg(feature = "libafl")]
+extern crate libafl_libfuzzer;
+
+use libfuzzer_sys::{fuzz_target, Corpus};
+use ruff_python_ast::source_code::round_trip;
+use similar::TextDiff;
+
+/// Name handed to `round_trip` for every pass over the fuzzed source.
+const SOURCE_NAME: &str = "fuzzed-source.py";
+
+/// Renders a unified diff of two round-trip results for a failure report.
+///
+/// Only called once a mismatch has been detected, so inputs which round-trip consistently never
+/// pay for the diff.
+fn render_diff(before: &str, after: &str, before_label: &str, after_label: &str) -> String {
+    TextDiff::from_lines(before, after)
+        .unified_diff()
+        .header(before_label, after_label)
+        .to_string()
+}
+
+/// Round-trips `case` up to three times and panics when the results are inconsistent.
+///
+/// Returns `Corpus::Reject` for inputs that are not valid UTF-8 and `Corpus::Keep` otherwise,
+/// including inputs whose first round trip fails.
+fn do_fuzz(case: &[u8]) -> Corpus {
+    let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
+
+    // round trip it once to get a formatted version; inputs which fail here are simply kept
+    let Ok(first) = round_trip(code, SOURCE_NAME) else { return Corpus::Keep; };
+
+    // round trip it a second time to get a case to compare against
+    let Ok(second) = round_trip(&first, SOURCE_NAME) else {
+        panic!(
+            "Unable to perform the second round trip!\nbefore: {:?}\nfirst: {:?}",
+            code, first
+        );
+    };
+
+    // identical output needs no further checks in either mode, so skip the diffing entirely
+    if first == second {
+        return Corpus::Keep;
+    }
+
+    if cfg!(feature = "full-idempotency") {
+        // potentially, we don't want to test for full idempotency, but just for unsteady states
+        // enable the "full-idempotency" feature when fuzzing for full idempotency
+        let diff = render_diff(&first, &second, "Parsed once", "Parsed twice");
+        // always fails here; `assert_eq!` is kept so the report still shows both sides
+        assert_eq!(
+            first, second,
+            "\nIdempotency violation (orig => first => second); original: {:?}\ndiff:\n{}",
+            code, diff
+        );
+    } else {
+        // by the third time we've round-tripped it, we shouldn't be introducing any more
+        // changes; if we do, then it's likely that we're in an unsteady parsing state
+        let third = round_trip(&second, SOURCE_NAME)
+            .expect("Couldn't round-trip the processed source.");
+        if second != third {
+            let diff = render_diff(&second, &third, "Parsed twice", "Parsed three times");
+            // always fails here; `assert_eq!` is kept so the report still shows both sides
+            assert_eq!(
+                second, third,
+                "\nPotential unsteady state (orig => first => second => third); original: {:?}\ndiff:\n{}",
+                code, diff
+            );
+        }
+    }
+
+    Corpus::Keep
+}
+
+fuzz_target!(|case: &[u8]| -> Corpus { do_fuzz(case) });
diff --git a/fuzz/fuzz_targets/ruff_parse_simple.rs 
b/fuzz/fuzz_targets/ruff_parse_simple.rs
new file mode 100644
index 00000000000000..117ff38f12119c
--- /dev/null
+++ b/fuzz/fuzz_targets/ruff_parse_simple.rs
@@ -0,0 +1,24 @@
+//! Fuzzer harness which merely explores the parse/unparse coverage space and tries to make it
+//! crash. On its own, this fuzzer is (hopefully) not going to find a crash.
+
+#![no_main]
+
+#[cfg(feature = "libafl")]
+extern crate libafl_libfuzzer;
+
+use libfuzzer_sys::{fuzz_target, Corpus};
+use ruff_python_ast::source_code::round_trip;
+
+/// Round-trips `case` once and discards the result.
+///
+/// Returns `Corpus::Reject` for inputs that are not valid UTF-8 and `Corpus::Keep` otherwise.
+fn do_fuzz(case: &[u8]) -> Corpus {
+    let Ok(code) = std::str::from_utf8(case) else { return Corpus::Reject; };
+
+    // just round-trip it once to trigger both parse and unparse
+    let _ = round_trip(code, "fuzzed-source.py");
+
+    Corpus::Keep
+}
+
+fuzz_target!(|case: &[u8]| -> Corpus { do_fuzz(case) });
diff --git a/fuzz/init-fuzzer.sh b/fuzz/init-fuzzer.sh
new file mode 100755
index 00000000000000..eb7e0265058d2f
--- /dev/null
+++ b/fuzz/init-fuzzer.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Prepare the fuzzers: install `cargo-fuzz` when it is missing and, on the first run, seed and
+# minimise the `ruff_fix_validity` corpus (optionally from a downloaded Python source dataset).
+
+# https://stackoverflow.com/a/246128/3549270
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+# Every path below is relative to this directory, so give up if it cannot be entered.
+cd "$SCRIPT_DIR" || exit 1
+
+if ! cargo fuzz --help >&/dev/null; then
+  cargo install --git https://github.com/rust-fuzz/cargo-fuzz.git
+fi
+
+if [ ! -d corpus/ruff_fix_validity ]; then
+  mkdir -p corpus/ruff_fix_validity
+  # Only an explicit "y"/"Y" starts the download, so the prompt advertises "no" as the default.
+  read -p "Would you like to build a corpus from a python source code dataset? (this will take a long time!) [y/N] " -n 1 -r
+  echo
+  cd corpus/ruff_fix_validity || exit 1
+  if [[ $REPLY =~ ^[Yy]$ ]]; then
+    curl -L 'https://zenodo.org/record/3628784/files/python-corpus.tar.gz?download=1' | tar xz
+  fi
+  # Ruff's own test resources always go into the corpus, with or without the dataset.
+  cp -r "../../../crates/ruff/resources/test" .
+  cd - || exit 1
+  cargo fuzz cmin -s none ruff_fix_validity
+fi
+
+echo "Done! You are ready to fuzz."
diff --git a/fuzz/reinit-fuzzer.sh b/fuzz/reinit-fuzzer.sh
new file mode 100755
index 00000000000000..a1acb8328fb653
--- /dev/null
+++ b/fuzz/reinit-fuzzer.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+# Refresh the `ruff_fix_validity` corpus: copy Ruff's test resources into the corpus directory,
+# then minimise the corpus with `cargo fuzz cmin`.
+
+# Stop at the first failing command so a broken step never ends with a misleading "Done!".
+set -eo pipefail
+
+# https://stackoverflow.com/a/246128/3549270
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+
+cd "$SCRIPT_DIR"
+
+# The corpus directory is created by init-fuzzer.sh; without it there is nothing to refresh.
+if [ ! -d corpus/ruff_fix_validity ]; then
+  echo "corpus/ruff_fix_validity does not exist; run init-fuzzer.sh first." >&2
+  exit 1
+fi
+
+cd corpus/ruff_fix_validity
+# NOTE(review): nothing in this script reads into REPLY, so the dataset is only downloaded again
+# when the caller exports REPLY=y -- confirm whether this branch is wanted at all.
+if [[ $REPLY =~ ^[Yy]$ ]]; then
+  curl -L 'https://zenodo.org/record/3628784/files/python-corpus.tar.gz?download=1' | tar xz
+fi
+cp -r "../../../crates/ruff/resources/test" .
+cd -
+cargo fuzz cmin -s none ruff_fix_validity
+
+echo "Done! You are ready to fuzz."