diff --git a/CHANGELOG.md b/CHANGELOG.md index 77fbc844f..7be360631 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -29,6 +29,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - The Git repository cloning behavior in the `scan` command can now be controlled with the new `--git-clone-mode MODE` parameter. +- In the `scan` command, basic blob metadata is recorded in the datastore for each discovered blob, including blob size in bytes and guessed mime type and charset when available. + A path-based mechanism is used to guess the mime type; at present, this only works for plain file inputs (i.e., not for blobs found in Git history). + Optionally, if the `libmagic` Cargo feature is enabled, libmagic (the guts of the `file` command-line program) is used to guess mime type and charset from content for blobs from all sources. + By default, this metadata is recorded only for blobs in which matches are found; it can be recorded for all blobs using the new `--record-all-blobs true` parameter. + This newly recorded metadata is included in the output of the `report` command. + ### Changes - Existing rules were modified to reduce both false positives and false negatives: @@ -42,7 +48,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - When a Git repository is cloned, the default behavior is to match `git clone --bare` instead of `git clone --mirror`. This new default behavior results in cloning potentially less content, but avoids cloning content from forks of repositories hosted on GitHub. -- The command-line help has been refined for clarity +- The command-line help has been refined for clarity. + +- Scanning performance has been improved on certain workloads by as much as 2x by recording matches to the datastore in larger batches. + This is particularly relevant to heavily multithreaded scanning workloads where the inputs have many matches.
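Editor's note on the blob-metadata entry above: the following standalone sketch (not part of this patch) illustrates how a path-based MIME guess from the `mime_guess` crate can be reduced to the `mime_essence` and `charset` strings that the new metadata records. The helper name `guess_essence_and_charset` is hypothetical, and the optional libmagic content-based path is omitted; it assumes the `mime` and `mime_guess` crates that this patch adds as dependencies.

```rust
use std::path::Path;

// Hypothetical helper mirroring the reduction done in the patched `cmd_scan.rs`:
// take a path-based guess via `mime_guess` and keep only the essence and charset strings.
fn guess_essence_and_charset(path: &Path) -> (Option<String>, Option<String>) {
    match mime_guess::from_path(path).first() {
        None => (None, None),
        Some(m) => {
            let essence = m.essence_str().to_owned();
            let charset = m.get_param(mime::CHARSET).map(|c| c.to_string());
            (Some(essence), charset)
        }
    }
}

fn main() {
    // For a plain-text input, this typically yields Some("text/plain") and no charset.
    let (essence, charset) = guess_essence_and_charset(Path::new("example.txt"));
    println!("mime_essence={essence:?} charset={charset:?}");
}
```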
### Fixes diff --git a/Cargo.lock b/Cargo.lock index cd3b632c4..9d6a78a8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,15 +77,15 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41ed9a86bf92ae6580e0a31281f65a1b1d867c0cc68d5346e2ae128dddfa6a7d" +checksum = "3a30da5c5f2d5e72842e00bcb57657162cdabef0931f40e2deb9b4140440cecd" [[package]] name = "anstyle-parse" -version = "0.2.0" +version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e765fd216e48e067936442276d1d57399e37bce53c264d6fefbe298080cb57ee" +checksum = "938874ff5980b03a87c5524b3ae5b59cf99b1d6bc836848df7bc5ada9643c333" dependencies = [ "utf8parse", ] @@ -198,7 +198,7 @@ dependencies = [ "lazycell", "log", "peeking_take_while", - "prettyplease 0.2.8", + "prettyplease 0.2.9", "proc-macro2", "quote", "regex", @@ -333,9 +333,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "80672091db20273a15cf9fdd4e47ed43b5091ec9841bf4c6145c9dfbbcae09ed" +checksum = "2686c4115cb0810d9a984776e197823d08ec94f176549a89a9efded477c456dc" dependencies = [ "clap_builder", "clap_derive", @@ -344,9 +344,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.3.4" +version = "4.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1458a1df40e1e2afebb7ab60ce55c1fa8f431146205aa5f4887e0b111c27636" +checksum = "2e53afce1efce6ed1f633cf0e57612fe51db54a1ee4fd8f8503d078fe02d69ae" dependencies = [ "anstream", "anstyle", @@ -411,6 +411,16 @@ dependencies = [ "windows-sys 0.45.0", ] +[[package]] +name = "content_guesser" +version = "0.13.0-dev" +dependencies = [ + "magic", + "mime", + "mime_guess", + "thiserror", +] + [[package]] name = "core-foundation" version = "0.9.3" @@ -701,6 +711,17 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "errno" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f639046355ee4f37944e44f60642c6f3a7efa3cf6b78c78a0d989a8ce6c396a1" +dependencies = [ + "errno-dragonfly", + "libc", + "winapi", +] + [[package]] name = "errno" version = "0.3.1" @@ -895,9 +916,9 @@ dependencies = [ [[package]] name = "gix" -version = "0.46.0" +version = "0.47.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99368b48a2f68c3fdc26e62c6425bdc4baeb4f30a4f24eb2e0904d29a2ba97ab" +checksum = "10f5281c55e0a7415877d91a15fae4a10ec7444615d64d78e48c07f20bcfcd9b" dependencies = [ "gix-actor", "gix-attributes", @@ -927,6 +948,7 @@ dependencies = [ "gix-revision", "gix-sec", "gix-tempfile", + "gix-trace", "gix-traverse", "gix-url", "gix-utils", @@ -942,9 +964,9 @@ dependencies = [ [[package]] name = "gix-actor" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fe73f9f6be1afbf1bd5be919a9636fa560e2f14d42262a934423ed6760cd838" +checksum = "b70d0d809ee387113df810ab4ebe585a076e35ae6ed59b5b280072146955a3ff" dependencies = [ "bstr", "btoi", @@ -956,9 +978,9 @@ dependencies = [ [[package]] name = "gix-attributes" -version = "0.13.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78b79590ac382f80d87e06416f5fcac6fee5d83dcb152a00ed0bdbaa988acc31" +checksum = "03d7006cc5a508514207154046e18c3c39d98ba98f865ada83b6f3f3886543bb" dependencies = [ "bstr", "gix-glob", @@ -973,36 +995,36 @@ 
dependencies = [ [[package]] name = "gix-bitmap" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc02feb20ad313d52a450852f2005c2205d24f851e74d82b7807cbe12c371667" +checksum = "311e2fa997be6560c564b070c5da2d56d038b645a94e1e5796d5d85a350da33c" dependencies = [ "thiserror", ] [[package]] name = "gix-chunk" -version = "0.4.2" +version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7acf3bc6c4b91e8fb260086daf5e105ea3a6d913f5fd3318137f7e309d6e540" +checksum = "39db5ed0fc0a2e9b1b8265993f7efdbc30379dec268f3b91b7af0c2de4672fdd" dependencies = [ "thiserror", ] [[package]] name = "gix-command" -version = "0.2.5" +version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f6141b70cfb21255223e42f3379855037cbbe8673b58dd8318d2f09b516fad1" +checksum = "bb49ab557a37b0abb2415bca2b10e541277dff0565deb5bd5e99fd95f93f51eb" dependencies = [ "bstr", ] [[package]] name = "gix-commitgraph" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e8490ae1b3d55c47e6a71d247c082304a2f79f8d0332c1a2f5693d42a2021a09" +checksum = "0e498e98d0b477d6a1c1608bee39db201e7a38873460a130a97ce88b4d95b6e1" dependencies = [ "bstr", "gix-chunk", @@ -1014,9 +1036,9 @@ dependencies = [ [[package]] name = "gix-config" -version = "0.23.0" +version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "51f310120ae1ba8f0ca52fb22876ce9bad5b15c8ffb3eb7302e4b64a3b9f681c" +checksum = "33b32541232a2c626849df7843e05b50cb43ac38a4f675abbe2f661874fc1e9d" dependencies = [ "bstr", "gix-config-value", @@ -1036,9 +1058,9 @@ dependencies = [ [[package]] name = "gix-config-value" -version = "0.12.1" +version = "0.12.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f216df1c33e6e1555923eff0096858a879e8aaadd35b5d788641e4e8064c892" +checksum = "4783caa23062f86acfd1bc9e72c62250923d1673171ce1a524d9486f8a4556a8" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1049,9 +1071,9 @@ dependencies = [ [[package]] name = "gix-credentials" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f89fea8acd28f5ef8fa5042146f1637afd4d834bc8f13439d8fd1e5aca0d65" +checksum = "7dcec518a8db5b2e342ea7a2e785f46fd176b1b689ddd3f43052701bf3fa8ee3" dependencies = [ "bstr", "gix-command", @@ -1065,9 +1087,9 @@ dependencies = [ [[package]] name = "gix-date" -version = "0.5.1" +version = "0.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc164145670e9130a60a21670d9b6f0f4f8de04e5dd256c51fa5a0340c625902" +checksum = "0213f923d63c2c7d10799c1977f42df38ec586ebbf1d14fd00dfa363ac994c2b" dependencies = [ "bstr", "itoa", @@ -1077,9 +1099,9 @@ dependencies = [ [[package]] name = "gix-diff" -version = "0.30.1" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9029ad0083cc286a4bd2f5b3bf66bb66398abc26f2731a2824cd5edfc41a0e33" +checksum = "5049dd5a60d5608912da0ab184f35064901f192f4adf737716789715faffa080" dependencies = [ "gix-hash", "gix-object", @@ -1089,9 +1111,9 @@ dependencies = [ [[package]] name = "gix-discover" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aba9c6c0d1f2b2efe65581de73de4305004612d49c83773e783202a7ef204f46" +checksum = "c14865cb9c6eb817d6a8d53595f1051239d2d31feae7a5e5b2f00910c94a8eb4" dependencies = 
[ "bstr", "dunce", @@ -1104,15 +1126,16 @@ dependencies = [ [[package]] name = "gix-features" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a8c493409bf6060d408eec9bbdd1b12ea351266b50012e2a522f75dfc7b8314" +checksum = "ae82dfceec06c034728c530399ee449f97b1e542e191247c52c169ca6af1fd89" dependencies = [ "bytesize", "crc32fast", "crossbeam-channel", "flate2", "gix-hash", + "gix-trace", "jwalk", "libc", "once_cell", @@ -1126,18 +1149,18 @@ dependencies = [ [[package]] name = "gix-fs" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30da8997008adb87f94e15beb7ee229f8a48e97af585a584bfee4a5a1880aab5" +checksum = "bb15956bc0256594c62a2399fcf6958a02a11724217eddfdc2b49b21b6292496" dependencies = [ "gix-features", ] [[package]] name = "gix-glob" -version = "0.8.0" +version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd0ade1e80ab1f079703d1824e1daf73009096386aa7fd2f0477f6e4ac0a558e" +checksum = "f45cd7ab22faf154db0a9f5a8011ba9cda8b298b61b7299f43a21bbaf0b3f208" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1147,9 +1170,9 @@ dependencies = [ [[package]] name = "gix-hash" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ee181c85d3955f54c4426e6bfaeeada4428692e1a39b8788c2ac7785fc301dd8" +checksum = "a0dd58cdbe7ffa4032fc111864c80d5f8cecd9a2c9736c97ae7e5be834188272" dependencies = [ "hex", "thiserror", @@ -1157,9 +1180,9 @@ dependencies = [ [[package]] name = "gix-hashtable" -version = "0.2.1" +version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd259bd0d96e6153e357a8cdaca76c48e103fd34208b6c0ce77b1ad995834bd2" +checksum = "2cfd7f4ea905c13579565e3c264ca2c4103d192bd5fce2300c5a884cf1977d61" dependencies = [ "gix-hash", "hashbrown 0.13.2", @@ -1168,9 +1191,9 @@ dependencies = [ [[package]] name = "gix-ignore" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc6f7f101a0ccce808dbf7008ba131dede94e20257e7bde7a44cbb2f8c775625" +checksum = "27e82dec6975012b710837c6cd56353c3111d2308e016118bfc59275fcc8b5d0" dependencies = [ "bstr", "gix-glob", @@ -1180,9 +1203,9 @@ dependencies = [ [[package]] name = "gix-index" -version = "0.18.0" +version = "0.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca0380cdab7863e67966eee4aed32856c864c20b077e026b637af6bb3a9281b4" +checksum = "2ef2fa392d351e62ac3a6309146f61880abfbe0c07474e075d3b2ac78a6834a5" dependencies = [ "bitflags 2.3.2", "bstr", @@ -1202,9 +1225,9 @@ dependencies = [ [[package]] name = "gix-lock" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ec5d5e6f07316d3553aa7425e3ecd935ec29882556021fe1696297a448af8d2" +checksum = "328f50aad713ab606caeaf834459ef915ccdfbb9133ac6cd54616d601aa9249f" dependencies = [ "gix-tempfile", "gix-utils", @@ -1213,39 +1236,42 @@ dependencies = [ [[package]] name = "gix-mailmap" -version = "0.13.0" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4653701922c920e009f1bc4309feaff14882ade017770788f9a150928da3fa6a" +checksum = "d0bef8d360a6a9fc5a6d872471588d8ca7db77b940e48ff20c3b4706ad5f481d" dependencies = [ "bstr", "gix-actor", + "gix-date", "thiserror", ] [[package]] name = "gix-negotiate" -version = "0.2.1" +version = "0.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "945c3ef1e912e44a5f405fc9e924edf42000566a1b257ed52cb1293300f6f08c" +checksum = "b626aafb9f4088058f1baa5d2029b2191820c84f6c81e43535ba70bfdc7b7d56" dependencies = [ "bitflags 2.3.2", "gix-commitgraph", + "gix-date", "gix-hash", "gix-object", - "gix-revision", + "gix-revwalk", "smallvec", "thiserror", ] [[package]] name = "gix-object" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8926c8f51c44dec3e709cb5dbc93deb9e8d4064c43c9efc54c158dcdfe8446c7" +checksum = "255e477ae4cc8d10778238f011e6125b01cc0e7067dc8df87acd67a428a81f20" dependencies = [ "bstr", "btoi", "gix-actor", + "gix-date", "gix-features", "gix-hash", "gix-validate", @@ -1258,11 +1284,12 @@ dependencies = [ [[package]] name = "gix-odb" -version = "0.47.0" +version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "91d98eaba4f649fed17250651c4ddfaf997c80a30f5ee4b47ac9bc18ffe3eb16" +checksum = "6b73469f145d1e6afbcfd0ab6499a366fbbcb958c2999d41d283d6c7b94024b9" dependencies = [ "arc-swap", + "gix-date", "gix-features", "gix-hash", "gix-object", @@ -1276,9 +1303,9 @@ dependencies = [ [[package]] name = "gix-pack" -version = "0.37.0" +version = "0.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "82e9e228f18cd87e7596e687b38619b5e4caebc678644ae6bb3d842598166d72" +checksum = "a1f3bcd1aaa72aea7163b147d2bde2480a01eadefc774a479d38f29920f7f1c8" dependencies = [ "clru", "gix-chunk", @@ -1299,11 +1326,12 @@ dependencies = [ [[package]] name = "gix-path" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c1226f2e50adeb4d76c754c1856c06f13a24cad1624801653fbf09b869e5b808" +checksum = "4ea2a19d82dd55e5fad1d606b8a1ad2f7a804e10caa2efbb169cd37e0a07ede0" dependencies = [ "bstr", + "gix-trace", "home", "once_cell", "thiserror", @@ -1311,9 +1339,9 @@ dependencies = [ [[package]] name = "gix-prompt" -version = "0.5.1" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e15fe57fa48572b7d3bf465d6a2a0351cd3c55cba74fd5f0b9c23689f9c1a31e" +checksum = "8dfd363fd89a40c1e7bff9c9c1b136cd2002480f724b0c627c1bc771cd5480ec" dependencies = [ "gix-command", "gix-config-value", @@ -1324,9 +1352,9 @@ dependencies = [ [[package]] name = "gix-quote" -version = "0.4.4" +version = "0.4.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29d59489bff95b06dcdabe763b7266d3dc0a628cac1ac1caf65a7ca0a43eeae0" +checksum = "3874de636c2526de26a3405b8024b23ef1a327bebf4845d770d00d48700b6a40" dependencies = [ "bstr", "btoi", @@ -1335,11 +1363,12 @@ dependencies = [ [[package]] name = "gix-ref" -version = "0.30.0" +version = "0.31.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebdd999256f4ce8a5eefa89999879c159c263f3493a951d62aa5ce42c0397e1c" +checksum = "9b6c74873a9d8ff5d1310f2325f09164c15a91402ab5cde4d479ae12ff55ed69" dependencies = [ "gix-actor", + "gix-date", "gix-features", "gix-fs", "gix-hash", @@ -1355,9 +1384,9 @@ dependencies = [ [[package]] name = "gix-refspec" -version = "0.11.0" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72bfd622abc86dd8ad1ec51b9eb77b4f1a766b94e3a1b87cf4a022c5b5570cf4" +checksum = "ca1bc6c40bad62570683d642fcb04e977433ac8f76b674860ef7b1483c1f8990" dependencies = [ "bstr", "gix-hash", @@ -1369,9 +1398,9 @@ dependencies = [ [[package]] name 
= "gix-revision" -version = "0.15.2" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5044f56cd7a487ce9b034cbe0252ae0b6b47ff56ca3dabd79bc30214d0932cd7" +checksum = "f3751d6643d731fc5829d2f43ca049f4333c968f30908220ba0783c9dfe5010c" dependencies = [ "bstr", "gix-date", @@ -1384,11 +1413,12 @@ dependencies = [ [[package]] name = "gix-revwalk" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc2623ba8747914f151f5e12b65adac576ab459dbed5f50a36c7a3e9cbf2d3ca" +checksum = "144995229c6e5788b1c7386f8a3f7146ace3745c9a6b56cef9123a7d83b110c5" dependencies = [ "gix-commitgraph", + "gix-date", "gix-hash", "gix-hashtable", "gix-object", @@ -1398,9 +1428,9 @@ dependencies = [ [[package]] name = "gix-sec" -version = "0.8.1" +version = "0.8.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b2b7b38b766eb95dcc5350a9c450030b69892c0902fa35f4a6d0809273bd9dae" +checksum = "47f09860e2ddc7b13119e410c46d8e9f870acc7933fb53ae65817af83a8c9f80" dependencies = [ "bitflags 2.3.2", "gix-path", @@ -1410,9 +1440,9 @@ dependencies = [ [[package]] name = "gix-tempfile" -version = "6.0.0" +version = "7.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3785cb010e9dc5c446dfbf02bc1119fc17d3a48a27c029efcb3a3c32953eb10" +checksum = "4fac8310c17406ea619af72f42ee46dac795110f68f41b4f4fa231b69889c6a2" dependencies = [ "gix-fs", "libc", @@ -1423,13 +1453,20 @@ dependencies = [ "tempfile", ] +[[package]] +name = "gix-trace" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ff8a60073500f4d6edd181432ee11394d843db7dcf05756aa137a1233b1cbf6" + [[package]] name = "gix-traverse" -version = "0.27.0" +version = "0.28.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8673546506391a10fdfd4e48c8e0f3ec92355cf1fac787d2e714c7d45e301ede" +checksum = "c3f6bba1686bfbc7e0e93d4932bc6e14d479c9c9524f7c8d65b25d2a9446a99e" dependencies = [ "gix-commitgraph", + "gix-date", "gix-hash", "gix-hashtable", "gix-object", @@ -1440,9 +1477,9 @@ dependencies = [ [[package]] name = "gix-url" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1663df25ac42047a2547618d2a6979a26f478073f6306997429235d2cd4c863" +checksum = "ff1f984816338039b151a9f5dae6100e1e51e438cf61242ea8136fedc574d825" dependencies = [ "bstr", "gix-features", @@ -1454,18 +1491,18 @@ dependencies = [ [[package]] name = "gix-utils" -version = "0.1.2" +version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbcfcb150c7ef553d76988467d223254045bdcad0dc6724890f32fbe96415da5" +checksum = "1ca284c260845bc0724050aec59c7a596407678342614cdf5a1d69e044f29a36" dependencies = [ "fastrand", ] [[package]] name = "gix-validate" -version = "0.7.5" +version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57ea5845b506c7728b9d89f4227cc369a5fc5a1d5b26c3add0f0d323413a3a60" +checksum = "8d092b594c8af00a3a31fe526d363ee8a51a6f29d8496cdb991ed2f01ec0ec13" dependencies = [ "bstr", "thiserror", @@ -1473,9 +1510,9 @@ dependencies = [ [[package]] name = "gix-worktree" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b32a0e7ed52577bfb050f5350bdee2741d1b08a9ed02a2f2df6effe353896ca" +checksum = "4ee22549d6723189366235e1c6959ccdac73b58197cdbb437684eaa2169edcb9" dependencies = 
[ "bstr", "filetime", @@ -1804,9 +1841,9 @@ checksum = "9f2cb48b81b1dc9f39676bf99f5499babfec7cd8fe14307f7b3d747208fb5690" [[package]] name = "insta" -version = "1.29.0" +version = "1.30.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a28d25139df397cbca21408bb742cf6837e04cdbebf1b07b760caf971d6a972" +checksum = "28491f7753051e5704d4d0ae7860d45fae3238d7d235bc4289dcd45c48d3cec3" dependencies = [ "console", "lazy_static", @@ -1999,6 +2036,29 @@ version = "0.4.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b06a4cde4c0f271a446782e3eff8de789548ce57dbc8eca9292c27f4a42004b4" +[[package]] +name = "magic" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87142e3acb1f4daa62eaea96605421a534119d4777a9fb43fb2784798fd89665" +dependencies = [ + "bitflags 1.3.2", + "errno 0.2.8", + "libc", + "magic-sys", + "thiserror", +] + +[[package]] +name = "magic-sys" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eff86ae08895140d628119d407d568f3b657145ee8c265878064f717534bb3bc" +dependencies = [ + "libc", + "vcpkg", +] + [[package]] name = "matches" version = "0.1.10" @@ -2035,6 +2095,16 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "mime_guess" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef" +dependencies = [ + "mime", + "unicase", +] + [[package]] name = "minimal-lexical" version = "0.2.1" @@ -2104,8 +2174,8 @@ dependencies = [ "bstr", "chrono", "console", + "content_guesser", "gix", - "gix-features", "hex", "hyperx", "ignore", @@ -2113,6 +2183,7 @@ dependencies = [ "indicatif", "indoc", "lazy_static", + "mime", "pretty_assertions", "proptest", "regex", @@ -2121,6 +2192,7 @@ dependencies = [ "secrecy", "serde", "serde_yaml", + "thiserror", "tokio", "tracing", "url", @@ -2136,15 +2208,14 @@ dependencies = [ "assert_fs", "clap", "console", - "gix", - "gix-features", - "hex", + "crossbeam-channel", "ignore", "indenter", "indicatif", "indoc", "insta", "lazy_static", + "mime", "noseyparker", "predicates", "pretty_assertions", @@ -2225,9 +2296,9 @@ checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" [[package]] name = "openssl" -version = "0.10.54" +version = "0.10.55" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69b3f656a17a6cbc115b5c7a40c616947d213ba182135b014d6051b73ab6f019" +checksum = "345df152bc43501c5eb9e4654ff05f794effb78d4efe3d53abc158baddc0703d" dependencies = [ "bitflags 1.3.2", "cfg-if", @@ -2266,9 +2337,9 @@ dependencies = [ [[package]] name = "openssl-sys" -version = "0.9.88" +version = "0.9.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2ce0f250f34a308dcfdbb351f511359857d4ed2134ba715a4eadd46e1ffd617" +checksum = "374533b0e45f3a7ced10fcaeccca020e66656bc03dac384f852e4e5a7a8104a6" dependencies = [ "cc", "libc", @@ -2329,9 +2400,9 @@ checksum = "d4fd5641d01c8f18a23da7b6fe29298ff4b55afcccdf78973b24cf3175fee32e" [[package]] name = "pest" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16833386b02953ca926d19f64af613b9bf742c48dcd5e09b32fbfc9740bf84e2" +checksum = "f73935e4d55e2abf7f130186537b19e7a4abc886a0252380b59248af473a3fc9" 
dependencies = [ "thiserror", "ucd-trie", @@ -2339,9 +2410,9 @@ dependencies = [ [[package]] name = "pest_derive" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7763190f9406839f99e5197afee8c9e759969f7dbfa40ad3b8dbee8757b745b5" +checksum = "aef623c9bbfa0eedf5a0efba11a5ee83209c326653ca31ff019bec3a95bfff2b" dependencies = [ "pest", "pest_generator", @@ -2349,9 +2420,9 @@ dependencies = [ [[package]] name = "pest_generator" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "249061b22e99973da1f5f5f1410284419e283bb60b79255bf5f42a94b66a2e00" +checksum = "b3e8cba4ec22bada7fc55ffe51e2deb6a0e0db2d0b7ab0b103acc80d2510c190" dependencies = [ "pest", "pest_meta", @@ -2362,9 +2433,9 @@ dependencies = [ [[package]] name = "pest_meta" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "457c310cfc9cf3f22bc58901cc7f0d3410ac5d6298e432a4f9a6138565cb6df6" +checksum = "a01f71cb40bd8bb94232df14b946909e14660e33fc05db3e50ae2a82d7ea0ca0" dependencies = [ "once_cell", "pest", @@ -2456,9 +2527,9 @@ dependencies = [ [[package]] name = "prettyplease" -version = "0.2.8" +version = "0.2.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f2b0377b720bde721213a46cda1289b2f34abf0a436907cad91578c20de0454d" +checksum = "9825a04601d60621feed79c4e6b56d65db77cdca55cef43b46b0de1096d1c282" dependencies = [ "proc-macro2", "syn 2.0.18", @@ -2733,7 +2804,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b96e891d04aa506a6d1f318d2771bcb1c7dfda84e126660ace067c9b474bb2c0" dependencies = [ "bitflags 1.3.2", - "errno", + "errno 0.3.1", "io-lifetimes", "libc", "linux-raw-sys", @@ -3273,9 +3344,9 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.25" +version = "0.1.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8803eee176538f94ae9a14b55b2804eb7e1441f8210b1c31290b3bccdccff73b" +checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" dependencies = [ "proc-macro2", "quote", diff --git a/crates/content-guesser/Cargo.toml b/crates/content-guesser/Cargo.toml new file mode 100644 index 000000000..672d5c603 --- /dev/null +++ b/crates/content-guesser/Cargo.toml @@ -0,0 +1,24 @@ +[package] + +edition.workspace = true +rust-version.workspace = true +license.workspace = true +authors.workspace = true +homepage.workspace = true +repository.workspace = true +publish.workspace = true + +name = "content_guesser" +version.workspace = true + +[features] +libmagic = ["magic"] + +[lib] +path = "src/lib.rs" + +[dependencies] +magic = { version = "0.13", optional = true } +mime_guess = "2" +mime = "0.3" +thiserror = "1" diff --git a/crates/content-guesser/src/error.rs b/crates/content-guesser/src/error.rs new file mode 100644 index 000000000..a85b63fde --- /dev/null +++ b/crates/content-guesser/src/error.rs @@ -0,0 +1,6 @@ +#[derive(Debug, thiserror::Error)] +pub enum GuesserError { + #[cfg(feature = "libmagic")] + #[error("libmagic error: {0}")] + MagicError(#[from] magic::MagicError), +} diff --git a/crates/content-guesser/src/guesser.rs b/crates/content-guesser/src/guesser.rs new file mode 100644 index 000000000..4e7582d0d --- /dev/null +++ b/crates/content-guesser/src/guesser.rs @@ -0,0 +1,59 @@ +use mime_guess::MimeGuess; + +use crate::{ + error::GuesserError, + input::Input, + output::Output, +}; + +pub struct Guesser { + 
#[cfg(feature = "libmagic")] + magic_cookie: magic::Cookie, +} + +// Public Implementation +impl Guesser { + #[cfg(feature = "libmagic")] + pub fn new() -> Result<Self, GuesserError> { + use magic::CookieFlags; + let flags = CookieFlags::ERROR | CookieFlags::MIME; + assert!(!flags.contains(CookieFlags::DEBUG)); + let magic_cookie = magic::Cookie::open(flags)?; + // Load the default database + magic_cookie.load::<&str>(&[])?; + Ok(Guesser { magic_cookie }) + } + + #[cfg(not(feature = "libmagic"))] + pub fn new() -> Result<Self, GuesserError> { + Ok(Guesser {}) + } + + pub fn guess<T>(&self, input: Input<T>) -> Output + where + T: AsRef<[u8]>, + { + let mime_guess = input.path.map(MimeGuess::from_path); + + #[cfg(feature = "libmagic")] + let magic_guess = { + use crate::input::{Content, PrefixContent}; + match &input.content { + Content::None => None, + Content::Prefix(PrefixContent { content, .. }) | Content::Full(content) => { + match self.magic_cookie.buffer(content.as_ref()) { + Ok(m) => m.parse().ok(), + _ => None, + } + } + } + }; + #[cfg(not(feature = "libmagic"))] + let magic_guess = None; + + Output { + mime_guess, + magic_guess, + } + } +} diff --git a/crates/content-guesser/src/input.rs b/crates/content-guesser/src/input.rs new file mode 100644 index 000000000..890ed39a0 --- /dev/null +++ b/crates/content-guesser/src/input.rs @@ -0,0 +1,86 @@ +use std::io::Read; +use std::path::Path; + +pub enum Content<T> { + /// No content + None, + + /// An incomplete prefix of the entire contents of a file + Prefix(PrefixContent<T>), + + /// The entire contents of a file + Full(T), +} + +#[allow(dead_code)] +pub struct PrefixContent<T> { + /// The prefix of the full content + pub(crate) content: T, + + /// The length of the full content + pub(crate) full_length: Option<usize>, +} + +/// The input to a `Guesser`. +#[allow(dead_code)] +pub struct Input<'a, T> { + pub(crate) path: Option<&'a Path>, + pub(crate) content: Content<T>, +} + +impl<'a, T> Input<'a, T> { + /// Create an `Input` from a path without any content. No I/O is performed. + pub fn from_path_no_io(path: &'a Path) -> Self { + Self { + path: Some(path), + content: Content::None, + } + } +} + +impl<'a> Input<'a, &'a [u8]> { + pub fn from_path_and_bytes(path: &'a Path, bytes: &'a [u8]) -> Self { + Input { + path: Some(path), + content: Content::Full(bytes), + } + } + + pub fn from_bytes(bytes: &'a [u8]) -> Self { + Input { + path: None, + content: Content::Full(bytes), + } + } +} + +impl<'a> Input<'a, Vec<u8>> { + /// Create an `Input` from the given path, reading at most `max_length` bytes of input. + /// If no `max_length` is given, the entire file contents are read. + pub fn from_path(path: &'a Path, max_length: Option<usize>) -> std::io::Result<Self> { + let metadata = std::fs::metadata(path)?; + let expected_len = metadata.len(); + + let content = if let Some(max_length) = max_length { + let f = std::fs::File::open(path)?; + let mut buf = Vec::with_capacity(max_length); + let actual_len = f.take(max_length as u64).read_to_end(&mut buf)?; + if actual_len < expected_len as usize { + Content::Prefix(PrefixContent { + full_length: Some(expected_len as usize), + content: buf, + }) + } else { + Content::Full(buf) + } + } else { + Content::Full(std::fs::read(path)?)
+ }; + + Ok(Self { + path: Some(path), + content, + }) + } +} + diff --git a/crates/content-guesser/src/lib.rs b/crates/content-guesser/src/lib.rs new file mode 100644 index 000000000..824175799 --- /dev/null +++ b/crates/content-guesser/src/lib.rs @@ -0,0 +1,13 @@ +pub use mime::Mime; + +mod input; +pub use input::{Content, PrefixContent, Input}; + +mod output; +pub use output::Output; + +mod error; +pub use error::GuesserError; + +mod guesser; +pub use guesser::Guesser; diff --git a/crates/content-guesser/src/output.rs b/crates/content-guesser/src/output.rs new file mode 100644 index 000000000..a7e127383 --- /dev/null +++ b/crates/content-guesser/src/output.rs @@ -0,0 +1,33 @@ +use mime::Mime; +use mime_guess::MimeGuess; + +#[derive(Debug)] +pub struct Output { + /// Path-based media type guess + pub(crate) mime_guess: Option<MimeGuess>, + + /// Content-based media type guess + pub(crate) magic_guess: Option<Mime>, +} + +impl Output { + /// Get the path-based media type guess + #[inline] + pub fn path_guess(&self) -> Option<Mime> { + self.mime_guess.and_then(|g| g.first()) + } + + /// Get the content-based media type guess + #[inline] + pub fn content_guess(&self) -> Option<Mime> { + self.magic_guess.clone() + } + + /// Get the guessed mime type that is considered to be the best. + /// + /// If a content-based guess is available, that is used. + /// Otherwise, the path-based guess is used. + pub fn best_guess(&self) -> Option<Mime> { + self.content_guess().or_else(|| self.path_guess()) + } +} diff --git a/crates/noseyparker-cli/Cargo.toml b/crates/noseyparker-cli/Cargo.toml index 57d90eefa..d3f7bcd3f 100644 --- a/crates/noseyparker-cli/Cargo.toml +++ b/crates/noseyparker-cli/Cargo.toml @@ -21,6 +21,7 @@ build = "build.rs" [features] rule_profiling = ["noseyparker/rule_profiling"] +libmagic = ["noseyparker/libmagic"] [[bin]] name = "noseyparker" @@ -35,15 +36,14 @@ vergen = { version = "8.1", features = ["build", "cargo", "git", "gitcl", "rustc anyhow = { version = "1.0" } clap = { version = "4.3", features = ["cargo", "derive", "env", "unicode", "wrap_help"] } console = "0.15" -gix-features = "0.30" -gix = { version = "0.46", features = ["max-performance"] } -hex = "0.4" +crossbeam-channel = "0.5" indenter = "0.3" # XXX Consider switching from indicatif to status_line: https://docs.rs/status-line/latest/status_line/struct.StatusLine.html indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" +mime = "0.3" noseyparker = { path = "../noseyparker" } prettytable-rs = "0.10" rayon = "1.5" diff --git a/crates/noseyparker-cli/src/bin/noseyparker/args.rs b/crates/noseyparker-cli/src/bin/noseyparker/args.rs index 43ce18b69..6e22f47ad 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/args.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/args.rs @@ -196,7 +196,7 @@ impl GlobalArgs { match self.color { Mode::Never => false, Mode::Always => true, - Mode::Auto => std::io::stdin().is_terminal(), + Mode::Auto => std::io::stdout().is_terminal(), } } @@ -387,6 +387,10 @@ pub struct ScanArgs { #[command(flatten)] pub content_filtering_args: ContentFilteringArgs, + + /// Enable or disable metadata recording for all discovered blobs instead of just those with matches.
+ #[arg(long, default_value_t=false, action=ArgAction::Set, value_name="BOOL")] + pub record_all_blobs: bool, } /// The mode to use for cloning a Git repository @@ -600,6 +604,7 @@ impl std::fmt::Display for OutputFormat { // ----------------------------------------------------------------------------- // report writer // ----------------------------------------------------------------------------- +// FIXME: refactor this to avoid having to implement bogus methods pub trait Reportable { fn human_format<W: std::io::Write>(&self, writer: W) -> Result<()>; fn json_format<W: std::io::Write>(&self, writer: W) -> Result<()>; diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs index 8f3dcab1c..9a9e57691 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_report.rs @@ -2,12 +2,14 @@ use anyhow::{bail, Context, Result}; use indenter::indented; use lazy_static::lazy_static; use noseyparker::rules::Rules; -use serde::{Deserialize, Serialize, Serializer}; +use serde::Serialize; use serde_sarif::sarif; use std::fmt::{Display, Formatter, Write}; +use noseyparker::blob_metadata::BlobMetadata; use noseyparker::bstring_escape::Escaped; use noseyparker::datastore::{Datastore, MatchGroupMetadata}; +use noseyparker::digest::sha1_hexdigest; use noseyparker::match_type::Match; use noseyparker::provenance::Provenance; @@ -21,6 +23,22 @@ pub fn run(_global_args: &GlobalArgs, args: &ReportArgs) -> Result<()> { struct DetailsReporter(Datastore); +impl DetailsReporter { + fn get_matches( + &self, + metadata: &MatchGroupMetadata, + limit: Option<usize>, + ) -> Result<Vec<BlobMetadataMatch>> { + Ok(self + .0 + .get_match_group_data(metadata, limit) + .with_context(|| format!("Failed to get match data for group {metadata:?}"))?
+ .into_iter() + .map(|(md, m)| BlobMetadataMatch { md, m }) + .collect()) + } +} + impl Reportable for DetailsReporter { fn human_format<W: std::io::Write>(&self, mut writer: W) -> Result<()> { let datastore = &self.0; @@ -31,9 +49,7 @@ impl Reportable for DetailsReporter { let num_findings = group_metadata.len(); for (finding_num, metadata) in group_metadata.into_iter().enumerate() { let finding_num = finding_num + 1; - let matches = datastore - .get_match_group_matches(&metadata, Some(3)) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, Some(3))?; let match_group = MatchGroup { metadata, matches }; writeln!( &mut writer, @@ -55,14 +71,11 @@ impl Reportable for DetailsReporter { let es = group_metadata .into_iter() .map(|metadata| { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; Ok(MatchGroup { metadata, matches }) }) .collect::<Result<Vec<_>, anyhow::Error>>()?; - let mut ser = serde_json::Serializer::pretty(writer); - ser.collect_seq(es)?; + serde_json::to_writer_pretty(writer, &es)?; Ok(()) } @@ -73,9 +86,7 @@ impl Reportable for DetailsReporter { .context("Failed to get match group metadata from datastore")?; for metadata in group_metadata.into_iter() { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; let match_group = MatchGroup { metadata, matches }; serde_json::to_writer(&mut writer, &match_group)?; @@ -94,13 +105,11 @@ impl Reportable for DetailsReporter { let results: Vec<sarif::Result> = group_metadata .into_iter() .map(|metadata| { - let matches = datastore - .get_match_group_matches(&metadata, None) - .with_context(|| format!("Failed to get matches for group {metadata:?}"))?; + let matches = self.get_matches(&metadata, None)?; let first_match_blob_id = match matches.first() { - Some(m) => m.blob_id.to_string(), - None => bail!("Failed to get group matches for group {metadata:?}"), + Some(entry) => entry.m.blob_id.to_string(), + None => bail!("Failed to get group match data for group {metadata:?}"), }; let message = sarif::MessageBuilder::default() .text(format!( @@ -119,19 +128,21 @@ impl Reportable for DetailsReporter { // Will store every match location for the runs.results.location array property let locations: Vec<sarif::Location> = matches .into_iter() .map(|BlobMetadataMatch { md, m }| { let source_span = &m.location.source_span; // let offset_span = &m.location.offset_span; let uri = match m.provenance { - Provenance::File { path } => { - path.display().to_string() - } + Provenance::File { path } => path.display().to_string(), // FIXME: using this path is nonsense here - Provenance::GitRepo { path } => { - path.display().to_string() - } + Provenance::GitRepo { path } => path.display().to_string(), }; + let properties = sarif::PropertyBagBuilder::default().additional_properties([ + (String::from("mime_essence"), serde_json::json!(md.mime_essence)), + (String::from("charset"), serde_json::json!(md.charset)), + (String::from("num_bytes"), serde_json::json!(md.num_bytes)), + ]).build()?; + let location = sarif::LocationBuilder::default() .physical_location( sarif::PhysicalLocationBuilder::default() @@ -164,17 +175,14 @@ impl Reportable for DetailsReporter { .logical_locations([sarif::LogicalLocationBuilder::default() .kind("blob")
.name(m.blob_id.to_string()) + .properties(properties) .build()?]) .build()?; Ok(location) }) .collect::<Result<_, _>>()?; - let sha1_fingerprint = { - let mut h = gix_features::hash::Sha1::default(); - h.update(&metadata.match_content); - hex::encode(h.digest()) - }; + let sha1_fingerprint = sha1_hexdigest(&metadata.match_content); // Build the result for the match let result = sarif::ResultBuilder::default() @@ -265,11 +273,19 @@ fn noseyparker_sarif_tool() -> Result<sarif::Tool> { } /// A group of matches that all have the same rule and capture group content -#[derive(Serialize, Deserialize)] +#[derive(Serialize)] struct MatchGroup { #[serde(flatten)] metadata: MatchGroupMetadata, - matches: Vec<Match>, + matches: Vec<BlobMetadataMatch>, +} + +#[derive(Serialize)] +struct BlobMetadataMatch { + #[serde(rename="blob_metadata")] + md: BlobMetadata, + #[serde(flatten)] + m: Match, } lazy_static! { @@ -299,7 +315,6 @@ impl MatchGroup { } } -// XXX this implementation is grotty impl Display for MatchGroup { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { writeln!(f, "{}", STYLE_RULE.apply_to(self.rule_name()))?; @@ -332,20 +347,27 @@ impl Display for MatchGroup { // print matches let mut f = indented(f).with_str(" "); - for (i, m) in self.matches.iter().enumerate() { + for (i, BlobMetadataMatch { md, m }) in self.matches.iter().enumerate() { let i = i + 1; writeln!( f, "{}", STYLE_HEADING.apply_to(format!("Occurrence {}/{}", i, self.total_matches())) )?; + let blob_metadata = + format!("{} bytes, {}, {}", + md.num_bytes(), + md.mime_essence().unwrap_or("unknown type"), + md.charset().unwrap_or("unknown charset"), + ); match &m.provenance { Provenance::File { path } => { writeln!( f, - "{} {}", + "{} {} ({})", STYLE_HEADING.apply_to("File:"), - STYLE_METADATA.apply_to(path.display()) + STYLE_METADATA.apply_to(path.display()), + STYLE_METADATA.apply_to(blob_metadata), )?; } Provenance::GitRepo { path } => { @@ -353,13 +375,14 @@ impl Display for MatchGroup { writeln!( f, "{} {}", STYLE_HEADING.apply_to("Git repo:"), - STYLE_METADATA.apply_to(path.display()) + STYLE_METADATA.apply_to(path.display()), )?; writeln!( f, - "{} {}", + "{} {} ({})", STYLE_HEADING.apply_to("Blob:"), - STYLE_METADATA.apply_to(&m.blob_id) + STYLE_METADATA.apply_to(&m.blob_id), + STYLE_METADATA.apply_to(blob_metadata), )?; } } diff --git a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs index 74debfeff..2fe6f552b 100644 --- a/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs +++ b/crates/noseyparker-cli/src/bin/noseyparker/cmd_scan.rs @@ -2,7 +2,6 @@ use anyhow::{bail, Context, Result}; use indicatif::{HumanBytes, HumanCount, HumanDuration}; use rayon::prelude::*; use std::str::FromStr; -use std::sync::mpsc; use std::sync::Mutex; use std::time::Instant; use tracing::{debug, debug_span, error, info, warn}; @@ -11,6 +10,7 @@ use crate::args; use noseyparker::blob::Blob; use noseyparker::blob_id_set::BlobIdSet; +use noseyparker::blob_metadata::BlobMetadata; use noseyparker::datastore::Datastore; use noseyparker::defaults::DEFAULT_IGNORE_RULES; use noseyparker::git_binary::{CloneMode, Git}; @@ -25,6 +25,7 @@ use noseyparker::progress::Progress; use noseyparker::provenance::Provenance; use noseyparker::rules::Rules; use noseyparker::rules_database::RulesDatabase; +use noseyparker::{content_guesser, content_guesser::Guesser}; /// This command scans multiple filesystem inputs for secrets.
/// The implementation enumerates content in parallel, scans the enumerated content in parallel, @@ -80,8 +81,9 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> ); let mut num_found: u64 = 0; let api_url = args.input_specifier_args.github_api_url.clone(); - for repo_string in github::enumerate_repo_urls(&repo_specifiers, api_url, Some(&mut progress)) - .context("Failed to enumerate GitHub repositories")? + for repo_string in + github::enumerate_repo_urls(&repo_specifiers, api_url, Some(&mut progress)) + .context("Failed to enumerate GitHub repositories")? { match GitUrl::from_str(&repo_string) { Ok(repo_url) => repo_urls.push(repo_url), @@ -123,14 +125,13 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> args::GitCloneMode::Mirror => CloneMode::Mirror, args::GitCloneMode::Bare => CloneMode::Bare, }; - let clones_dir = datastore.clones_dir(); let git = Git::new(); let mut progress = Progress::new_bar(repo_urls.len() as u64, "Fetching Git repos", progress_enabled); for repo_url in repo_urls { - let output_dir = match clone_destination(&clones_dir, &repo_url) { + let output_dir = match datastore.clone_destination(&repo_url) { Err(e) => { progress.suspend(|| { error!("Failed to determine output directory for {repo_url}: {e}"); @@ -267,15 +268,17 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let matcher_stats = Mutex::new(MatcherStats::default()); let seen_blobs = BlobIdSet::new(); - let make_matcher = || -> Result<Matcher> { + let make_matcher = || -> Result<(Matcher, Guesser)> { *num_matchers_counter.lock().unwrap() += 1; - Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats)) + let matcher = Matcher::new(&rules_db, &seen_blobs, Some(&matcher_stats))?; + let guesser = content_guesser::Guesser::new()?; + Ok((matcher, guesser)) }; // a function to convert BlobMatch into regular Match let convert_blob_matches = |blob: &Blob, matches: Vec<BlobMatch>, provenance: Provenance| -> Vec<Match> { - assert!(!matches.is_empty()); + // assert!(!matches.is_empty()); let loc_mapping = { match matches .iter() @@ -297,48 +300,169 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let mut progress = Progress::new_bytes_bar(total_blob_bytes, "Scanning content", progress_enabled); - // Create a channel pair so that matcher threads can get their results to the database - // recorder. - let (send_matches, recv_matches) = mpsc::sync_channel::<Vec<Match>>(512); + // Create a channel pair for matcher threads to get their results to the datastore recorder. + // let channel_size = std::cmp::max(args.num_jobs * 32, 1024); + type DatastoreMessage = (BlobMetadata, Vec<Match>); + // let (send_ds, recv_ds) = crossbeam_channel::bounded::<DatastoreMessage>(channel_size); + let (send_ds, recv_ds) = crossbeam_channel::unbounded::<DatastoreMessage>(); - // We create a separate thread for writing matches to the database. - // The database uses SQLite, which does best with a single writer. - let match_writer = { + // We create a separate thread for writing matches to the datastore. + // The datastore uses SQLite, which does best with a single writer.
+ let datastore_writer = { std::thread::Builder::new() .name("Datastore Writer".to_string()) .spawn(move || { let mut num_matches = 0u64; let mut num_added = 0usize; + // keep reading until all the senders hang up; panic if recording matches fails - while let Ok(matches) = recv_matches.recv() { - num_matches += matches.len() as u64; + // + // accumulate messages in batches to avoid an excessive number of tiny datastore + // transactions (which kills performance) + + let mut last_tx_time = std::time::Instant::now(); + + const BUF_SIZE: usize = 16384; + let mut batch_matches: Vec<Vec<Match>> = Vec::with_capacity(BUF_SIZE); + let mut batch_matches_count: usize = 0; + let mut batch_metadata: Vec<BlobMetadata> = Vec::with_capacity(BUF_SIZE); + + // Try to commit at least every second + const COMMIT_INTERVAL: std::time::Duration = std::time::Duration::from_millis(1000); + + for (metadata, matches) in recv_ds.iter() { + batch_matches_count += matches.len(); + batch_matches.push(matches); + + batch_metadata.push(metadata); + + if batch_matches_count >= BUF_SIZE + || batch_metadata.len() >= BUF_SIZE + || last_tx_time.elapsed() >= COMMIT_INTERVAL + { + let mut committed = false; + if batch_matches_count > 0 { + // let t1 = std::time::Instant::now(); + num_matches += batch_matches_count as u64; + num_added += datastore + .record_matches(batch_matches.iter().flatten()) + .expect("should be able to record matches to the datastore"); + // debug!("*** commit matches: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); + batch_matches.clear(); + batch_matches_count = 0; + committed = true; + } + + if !batch_metadata.is_empty() { + // let t1 = std::time::Instant::now(); + datastore + .record_blob_metadata(&batch_metadata) + .expect("should be able to record blob metadata to the datastore"); + // debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); + batch_metadata.clear(); + committed = true; + } + + if committed { + last_tx_time = std::time::Instant::now(); + } + } + } + + // record any remaining batched up items + if !batch_matches.is_empty() { + // let t1 = std::time::Instant::now(); + num_matches += batch_matches_count as u64; num_added += datastore - .record_matches(&matches) - .expect("should be able to record matches to the database"); + .record_matches(batch_matches.iter().flatten()) + .expect("should be able to record matches to the datastore"); + // debug!("*** commit matches: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_matches_count, recv_ds.len()); + batch_matches.clear(); + // batch_matches_count = 0; } + + if !batch_metadata.is_empty() { + // let t1 = std::time::Instant::now(); + datastore + .record_blob_metadata(&batch_metadata) + .expect("should be able to record blob metadata to the datastore"); + // debug!("*** commit metadata: {:.3}s {} {}", t1.elapsed().as_secs_f64(), batch_metadata.len(), recv_ds.len()); + batch_metadata.clear(); + } + datastore .analyze() - .expect("should be able to analyze the database"); + .expect("should be able to analyze the datastore"); // FIXME: `num_added` is not computed correctly (datastore, num_matches, num_added as u64) }) .expect("should be able to start datastore writer thread") }; + let run_matcher = |matcher_guesser: &mut (Matcher, Guesser), + provenance: Provenance, + blob: Blob| + -> Result<()> { + #[allow(unused_variables)] + let (matcher, guesser) = matcher_guesser; + + let matches = match matcher.scan_blob(&blob, &provenance) { + Err(e) => { + error!("Failed to scan blob {} from {}:
{}", blob.id, provenance, e); + return Ok(()); + } + Ok(v) => v, + }; + + if matches.is_empty() && !args.record_all_blobs { + return Ok(()); + } + + let (mime_essence, charset) = { + let input = match &provenance { + Provenance::File { path } => { + content_guesser::Input::from_path_and_bytes(path, &blob.bytes) + } + Provenance::GitRepo { .. } => content_guesser::Input::from_bytes(&blob.bytes), + }; + let guess = guesser.guess(input); + match guess.best_guess() { + None => (None, None), + Some(m) => { + let essence = m.essence_str().to_owned(); + let charset = m.get_param(mime::CHARSET).map(|n| n.to_string()); + (Some(essence), charset) + } + } + }; + + let metadata = BlobMetadata { + id: blob.id, + num_bytes: blob.len(), + mime_essence, + charset, + }; + let matches = convert_blob_matches(&blob, matches, provenance); + send_ds.send((metadata, matches))?; + + Ok(()) + }; + // --------------------------------------------------------------------------------------------- // Scan plain files // --------------------------------------------------------------------------------------------- - inputs.files.par_iter().for_each_init( + inputs.files.par_iter().try_for_each_init( || { let matcher = make_matcher().expect("should be able to create a matcher"); + (matcher, progress.clone()) }, - |(matcher, progress), file_result: &FileResult| { + |(matcher, progress), file_result: &FileResult| -> Result<()> { let fname = &file_result.path; let blob = match Blob::from_file(fname) { Err(e) => { error!("Failed to load blob from {}: {}", fname.display(), e); - return; + return Ok(()); } Ok(v) => v, }; @@ -346,106 +470,82 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> let provenance = Provenance::File { path: fname.clone(), }; - let matches = match matcher.scan_blob(&blob, &provenance) { - Err(e) => { - error!("Failed to scan blob from {}: {}", fname.display(), e); - return; - } - Ok(v) => v, - }; - if matches.is_empty() { - return; - } - let matches = convert_blob_matches(&blob, matches, provenance); - send_matches - .send(matches) - .expect("should be able to send all matches"); + + run_matcher(matcher, provenance, blob)?; + + Ok(()) }, - ); + )?; // --------------------------------------------------------------------------------------------- // Scan Git repo inputs // --------------------------------------------------------------------------------------------- - inputs.git_repos.par_iter().for_each(|git_repo_result| { - let repository = match open_git_repo(&git_repo_result.path) { - Ok(Some(repository)) => repository.into_sync(), - Ok(None) => { - error!( - "Failed to re-open previously-found repository at {}", - git_repo_result.path.display() - ); - return; - } - Err(err) => { - error!( - "Failed to re-open previously-found repository at {}: {err}", - git_repo_result.path.display() - ); - return; - } - }; - - git_repo_result.blobs.par_iter().for_each_init( - || { - let matcher = make_matcher().expect("should be able to create a matcher"); - let repo = repository.to_thread_local(); - (repo, matcher, progress.clone()) - }, - |(repo, matcher, progress), (blob_id, size)| { - progress.inc(*size); - let path = &git_repo_result.path; - // debug!("Scanning {} size {} from {:?}", oid, size, path); - - // Check for duplicates before even loading the entire blob contents - if seen_blobs.contains(blob_id) { - return; + inputs + .git_repos + .par_iter() + .try_for_each(|git_repo_result| -> Result<()> { + let repository = match open_git_repo(&git_repo_result.path) { + 
Ok(Some(repository)) => repository.into_sync(), + Ok(None) => { + error!( + "Failed to re-open previously-found repository at {}", + git_repo_result.path.display() + ); + return Ok(()); } - let blob = match repo.find_object(gix::hash::ObjectId::from(blob_id.as_bytes())) { - Err(e) => { - error!( - "Failed to read blob {} from Git repository at {}: {}", - blob_id, - path.display(), - e - ); - return; - } - // FIXME: get rid of this extra copy - Ok(blob) => Blob::new(*blob_id, blob.data.to_owned()), - }; - let provenance = Provenance::GitRepo { - path: path.to_path_buf(), - }; - match matcher.scan_blob(&blob, &provenance) { - Err(e) => { - error!( - "Failed to scan blob {} from Git repository at {}: {}", - blob_id, - path.display(), - e - ); - } - Ok(matches) => { - if matches.is_empty() { - return; - } - let matches = convert_blob_matches(&blob, matches, provenance); - send_matches - .send(matches) - .expect("should be able to send all matches"); - } + Err(err) => { + error!( + "Failed to re-open previously-found repository at {}: {err}", + git_repo_result.path.display() + ); + return Ok(()); } - }, - ); - }); + }; + + git_repo_result.blobs.par_iter().try_for_each_init( + || { + let repo = repository.to_thread_local(); + let matcher = make_matcher().expect("should be able to create a matcher"); + (repo, matcher, progress.clone()) + }, + |(repo, matcher, progress), (blob_id, size)| -> Result<()> { + progress.inc(*size); + let path = &git_repo_result.path; + // debug!("Scanning {} size {} from {:?}", oid, size, path); + + let blob = match repo.find_object(blob_id) { + Err(e) => { + error!( + "Failed to read blob {} from Git repository at {}: {}", + blob_id, + path.display(), + e + ); + return Ok(()); + } + Ok(mut blob) => { + let data = std::mem::take(&mut blob.data); // avoid a copy + Blob::new(*blob_id, data) + } + }; + let provenance = Provenance::GitRepo { + path: path.to_path_buf(), + }; + + run_matcher(matcher, provenance, blob)?; + + Ok(()) + }, + ) + })?; // --------------------------------------------------------------------------------------------- // Wait for all inputs to be scanned and the database thread to finish // --------------------------------------------------------------------------------------------- // Get rid of the reference to the sending channel after starting the scanners, // to ensure things terminate as expected. - drop(send_matches); - let (datastore, num_matches, num_new_matches) = match_writer.join().unwrap(); + drop(send_ds); + let (datastore, num_matches, num_new_matches) = datastore_writer.join().unwrap(); progress.finish(); // --------------------------------------------------------------------------------------------- @@ -503,39 +603,3 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()> Ok(()) } - -/// Get a path for a local clone of the given git URL underneath `root`. -fn clone_destination(root: &std::path::Path, repo: &GitUrl) -> Result { - Ok(root.join(repo.to_path_buf())) -} - -#[cfg(test)] -mod test { - macro_rules! 
clone_destination_success_tests { - ($($case_name:ident: ($root:expr, $repo:expr) => $expected:expr,)*) => { - mod clone_destination { - use noseyparker::git_url::GitUrl; - use pretty_assertions::assert_eq; - use std::path::{PathBuf, Path}; - use std::str::FromStr; - use super::super::clone_destination; - - $( - #[test] - fn $case_name() { - let expected: Option = Some(Path::new($expected).to_owned()); - - let root = Path::new($root); - let repo = GitUrl::from_str($repo).expect("repo should be a URL"); - assert_eq!(clone_destination(root, &repo).ok(), expected); - } - )* - } - } - } - - clone_destination_success_tests! { - https_01: ("rel_root", "https://example.com/testrepo.git") => "rel_root/https/example.com/testrepo.git", - https_02: ("/abs_root", "https://example.com/testrepo.git") => "/abs_root/https/example.com/testrepo.git", - } -} diff --git a/crates/noseyparker-cli/tests/help/mod.rs b/crates/noseyparker-cli/tests/help/mod.rs index 7d7536930..15f467fe4 100644 --- a/crates/noseyparker-cli/tests/help/mod.rs +++ b/crates/noseyparker-cli/tests/help/mod.rs @@ -99,7 +99,7 @@ fn version_short() { fn version_long() { with_settings!({ filters => vec![ - (r"(?m)^( [^:]+:\s+).+$", r"$1") + (r"(?m)^( [^:]+:[ \t]+).*$", r"$1") ], }, { assert_cmd_snapshot!(noseyparker_success!("--version")); diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap index 7da6d74cf..f3650b84c 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan-2.snap @@ -129,6 +129,13 @@ Content Filtering Options: This option can be repeated. + --record-all-blobs + Enable or disable metadata recording for all discovered blobs instead of just those with + matches + + [default: false] + [possible values: true, false] + Global Options: -v, --verbose... Enable verbose output diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap index 40eee5d3a..64296c2e2 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__help_scan_short-2.snap @@ -29,6 +29,9 @@ Input Specifier Options: Content Filtering Options: --max-file-size Do not scan files larger than the specified size [default: 100] -i, --ignore Use custom path-based ignore rules from the specified file + --record-all-blobs Enable or disable metadata recording for all discovered blobs + instead of just those with matches [default: false] [possible + values: true, false] Global Options: -v, --verbose... 
Enable verbose output diff --git a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap index d634dc078..c8255f891 100644 --- a/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap +++ b/crates/noseyparker-cli/tests/help/snapshots/test_noseyparker__help__version_long-2.snap @@ -1,5 +1,5 @@ --- -source: crates/noseyparker-cli/tests/test_noseyparker_help.rs +source: crates/noseyparker-cli/tests/help/mod.rs expression: stdout --- noseyparker 0.13.0-dev @@ -12,8 +12,8 @@ Build Configuration: Commit Branch: Commit SHA: - Cargo Features: - + Cargo Features: + Debug: Optimization: Target Triple: diff --git a/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap b/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap index f84f90d11..70ad45797 100644 --- a/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap +++ b/crates/noseyparker-cli/tests/scan/basic/snapshots/test_noseyparker__scan__basic__scan_secrets1-7.snap @@ -1,5 +1,5 @@ --- -source: tests/test_noseyparker_scan.rs +source: crates/noseyparker-cli/tests/scan/basic/mod.rs expression: json_output --- [ @@ -8,6 +8,12 @@ expression: json_output "matches": [ { "blob_id": "7980f2571d9c04d65eb338f65f21edbff4469a11", + "blob_metadata": { + "charset": null, + "id": "7980f2571d9c04d65eb338f65f21edbff4469a11", + "mime_essence": "text/plain", + "num_bytes": 81 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker-cli/tests/scan/mod.rs b/crates/noseyparker-cli/tests/scan/mod.rs index 46d86996b..18019d923 100644 --- a/crates/noseyparker-cli/tests/scan/mod.rs +++ b/crates/noseyparker-cli/tests/scan/mod.rs @@ -13,3 +13,4 @@ mod snippet_length; // TODO: add test for scanning with `--git-clone-mode bare` and `--git-clone-mode mirror` // TODO: add test for scanning with `--github-api-url` // TODO: add tests for SARIF output format +// TODO: add tests for blob metadata recording diff --git a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap index 27b15a3a1..3493e3abc 100644 --- a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap +++ b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-14.snap @@ -8,6 +8,12 @@ expression: json_output "matches": [ { "blob_id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "blob_metadata": { + "charset": null, + "id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "mime_essence": "text/plain", + "num_bytes": 1425 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap index 13a9d11fd..b885d4ec1 100644 --- 
a/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap +++ b/crates/noseyparker-cli/tests/scan/snippet_length/snapshots/test_noseyparker__scan__snippet_length__scan_changing_snippet_length-7.snap @@ -8,6 +8,12 @@ expression: json_output "matches": [ { "blob_id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "blob_metadata": { + "charset": null, + "id": "c3c55e6f7e1304573e25e85202e9f019bfc05087", + "mime_essence": "text/plain", + "num_bytes": 1425 + }, "capture_group_index": 1, "location": { "offset_span": { diff --git a/crates/noseyparker/Cargo.toml b/crates/noseyparker/Cargo.toml index 4c157b335..ecbef4e56 100644 --- a/crates/noseyparker/Cargo.toml +++ b/crates/noseyparker/Cargo.toml @@ -17,6 +17,7 @@ path = "src/lib.rs" [features] rule_profiling = [] +libmagic = ["content_guesser/libmagic"] [dependencies] # anyhow = { version = "1.0", features = ["backtrace"] } # add backtraces to errors -- not sure how expensive this is @@ -25,8 +26,8 @@ atoi = "2.0" bstr = { version = "1.0", features = ["serde"] } chrono = { version = "0.4", default_features = false, features = ["std"] } console = "0.15" -gix-features = "0.30" -gix = { version = "0.46", features = ["max-performance"] } +content_guesser = { path = "../content-guesser" } +gix = { version = "0.47", features = ["max-performance"] } hex = "0.4" hyperx = "1.4" include_dir = { version = "0.7", features = ["glob"] } @@ -35,12 +36,14 @@ indicatif = { version = "0.17", features = ["improved_unicode", "rayon"] } indoc = "2.0" ignore = "0.4" lazy_static = "1.4" +mime = "0.3" regex = "1.7" reqwest = { version = "0.11", features = ["json", "native-tls-vendored"] } rusqlite = { version = "0.29", features = ["bundled", "backup"] } secrecy = "0.8.0" serde = { version = "1.0", features = ["derive"] } serde_yaml = "0.9" +thiserror = "1" tokio = "1.23" tracing = "0.1" url = "2.3" diff --git a/crates/noseyparker/src/blob_id.rs b/crates/noseyparker/src/blob_id.rs index 206feeec8..dec8f65ca 100644 --- a/crates/noseyparker/src/blob_id.rs +++ b/crates/noseyparker/src/blob_id.rs @@ -12,7 +12,7 @@ impl BlobId { /// Create a new BlobId computed from the given input. #[inline] pub fn new(input: &[u8]) -> Self { - use gix_features::hash::Sha1; + use crate::digest::Sha1; use std::io::Write; // XXX implement a Write instance for `Sha1`, in an attempt to avoid allocations for @@ -82,7 +82,13 @@ impl<'a> From<&'a gix::ObjectId> for BlobId { .expect("oid should be a 20-byte value"), ) } - } +} + +impl<'a> From<&'a BlobId> for gix::ObjectId { + fn from(blob_id: &'a BlobId) -> Self { + gix::hash::ObjectId::from(blob_id.as_bytes()) + } +} // ------------------------------------------------------------------------------------------------- // test diff --git a/crates/noseyparker/src/blob_metadata.rs b/crates/noseyparker/src/blob_metadata.rs new file mode 100644 index 000000000..1e781764a --- /dev/null +++ b/crates/noseyparker/src/blob_metadata.rs @@ -0,0 +1,35 @@ +use crate::blob_id::BlobId; + +/// Metadata about a blob +#[derive(Debug, serde::Deserialize, serde::Serialize)] +pub struct BlobMetadata { + /// The blob ID this metadata applies to + pub id: BlobId, + + /// The length in bytes of the blob + pub num_bytes: usize, + + /// The guessed multimedia type of the blob + pub mime_essence: Option, + + /// The guessed charset of the blob + pub charset: Option, +} + +impl BlobMetadata { + /// Get the length of the blob in bytes. 
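
Editor's note: the `blob_metadata` object now appearing in JSON report output (see the snapshot updates above) is this struct serialized with serde. A minimal sketch of the shape, as my illustration only: it assumes the optional fields are `Option<String>`, uses a plain `String` in place of `BlobId`, and serializes with `serde_json` directly.

    use serde::Serialize;

    #[derive(Serialize)]
    struct BlobMetadata {
        id: String,
        num_bytes: usize,
        mime_essence: Option<String>,
        charset: Option<String>,
    }

    fn main() {
        let md = BlobMetadata {
            id: "7980f2571d9c04d65eb338f65f21edbff4469a11".into(),
            num_bytes: 81,
            mime_essence: Some("text/plain".into()),
            charset: None,
        };
        // Prints: {"id":"7980f...","num_bytes":81,"mime_essence":"text/plain","charset":null}
        println!("{}", serde_json::to_string(&md).unwrap());
    }
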
+ #[inline] + pub fn num_bytes(&self) -> usize { + self.num_bytes + } + + #[inline] + pub fn mime_essence(&self) -> Option<&str> { + self.mime_essence.as_deref() + } + + #[inline] + pub fn charset(&self) -> Option<&str> { + self.charset.as_deref() + } +} diff --git a/crates/noseyparker/src/datastore.rs b/crates/noseyparker/src/datastore.rs index 7e226b3dd..7eb5bf9fb 100644 --- a/crates/noseyparker/src/datastore.rs +++ b/crates/noseyparker/src/datastore.rs @@ -2,11 +2,13 @@ use anyhow::{bail, Context, Result}; use bstr::BString; use indoc::indoc; use rusqlite::Connection; -use serde::{Deserialize, Serialize}; +use serde::Serialize; use std::path::{Path, PathBuf}; use tracing::{debug, debug_span}; use crate::blob_id::BlobId; +use crate::blob_metadata::BlobMetadata; +use crate::git_url::GitUrl; use crate::location::{Location, OffsetSpan, SourcePoint, SourceSpan}; use crate::match_type::Match; use crate::provenance::Provenance; @@ -15,6 +17,21 @@ use crate::snippet::Snippet; // ------------------------------------------------------------------------------------------------- // Datastore // ------------------------------------------------------------------------------------------------- + +/// The source of truth for Nosey Parker findings and runtime state. +/// +/// A `Datastore` resides on disk as a directory, and stores a number of things: +/// +/// - A sqlite database for recording findings and scan information +/// - A scratch directory for providing temporary directories and files +/// - A directory used for storing clones of Git repositories +/// +/// Note that a `Datastore` is not `Sync`, and thus cannot be directly shared between threads. +/// The recommended pattern in a case that requires concurrent access is to have a single thread +/// that mediates access to the `Datastore`. +/// +/// Accessing a single `Datastore` from multiple processes is untested and may not work correctly. +/// This implementation has not built-in mechanism to check for or prevent multi-process access. pub struct Datastore { /// The root directory of everything contained in this `Datastore`. root_dir: PathBuf, @@ -23,6 +40,7 @@ pub struct Datastore { conn: Connection, } +// Public implementation impl Datastore { /// Create a new datastore at `root_dir` if one does not exist, /// or open an existing one if present. @@ -92,101 +110,46 @@ impl Datastore { self.root_dir.join("clones") } - fn new_connection(path: &Path) -> Result { - let conn = Connection::open(path)?; - - conn.pragma_update(None, "journal_mode", "wal")?; // https://www.sqlite.org/wal.html - conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html - conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous - // - let limit: i64 = -512 * 1024; // 512MiB limit - conn.pragma_update(None, "cache_size", limit)?; // https://sqlite.org/pragma.html#pragma_cache_size - - Ok(conn) + /// Get a path for a local clone of the given git URL within this datastore's clones directory. 
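
Editor's note: because a `Datastore` is not `Sync`, the doc comment above recommends funneling all writes through a single mediating thread. This is a minimal sketch of that pattern with a standard-library channel; it is my illustration, not the project's actual plumbing.

    use std::sync::mpsc;
    use std::thread;

    // Stand-in for the sqlite-backed, non-Sync datastore.
    struct Store {
        rows: Vec<String>,
    }

    impl Store {
        fn record(&mut self, batch: Vec<String>) {
            self.rows.extend(batch);
        }
    }

    fn main() {
        let (send, recv) = mpsc::channel::<Vec<String>>();

        // The single writer thread owns the datastore; nothing else touches it.
        let writer = thread::spawn(move || {
            let mut store = Store { rows: Vec::new() };
            for batch in recv {
                store.record(batch);
            }
            store.rows.len() // the loop ends once every sender has been dropped
        });

        // Scanner threads only ever hold a cheap channel sender.
        let scanners: Vec<_> = (0..4)
            .map(|i| {
                let send = send.clone();
                thread::spawn(move || send.send(vec![format!("match-{i}")]).unwrap())
            })
            .collect();
        for s in scanners {
            s.join().unwrap();
        }

        drop(send); // analogous to dropping the sending channel above: lets the writer terminate
        println!("recorded {} rows", writer.join().unwrap());
    }
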
+ pub fn clone_destination(&self, repo: &GitUrl) -> Result { + clone_destination(&self.clones_dir(), repo) } - fn migrate(&mut self) -> Result { - let _span = debug_span!("Datastore::migrate", "{}", self.root_dir.display()).entered(); - let tx = self.conn.transaction()?; - - let get_user_version = || -> Result { - let user_version = tx.pragma_query_value(None, "user_version", |r| r.get(0))?; - Ok(user_version) - }; - - let set_user_version = |user_version: u64| -> Result<()> { - tx.pragma_update(None, "user_version", user_version)?; - Ok(()) - }; - - let user_version: u64 = get_user_version()?; - if user_version == 0 { - let new_user_version = user_version + 1; - debug!( - "Migrating database schema from version {} to {}", - user_version, new_user_version - ); - tx.execute_batch(indoc! {r#" - create table matches - -- This table is a fully denormalized representation of the matches found from - -- scanning. - -- - -- See the `noseyparker::match::Match` type for correspondence. - -- - -- Eventually we should refine the database schema, normalizing where appropriate. - -- Doing so could allow for better write performance and smaller databases. - ( - blob_id text not null, - - start_byte integer not null, - end_byte integer not null, - - start_line integer not null, - start_column integer not null, - - end_line integer not null, - end_column integer not null, - - before_snippet blob not null, - matching_input blob not null, - after_snippet blob not null, - - group_index integer not null, - group_input blob not null, - - rule_name text not null, - - provenance_type text not null, - provenance blob not null, + /// Analyze the datastore's sqlite database, potentially allowing for better query planning + pub fn analyze(&self) -> Result<()> { + self.conn.execute("analyze", [])?; + // self.conn.execute("pragma wal_checkpoint(truncate)", [])?; + Ok(()) + } - -- NOTE: We really want this entire table to have unique values. - -- But checking just these fields ought to be sufficient to ensure that; - -- the remaining fields are either derived from these or are not relevant - -- to match deduping (like provenance). - -- Checking fewer fields should be cheaper than checking _all_ fields. - unique ( - blob_id, - start_byte, - end_byte, - group_index, - rule_name - ) - ); + /// Record the given blob metadata into the datastore. + /// + /// The given entries are recorded in a single transaction. + pub fn record_blob_metadata<'a, T: IntoIterator>( + &mut self, + blob_metadata: T, + ) -> Result<()> { + let _span = debug_span!("Datastore::record_blob_metadata", "{}", self.root_dir.display()).entered(); - -- An index to allow quick grouping of equivalent matches - create index matches_grouping_index on matches (group_input, rule_name); + let tx = self.conn.transaction()?; + { + let mut stmt = tx.prepare_cached(indoc! {r#" + insert or replace into blob_metadata(blob_id, size, mime_essence, charset) + values (?, ?, ?, ?) "#})?; - set_user_version(new_user_version)?; - tx.commit()?; + + for md in blob_metadata { + stmt.execute((&md.id.hex(), md.num_bytes(), md.mime_essence(), md.charset()))?; + } } - Ok(user_version) - } - pub fn analyze(&self) -> Result<()> { - self.conn.execute("analyze", [])?; + tx.commit()?; Ok(()) } + /// Record the given matches into the datastore. + /// + /// The given entries are recorded in a single transaction. pub fn record_matches<'a, T: IntoIterator>( &mut self, matches: T, @@ -245,6 +208,7 @@ impl Datastore { Ok(num_changed) } + /// Summarize all recorded findings. 
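
Editor's note: `record_blob_metadata` above (like `record_matches`) writes an entire batch inside one transaction through a cached prepared statement. A self-contained sketch of that shape against an in-memory database, as my example; it assumes only the `rusqlite` crate and uses made-up blob ids.

    use rusqlite::{Connection, Result};

    fn main() -> Result<()> {
        let mut conn = Connection::open_in_memory()?;
        conn.execute_batch(
            "create table blob_metadata (blob_id text primary key, size integer not null);",
        )?;

        let batch = vec![("aa01".to_string(), 81_i64), ("bb02".to_string(), 1425)];

        let tx = conn.transaction()?;
        {
            // Prepared once, executed once per row; `insert or replace` makes re-recording idempotent.
            let mut stmt = tx.prepare_cached(
                "insert or replace into blob_metadata(blob_id, size) values (?, ?)",
            )?;
            for (blob_id, size) in &batch {
                stmt.execute((blob_id, size))?;
            }
        } // the statement borrows `tx`, so it must be dropped before commit
        tx.commit()?;
        Ok(())
    }
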
pub fn summarize(&self) -> Result { let _span = debug_span!("Datastore::summarize", "{}", self.root_dir.display()).entered(); @@ -272,10 +236,12 @@ impl Datastore { Ok(MatchSummary(es)) } + /// Get the root directory that contains this `Datastore`. pub fn root_dir(&self) -> &Path { &self.root_dir } + /// Get metadata for all groups of identical matches recorded within this `Datastore`. pub fn get_match_group_metadata(&self) -> Result> { let _span = debug_span!("Datastore::get_match_group_metadata", "{}", self.root_dir.display()).entered(); @@ -300,31 +266,37 @@ impl Datastore { Ok(es) } - pub fn get_match_group_matches( + /// Get up to `limit` matches that belong to the group with the given group metadata. + pub fn get_match_group_data( &self, metadata: &MatchGroupMetadata, limit: Option, - ) -> Result> { - let _span = debug_span!("Datastore::match_groups", "{}", self.root_dir.display()).entered(); + ) -> Result> { + let _span = debug_span!("Datastore::get_match_group_data", "{}", self.root_dir.display()).entered(); let mut stmt = self.conn.prepare_cached(indoc! {r#" select - blob_id, - start_byte, - end_byte, - start_line, - start_column, - end_line, - end_column, - before_snippet, - matching_input, - after_snippet, - group_index, - provenance_type, - provenance - from matches - where rule_name = ? and group_input = ? - order by blob_id, start_byte, end_byte + m.blob_id, + m.start_byte, + m.end_byte, + m.start_line, + m.start_column, + m.end_line, + m.end_column, + m.before_snippet, + m.matching_input, + m.after_snippet, + m.group_index, + m.provenance_type, + m.provenance, + + b.size, + b.mime_essence, + b.charset + from matches m + inner join blob_metadata b on (m.blob_id = b.blob_id) + where m.rule_name = ? and m.group_input = ? + order by m.blob_id, m.start_byte, m.end_byte limit ? "#})?; @@ -334,8 +306,9 @@ impl Datastore { }; let entries = stmt.query_map((&metadata.rule_name, metadata.match_content.as_slice(), limit), |row| { let v0: String = row.get(0)?; - Ok(Match { - blob_id: BlobId::from_hex(&v0).expect("blob id from database should be valid"), + let blob_id = BlobId::from_hex(&v0).expect("blob id from database should be valid"); + let m = Match { + blob_id, location: Location { offset_span: OffsetSpan { start: row.get(1)?, @@ -362,7 +335,16 @@ impl Datastore { rule_name: metadata.rule_name.clone(), provenance: provenance_from_parts(row.get(11)?, row.get(12)?) 
.expect("provenance value from database should be valid"), - }) + }; + let mime_essence: Option = row.get(14)?; + let charset: Option = row.get(15)?; + let b = BlobMetadata { + id: blob_id, + num_bytes: row.get(13)?, + mime_essence, + charset, + }; + Ok((b, m)) })?; let mut es = Vec::new(); for e in entries { @@ -372,6 +354,138 @@ impl Datastore { } } + +// Private implementation +impl Datastore { + fn new_connection(path: &Path) -> Result { + let conn = Connection::open(path)?; + + conn.pragma_update(None, "journal_mode", "wal")?; // https://www.sqlite.org/wal.html + conn.pragma_update(None, "foreign_keys", "on")?; // https://sqlite.org/foreignkeys.html + conn.pragma_update(None, "synchronous", "normal")?; // https://sqlite.org/pragma.html#pragma_synchronous + + // FIXME: make this a command-line parameter + let limit: i64 = -8 * 1024 * 1024; // 8GiB limit + conn.pragma_update(None, "cache_size", limit)?; // https://sqlite.org/pragma.html#pragma_cache_size + + Ok(conn) + } + + fn migrate(&mut self) -> Result<()> { + let _span = debug_span!("Datastore::migrate", "{}", self.root_dir.display()).entered(); + let tx = self.conn.transaction()?; + + let get_user_version = || -> Result { + let user_version = tx.pragma_query_value(None, "user_version", |r| r.get(0))?; + Ok(user_version) + }; + + let set_user_version = |user_version: u64| -> Result<()> { + tx.pragma_update(None, "user_version", user_version)?; + Ok(()) + }; + + // ----------------------------------------------------------------------------------------- + // migration 1 + // ----------------------------------------------------------------------------------------- + let user_version: u64 = get_user_version()?; + if user_version == 0 { + let new_user_version = user_version + 1; + debug!( + "Migrating database schema from version {} to {}", + user_version, new_user_version + ); + tx.execute_batch(indoc! {r#" + create table matches + -- This table is a fully denormalized representation of the matches found from + -- scanning. + -- + -- See the `noseyparker::match::Match` type for correspondence. + -- + -- Eventually we should refine the database schema, normalizing where appropriate. + -- Doing so could allow for better write performance and smaller databases. + ( + blob_id text not null, + + start_byte integer not null, + end_byte integer not null, + + start_line integer not null, + start_column integer not null, + + end_line integer not null, + end_column integer not null, + + before_snippet blob not null, + matching_input blob not null, + after_snippet blob not null, + + group_index integer not null, + group_input blob not null, + + rule_name text not null, + + provenance_type text not null, + provenance blob not null, + + -- NOTE: We really want this entire table to have unique values. + -- But checking just these fields ought to be sufficient to ensure that; + -- the remaining fields are either derived from these or are not relevant + -- to match deduping (like provenance). + -- Checking fewer fields should be cheaper than checking _all_ fields. 
+ unique ( + blob_id, + start_byte, + end_byte, + group_index, + rule_name + ) + ); + + -- An index to allow quick grouping of equivalent matches + create index matches_grouping_index on matches (group_input, rule_name); + "#})?; + set_user_version(new_user_version)?; + } + + // ----------------------------------------------------------------------------------------- + // migration 2 + // ----------------------------------------------------------------------------------------- + let user_version: u64 = get_user_version()?; + if user_version == 1 { + let new_user_version = user_version + 1; + debug!( + "Migrating database schema from version {} to {}", + user_version, new_user_version + ); + + tx.execute_batch(indoc! {r#" + create table blob_metadata + -- This table records various bits of metadata about blobs. + ( + blob_id text primary key, + size integer not null, + mime_essence text, + charset text, + + constraint valid_blob_id check( + length(blob_id) == 40 and not glob('*[^abcdefABCDEF1234567890]*', blob_id) + ), + constraint valid_size check(0 <= size) + ); + "#})?; + set_user_version(new_user_version)?; + } + + tx.commit()?; + Ok(()) + } +} + + +// ------------------------------------------------------------------------------------------------- +// Implementation Utilities +// ------------------------------------------------------------------------------------------------- fn provenance_from_parts(tag: String, path: String) -> Result { match tag.as_str() { "git" => Ok(Provenance::GitRepo { @@ -384,14 +498,52 @@ fn provenance_from_parts(tag: String, path: String) -> Result { } } + +/// Get a path for a local clone of the given git URL underneath `root`. +fn clone_destination(root: &std::path::Path, repo: &GitUrl) -> Result { + Ok(root.join(repo.to_path_buf())) +} + +#[cfg(test)] +mod test { + macro_rules! clone_destination_success_tests { + ($($case_name:ident: ($root:expr, $repo:expr) => $expected:expr,)*) => { + mod clone_destination { + use crate::git_url::GitUrl; + use pretty_assertions::assert_eq; + use std::path::{PathBuf, Path}; + use std::str::FromStr; + use super::super::clone_destination; + + $( + #[test] + fn $case_name() { + let expected: Option = Some(Path::new($expected).to_owned()); + + let root = Path::new($root); + let repo = GitUrl::from_str($repo).expect("repo should be a URL"); + assert_eq!(clone_destination(root, &repo).ok(), expected); + } + )* + } + } + } + + clone_destination_success_tests! { + https_01: ("rel_root", "https://example.com/testrepo.git") => "rel_root/https/example.com/testrepo.git", + https_02: ("/abs_root", "https://example.com/testrepo.git") => "/abs_root/https/example.com/testrepo.git", + } +} + // ------------------------------------------------------------------------------------------------- // MatchSummary // ------------------------------------------------------------------------------------------------- -/// A summary of matches in a `Datastore` -#[derive(Deserialize, Serialize)] + +/// A summary of matches in a `Datastore`. 
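
Editor's note: the `migrate` implementation above keys each schema change off sqlite's `user_version` pragma and runs everything in a single transaction, so a failed migration leaves the datastore untouched; migration 2 is what adds the new `blob_metadata` table. A stripped-down sketch of the same scheme, as my illustration assuming `rusqlite`:

    use rusqlite::{Connection, Result};

    fn migrate(conn: &mut Connection) -> Result<()> {
        let tx = conn.transaction()?;

        let version: i64 = tx.pragma_query_value(None, "user_version", |row| row.get(0))?;

        if version < 1 {
            // migration 1: initial schema
            tx.execute_batch("create table matches (blob_id text not null);")?;
            tx.pragma_update(None, "user_version", 1)?;
        }
        if version < 2 {
            // migration 2: add per-blob metadata
            tx.execute_batch(
                "create table blob_metadata (blob_id text primary key, size integer not null);",
            )?;
            tx.pragma_update(None, "user_version", 2)?;
        }

        // Either every applicable migration commits, or none of them do.
        tx.commit()
    }

    fn main() -> Result<()> {
        let mut conn = Connection::open_in_memory()?;
        migrate(&mut conn)?; // fresh database: applies migrations 1 and 2
        migrate(&mut conn)?; // already migrated: a no-op
        Ok(())
    }
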
+#[derive(Serialize)] pub struct MatchSummary(pub Vec); -#[derive(Deserialize, Serialize)] +#[derive(Serialize)] pub struct MatchSummaryEntry { pub rule_name: String, pub distinct_count: usize, @@ -410,7 +562,9 @@ impl std::fmt::Display for MatchSummary { // ------------------------------------------------------------------------------------------------- // MatchGroupMetadata // ------------------------------------------------------------------------------------------------- -#[derive(Debug, Deserialize, Serialize)] + +/// Metadata for a group of matches that have identical match content. +#[derive(Debug, Serialize)] pub struct MatchGroupMetadata { /// The name of the rule of all the matches in the group pub rule_name: String, diff --git a/crates/noseyparker/src/digest.rs b/crates/noseyparker/src/digest.rs new file mode 100644 index 000000000..6bd6db6a6 --- /dev/null +++ b/crates/noseyparker/src/digest.rs @@ -0,0 +1,8 @@ +pub use gix::features::hash::Sha1; +use hex::encode; + +pub fn sha1_hexdigest(input: &[u8]) -> String { + let mut h = Sha1::default(); + h.update(input); + encode(h.digest()) +} diff --git a/crates/noseyparker/src/git_binary.rs b/crates/noseyparker/src/git_binary.rs index a31a26730..c8530d065 100644 --- a/crates/noseyparker/src/git_binary.rs +++ b/crates/noseyparker/src/git_binary.rs @@ -4,9 +4,15 @@ use tracing::{debug, debug_span}; use crate::git_url::GitUrl; -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum GitError { - IOError(std::io::Error), + #[error("git execution failed: {0}")] + IOError(#[from] std::io::Error), + + #[error("git execution failed\ncode={}\nstdout=```\n{}```\nstderr=```\n{}```", + .status, + String::from_utf8_lossy(.stdout), + String::from_utf8_lossy(.stderr))] GitError { stdout: Vec, stderr: Vec, @@ -14,39 +20,6 @@ pub enum GitError { }, } -impl From for GitError { - fn from(err: std::io::Error) -> GitError { - GitError::IOError(err) - } -} - -impl std::fmt::Display for GitError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - GitError::IOError(e) => write!(f, "git execution failed: {e}"), - GitError::GitError { - stdout, - stderr, - status, - } => write!( - f, - "git execution failed\ncode={status}\nstdout=```\n{}```\nstderr=```\n{}```", - String::from_utf8_lossy(stdout), - String::from_utf8_lossy(stderr) - ), - } - } -} - -impl std::error::Error for GitError { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - GitError::IOError(e) => Some(e), - GitError::GitError { .. 
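
Editor's note: the new `digest` module above re-exports gix's SHA-1 and wraps it in `sha1_hexdigest`. The following is a hypothetical unit test for it (my addition, not part of this change set); the expected string is the standard SHA-1 test vector for "abc".

    #[cfg(test)]
    mod digest_tests {
        use super::sha1_hexdigest;

        #[test]
        fn hexdigest_matches_known_vector() {
            // SHA-1("abc") from the FIPS 180 test vectors
            assert_eq!(
                sha1_hexdigest(b"abc"),
                "a9993e364706816aba3e25717850c26c9cd0d89d"
            );
        }
    }
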
} => None, - } - } -} - pub struct Git { credentials: Vec, } diff --git a/crates/noseyparker/src/github/client.rs b/crates/noseyparker/src/github/client.rs index 820fa2673..8b60d7553 100644 --- a/crates/noseyparker/src/github/client.rs +++ b/crates/noseyparker/src/github/client.rs @@ -34,13 +34,13 @@ impl Client { pub async fn get_rate_limit(&self) -> Result { let response = self.get(&["rate_limit"]).await?; - let body = response.json().await.map_err(Error::ReqwestError)?; + let body = response.json().await?; Ok(body) } pub async fn get_user(&self, username: &str) -> Result { let response = self.get(&["users", username]).await?; - let body = response.json().await.map_err(Error::ReqwestError)?; + let body = response.json().await?; Ok(body) } @@ -126,12 +126,12 @@ fn url_from_path_parts_and_params( } buf.push_str(p); } - let url = base_url.join(&buf).map_err(Error::UrlParseError)?; + let url = base_url.join(&buf)?; let url = if params.is_empty() { - Url::parse(url.as_str()).map_err(Error::UrlParseError)? + Url::parse(url.as_str()) } else { - Url::parse_with_params(url.as_str(), params).map_err(Error::UrlParseError)? - }; + Url::parse_with_params(url.as_str(), params) + }?; Ok(url) } @@ -258,7 +258,7 @@ impl Client { }; // send request and wait for response - let response = request_builder.send().await.map_err(Error::ReqwestError)?; + let response = request_builder.send().await?; // Check for rate limiting. // @@ -279,7 +279,7 @@ impl Client { if response.status() == StatusCode::FORBIDDEN { if let Some(retry_after) = response.headers().get("Retry-After") { let wait = atoi::atoi::(retry_after.as_bytes()).map(Duration::seconds); - let client_error = response.json().await.map_err(Error::ReqwestError)?; + let client_error = response.json().await?; return Err(Error::RateLimited { client_error, wait }); } @@ -304,11 +304,12 @@ impl Client { Some(reset_time - date) }(); - let client_error = response.json().await.map_err(Error::ReqwestError)?; + let client_error = response.json().await?; return Err(Error::RateLimited { client_error, wait }); } } - response.error_for_status().map_err(Error::ReqwestError) + let response = response.error_for_status()?; + Ok(response) } } diff --git a/crates/noseyparker/src/github/client_builder.rs b/crates/noseyparker/src/github/client_builder.rs index bae0a9e48..e625e65b5 100644 --- a/crates/noseyparker/src/github/client_builder.rs +++ b/crates/noseyparker/src/github/client_builder.rs @@ -26,7 +26,7 @@ impl ClientBuilder { /// Use the specified base URL. 
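
Editor's note: the `GitError` rewrite above (and the matching one for the GitHub client error type further below) replaces hand-written `Display`, `Error::source`, and `From` impls with `thiserror` attributes, which is also what enables the plain `?` conversions in the client code. A minimal sketch of the pattern, using my own made-up types rather than the project's:

    use std::io::Read;

    #[derive(Debug, thiserror::Error)]
    enum FetchError {
        // #[error(...)] generates the Display impl; #[from] generates From<std::io::Error>
        // and wires up Error::source(), which is what makes plain `?` work below.
        #[error("i/o failed: {0}")]
        Io(#[from] std::io::Error),

        #[error("component {0:?} contains a slash")]
        BadComponent(String),
    }

    fn read_config(path: &str) -> Result<String, FetchError> {
        if path.contains('/') {
            return Err(FetchError::BadComponent(path.to_string()));
        }
        let mut text = String::new();
        std::fs::File::open(path)?.read_to_string(&mut text)?; // io::Error converts via #[from]
        Ok(text)
    }

    fn main() {
        match read_config("does-not-exist.toml") {
            Ok(text) => println!("{text}"),
            Err(err) => eprintln!("{err}"), // uses the derived Display
        }
    }
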
pub fn base_url(mut self, url: T) -> Result { - self.base_url = url.into_url().map_err(Error::ReqwestError)?; + self.base_url = url.into_url()?; Ok(self) } @@ -62,8 +62,7 @@ impl ClientBuilder { pub fn build(self) -> Result { let inner = reqwest::ClientBuilder::new() .user_agent(Self::USER_AGENT) - .build() - .map_err(Error::ReqwestError)?; + .build()?; Ok(Client { base_url: self.base_url, auth: self.auth, diff --git a/crates/noseyparker/src/github/error.rs b/crates/noseyparker/src/github/error.rs index 7d2f7585d..1bc8555fe 100644 --- a/crates/noseyparker/src/github/error.rs +++ b/crates/noseyparker/src/github/error.rs @@ -4,8 +4,9 @@ use super::models; // ------------------------------------------------------------------------------------------------- // Error // ------------------------------------------------------------------------------------------------- -#[derive(Debug)] +#[derive(Debug, thiserror::Error)] pub enum Error { + #[error("request was rate-limited: {}", .client_error.message)] RateLimited { /// The client error returned by GitHub client_error: models::ClientError, @@ -13,35 +14,19 @@ pub enum Error { /// The duration to wait until trying again wait: Option, }, + + #[error("invalid base url: {0}")] UrlBaseError(url::Url), - UrlParseError(url::ParseError), + + #[error("error parsing URL: {0}")] + UrlParseError(#[from] url::ParseError), + + #[error("error building URL: component {0:?} contains a slash")] UrlSlashError(String), - ReqwestError(reqwest::Error), - InvalidTokenEnvVar(String), -} -impl std::fmt::Display for Error { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - match self { - Error::RateLimited{client_error, ..} => write!(f, "request was rate-limited: {}", client_error.message), - Error::UrlBaseError(u) =>write!(f, "invalid base url: {u}"), - Error::UrlParseError(e) => write!(f, "error parsing URL: {e}"), - Error::UrlSlashError(p) => write!(f, "error building URL: component {p:?} contains a slash"), - Error::ReqwestError(e) => write!(f, "error making request: {e}"), - Error::InvalidTokenEnvVar(v) => write!(f, "error loading token: ill-formed value of {v} environment variable"), - } - } -} + #[error("error making request: {0}")] + ReqwestError(#[from] reqwest::Error), -impl std::error::Error for Error { - fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { - match self { - Error::RateLimited{..} => None, - Error::UrlBaseError(_) => None, - Error::UrlParseError(e) => Some(e), - Error::UrlSlashError(_) => None, - Error::ReqwestError(e) => Some(e), - Error::InvalidTokenEnvVar(_) => None, - } - } + #[error("error loading token: ill-formed value of {0} environment variable")] + InvalidTokenEnvVar(String), } diff --git a/crates/noseyparker/src/github/models/page.rs b/crates/noseyparker/src/github/models/page.rs index e522d1ed2..8e9d4a130 100644 --- a/crates/noseyparker/src/github/models/page.rs +++ b/crates/noseyparker/src/github/models/page.rs @@ -1,4 +1,4 @@ -use crate::github::{Error, Result}; +use crate::github::Result; use url::Url; // ------------------------------------------------------------------------------------------------- @@ -9,10 +9,10 @@ pub struct Page { pub links: HeaderLinks, } -impl Page { +impl Page { pub async fn from_response(response: reqwest::Response) -> Result { let links = get_header_links(&response)?; - let items = response.json().await.map_err(Error::ReqwestError)?; + let items = response.json().await?; Ok(Page { items, links }) } } @@ -46,13 +46,17 @@ fn get_header_links(response: &reqwest::Response) 
-> Result { _ => None, }; if let Some(dst) = dst { - *dst = Some(Url::parse(value.link()).map_err(Error::UrlParseError)?); - }; + *dst = Some(Url::parse(value.link())?); + } } } } } - Ok(HeaderLinks { first, prev, next, last }) + Ok(HeaderLinks { + first, + prev, + next, + last, + }) } - diff --git a/crates/noseyparker/src/input_enumerator.rs b/crates/noseyparker/src/input_enumerator.rs index 6e4ad82ce..64e27eaef 100644 --- a/crates/noseyparker/src/input_enumerator.rs +++ b/crates/noseyparker/src/input_enumerator.rs @@ -105,7 +105,7 @@ impl<'t> Drop for Visitor<'t> { impl<'t> ignore::ParallelVisitor for Visitor<'t> { fn visit(&mut self, result: Result) -> ignore::WalkState { - // FIXME: dedupe based on (device, inode) on platforms where available + // FIXME: dedupe based on (device, inode) on platforms where available; see https://docs.rs/same-file/1.0.6/same_file/ for ideas let entry = match result { Err(e) => { @@ -172,6 +172,13 @@ impl<'t> ignore::ParallelVisitor for Visitor<'t> { } } +/// Provides capabitilies to recursively enumerate a filesystem. +/// +/// This provides a handful of features, including: +/// +/// - Enumeration of found files +/// - Enumeration of blobs found in Git repositories +/// - Support for ignoring files based on size or using path-based gitignore-style rules pub struct FilesystemEnumerator { walk_builder: WalkBuilder, @@ -185,6 +192,12 @@ impl FilesystemEnumerator { pub const DEFAULT_MAX_FILESIZE: u64 = 100 * 1024 * 1024; pub const DEFAULT_FOLLOW_LINKS: bool = false; + /// Create a new `FilesystemEnumerator` with the given set of input roots using default + /// settings. + /// + /// The default maximum file size is 100 MiB. + /// + /// The default behavior is to not follow symlinks. pub fn new>(inputs: &[T]) -> Result { let mut builder = WalkBuilder::new(&inputs[0]); for input in &inputs[1..] { @@ -201,11 +214,13 @@ impl FilesystemEnumerator { }) } + /// Set the number of parallel enumeration threads. pub fn threads(&mut self, threads: usize) -> &mut Self { self.walk_builder.threads(threads); self } + /// Add a set of gitignore-style rules from the given ignore file. pub fn add_ignore>(&mut self, path: T) -> Result<&mut Self> { match self.walk_builder.add_ignore(path) { Some(e) => Err(e)?, @@ -213,20 +228,27 @@ impl FilesystemEnumerator { } } + /// Enable or disable whether symbolic links are followed. pub fn follow_links(&mut self, follow_links: bool) -> &mut Self { self.walk_builder.follow_links(follow_links); self } + /// Set the maximum file size for enumerated files. + /// + /// Files larger than this value will be skipped. pub fn max_filesize(&mut self, max_filesize: Option) -> &mut Self { self.walk_builder.max_filesize(max_filesize); self.max_file_size = max_filesize; self } + /// Specify an ad-hoc filtering function to control which entries are enumerated. + /// + /// This can be used to skip entire directories. pub fn filter_entry
<P>
(&mut self, filter: P) -> &mut Self - where - P: Fn(&DirEntry) -> bool + Send + Sync + 'static + where + P: Fn(&DirEntry) -> bool + Send + Sync + 'static, { self.walk_builder.filter_entry(filter); self @@ -262,7 +284,7 @@ pub fn open_git_repo(path: &Path) -> Result> { match gix::open_opts(path, opts) { Err(gix::open::Error::NotARepository { .. }) => Ok(None), Err(err) => Err(err.into()), - Ok(r) => Ok(Some(r)), + Ok(repo) => Ok(Some(repo)), } } diff --git a/crates/noseyparker/src/lib.rs b/crates/noseyparker/src/lib.rs index f55471089..aa5ee4af9 100644 --- a/crates/noseyparker/src/lib.rs +++ b/crates/noseyparker/src/lib.rs @@ -1,9 +1,11 @@ pub mod blob; pub mod blob_id; pub mod blob_id_set; +pub mod blob_metadata; pub mod bstring_escape; pub mod datastore; pub mod defaults; +pub mod digest; pub mod git_binary; pub mod git_url; pub mod github; @@ -12,6 +14,7 @@ pub mod location; pub mod match_type; pub mod matcher; pub mod matcher_stats; +pub use content_guesser; pub mod progress; pub mod provenance; #[cfg(feature = "rule_profiling")] diff --git a/crates/noseyparker/src/match_type.rs b/crates/noseyparker/src/match_type.rs index 8d18d1c6c..2816c237b 100644 --- a/crates/noseyparker/src/match_type.rs +++ b/crates/noseyparker/src/match_type.rs @@ -6,12 +6,11 @@ use crate::snippet::Snippet; use crate::utils::BStringSerde; use bstr::BString; -use serde::{Deserialize, Serialize}; // ------------------------------------------------------------------------------------------------- // Match // ------------------------------------------------------------------------------------------------- -#[derive(Debug, Clone, Deserialize, Serialize)] +#[derive(Debug, Clone, serde::Serialize)] pub struct Match { /// The blob this match comes from pub blob_id: BlobId, diff --git a/crates/noseyparker/src/provenance.rs b/crates/noseyparker/src/provenance.rs index 0edefbe3b..60a68f4c7 100644 --- a/crates/noseyparker/src/provenance.rs +++ b/crates/noseyparker/src/provenance.rs @@ -15,3 +15,12 @@ pub enum Provenance { path: PathBuf, }, } + +impl std::fmt::Display for Provenance { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Provenance::File { path } => write!(f, "file {:?}", path), + Provenance::GitRepo { path } => write!(f, "git repo {:?}", path), + } + } +} diff --git a/crates/noseyparker/src/utils.rs b/crates/noseyparker/src/utils.rs index dbf1c9f2e..a557283a3 100644 --- a/crates/noseyparker/src/utils.rs +++ b/crates/noseyparker/src/utils.rs @@ -32,16 +32,14 @@ impl From for BString { } } -#[inline] -pub fn serialize_bytes_string_lossy( +fn serialize_bytes_string_lossy( bytes: &[u8], s: S, ) -> Result { s.serialize_str(&String::from_utf8_lossy(bytes)) } -#[inline] -pub fn deserialize_bytes_string<'de, D: serde::Deserializer<'de>>( +fn deserialize_bytes_string<'de, D: serde::Deserializer<'de>>( d: D, ) -> Result, D::Error> { let s: &str = serde::Deserialize::deserialize(d)?;
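
Editor's note: the `FilesystemEnumerator` documented above wraps the `ignore` crate's `WalkBuilder`, which is where the size cap, symlink handling, gitignore-style rules, and the `filter_entry` hook ultimately land. A rough usage sketch of that underlying API (my example, not code from this change):

    use ignore::WalkBuilder;

    fn main() {
        let walker = WalkBuilder::new(".")
            .follow_links(false)                   // mirrors the enumerator's default
            .max_filesize(Some(100 * 1024 * 1024)) // skip anything over 100 MiB
            .filter_entry(|entry| entry.file_name() != "node_modules") // prune whole subtrees
            .build();

        for result in walker {
            match result {
                Ok(entry) => println!("{}", entry.path().display()),
                Err(err) => eprintln!("enumeration error: {err}"),
            }
        }
    }
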