From 2127fb174ea762f8c000017aa8864a805c86850d Mon Sep 17 00:00:00 2001
From: Eric Arellano <14852634+Eric-Arellano@users.noreply.github.com>
Date: Sat, 29 Jan 2022 14:59:20 -0700
Subject: [PATCH] Merge `validate` goal with `lint` goal (#14102)

This adds generic support for `lint` implementations that do not deal with targets. That allows us to merge `validate` into `lint`, which is much cleaner.

## CLI specs

As before with the `validate` goal, it's not very intuitive how to get Pants to run on files not owned by targets, which you want for `validate`. `::` only matches files owned by targets, whereas `**` matches _all_ files regardless of targets.

So, users of `regex-lint` should typically use `./pants lint '**'` rather than `./pants lint ::`, which is not intuitive

https://docs.google.com/document/d/1WWQM-X6kHoSCKwItqf61NiKFWNSlpnTC5QNu3ul9RDk/edit#heading=h.1h4j0d5mazhu proposes changing `::` to match all files, so you can simply use `./pants lint ::`. I don't think we need to block on this proposal? This is still forward progress, and also `validate`/`regex-lint` is not used very much fwict.

## Batching

We don't yet batch per https://github.com/pantsbuild/pants/pull/14186, although it would be trivial for us to hook up. I'm only waiting to do it till we can better reason about if it makes sense to apply here too.

## The `fmt` goal

Note that we need more design for `fmt` before we can apply this same change there. fmt is tricky because we run each formatter for a certain language sequentially so that they don't overwrite each other; but we run distinct languages in parallel. We would need some way to know which "language" target-less files are for.

## "Inferred targets"

A related technology would be inferred targets, where you don't need a BUILD flie but we still have a target: https://github.com/pantsbuild/pants/issues/14074.

This is a complementary technology. The main difference here is that we can operate on files that will _never_ have an owning target, such as a BUILD file itself.

[ci skip-rust]
[ci skip-build-wheels]
---
 .github/workflows/test-cron.yaml              |   6 +-
 .github/workflows/test.yaml                   |   6 +-
 .../bin/generate_github_workflows.py          |   4 +-
 .../pants/backend/project_info/regex_lint.py  | 161 ++++++++++++++----
 src/python/pants/core/goals/lint.py           |  79 ++++++---
 src/python/pants/core/goals/lint_test.py      |  52 ++++--
 .../pants/help/help_integration_test.py       |   2 +-
 7 files changed, 229 insertions(+), 81 deletions(-)

diff --git a/.github/workflows/test-cron.yaml b/.github/workflows/test-cron.yaml
index a1a72497e67..26e327f881f 100644
--- a/.github/workflows/test-cron.yaml
+++ b/.github/workflows/test-cron.yaml
@@ -340,11 +340,9 @@ jobs:
 
         '
     - name: Lint
-      run: './pants validate ''**''
+      run: './pants update-build-files --check
 
-        ./pants update-build-files --check
-
-        ./pants lint check ::
+        ./pants lint check ''**''
 
         '
     - if: always()
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index e6ccfb5a8c7..63b24c1fe3e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -515,11 +515,9 @@ jobs:
 
         '
     - name: Lint
-      run: './pants validate ''**''
+      run: './pants update-build-files --check
 
-        ./pants update-build-files --check
-
-        ./pants lint check ::
+        ./pants lint check ''**''
 
         '
     - if: always()
diff --git a/build-support/bin/generate_github_workflows.py b/build-support/bin/generate_github_workflows.py
index 64322da78e8..d0863fe29af 100644
--- a/build-support/bin/generate_github_workflows.py
+++ b/build-support/bin/generate_github_workflows.py
@@ -380,9 +380,9 @@ def test_workflow_jobs(python_versions: list[str], *, cron: bool) -> Jobs:
                 {
                     "name": "Lint",
                     "run": (
-                        "./pants validate '**'\n"
                         "./pants update-build-files --check\n"
-                        "./pants lint check ::\n"
+                        # Note: we use `**` rather than `::` because regex-lint.
+                        "./pants lint check '**'\n"
                     ),
                 },
                 upload_log_artifacts(name="lint"),
diff --git a/src/python/pants/backend/project_info/regex_lint.py b/src/python/pants/backend/project_info/regex_lint.py
index db5d4447331..495e72b11e9 100644
--- a/src/python/pants/backend/project_info/regex_lint.py
+++ b/src/python/pants/backend/project_info/regex_lint.py
@@ -10,15 +10,18 @@
 from enum import Enum
 from typing import Any, cast
 
-from pants.base.deprecated import resolve_conflicting_options
+from pants.base.deprecated import resolve_conflicting_options, warn_or_error
 from pants.base.exiter import PANTS_FAILED_EXIT_CODE, PANTS_SUCCEEDED_EXIT_CODE
+from pants.core.goals.lint import LintFilesRequest, LintResult, LintResults
 from pants.engine.collection import Collection
 from pants.engine.console import Console
-from pants.engine.fs import Digest, DigestContents, SpecsSnapshot
+from pants.engine.fs import Digest, DigestContents, PathGlobs, SpecsSnapshot
 from pants.engine.goal import Goal, GoalSubsystem
-from pants.engine.rules import Get, collect_rules, goal_rule
+from pants.engine.rules import Get, collect_rules, goal_rule, rule
+from pants.engine.unions import UnionRule
 from pants.option.subsystem import Subsystem
 from pants.util.frozendict import FrozenDict
+from pants.util.logging import LogLevel
 from pants.util.memo import memoized_method
 
 logger = logging.getLogger(__name__)
@@ -43,7 +46,7 @@ class DetailLevel(Enum):
 
 class ValidateSubsystem(GoalSubsystem):
     name = "validate"
-    help = "Validate sources against regexes."
+    help = "Deprecated: to use, set `[regex-lint].config` and run with the `lint` goal"
 
     @classmethod
     def register_options(cls, register):
@@ -94,7 +97,14 @@ def from_dict(cls, d: dict[str, Any]) -> ValidationConfig:
 
 class RegexLintSubsystem(Subsystem):
     options_scope = "regex-lint"
-    help = "Lint your code using regex patterns, e.g. to check for copyright headers."
+    help = (
+        "Lint your code using regex patterns, e.g. to check for copyright headers.\n\n"
+        "To activate this with the `lint` goal, you must set `[regex-lint].config`.\n\n"
+        "Unlike other linters, this can run on files not owned by targets, such as BUILD files. To "
+        "run on those, use `lint '**'` rather than `lint ::`, for example. Unfortunately, "
+        "`--changed-since=<sha>` does not yet cause this linter to run. We are exploring how to "
+        "improve both these gotchas."
+    )
 
     deprecated_options_scope = "sourcefile-validation"
     deprecated_options_scope_removal_version = "2.11.0.dev0"
@@ -103,39 +113,47 @@ class RegexLintSubsystem(Subsystem):
     def register_options(cls, register):
         schema_help = textwrap.dedent(
             """\
-            Config schema is as follows:
-
+            ```
+            {
+              'required_matches': {
+                'path_pattern1': [content_pattern1, content_pattern2],
+                'path_pattern2': [content_pattern1, content_pattern3],
+                ...
+              },
+              'path_patterns': [
+                {
+                  'name': path_pattern1',
+                  'pattern': <path regex pattern>,
+                  'inverted': True|False (defaults to False),
+                  'content_encoding': <encoding> (defaults to utf8)
+                },
+                ...
+              ],
+              'content_patterns': [
                 {
-                  'required_matches': {
-                    'path_pattern1': [content_pattern1, content_pattern2],
-                    'path_pattern2': [content_pattern1, content_pattern3],
-                    ...
-                  },
-                  'path_patterns': [
-                    {
-                      'name': path_pattern1',
-                      'pattern': <path regex pattern>,
-                      'inverted': True|False (defaults to False),
-                      'content_encoding': <encoding> (defaults to utf8)
-                    },
-                    ...
-                  ],
-                  'content_patterns': [
-                    {
-                      'name': 'content_pattern1',
-                      'pattern': <content regex pattern>,
-                      'inverted': True|False (defaults to False)
-                    }
-                    ...
-                  ]
+                  'name': 'content_pattern1',
+                  'pattern': <content regex pattern>,
+                  'inverted': True|False (defaults to False)
                 }
-
-            Meaning: if a file matches some path pattern, its content must match all
-            the corresponding content patterns.
+                ...
+              ]
+            }
+            ```
             """
         )
         super().register_options(register)
-        register("--config", type=dict, fromfile=True, help=schema_help)
+        register(
+            "--config",
+            type=dict,
+            fromfile=True,
+            help=(
+                f"Config schema is as follows:\n\n{schema_help}\n\n"
+                "Meaning: if a file matches some path pattern, its content must match all the "
+                "corresponding content patterns.\n\n"
+                "It's often helpful to load this config from a JSON or YAML file. To do that, set "
+                "`[regex-lint].config = '@path/to/config.yaml'`, for example."
+            ),
+        )
         register(
             "--detail-level",
             type=DetailLevel,
@@ -303,15 +321,27 @@ def get_applicable_content_pattern_names(self, path):
         return applicable_content_pattern_names, content_encoding
 
 
-# TODO: Consider switching this to `lint`. The main downside is that we would no longer be able to
-#  run on files with no owning targets, such as running on BUILD files.
 @goal_rule
-async def validate(
+async def validate_goal(
     console: Console,
     specs_snapshot: SpecsSnapshot,
     validate_subsystem: ValidateSubsystem,
     regex_lint_subsystem: RegexLintSubsystem,
 ) -> Validate:
+    warn_or_error(
+        "2.11.0.dev0",
+        "the `validate` goal",
+        (
+            "The `regex-lint` check run by the `validate` goal is now run as part of the "
+            "`lint` goal. So long as you have set up `[regex-lint].config` "
+            "(or the deprecated `[sourcefile-validation].config`), this checker will run.\n\n"
+            "Note that if you were running `validate '**'` in order to check files without "
+            "owning targets (e.g. to check BUILD file contents), then you will need to run "
+            "`lint '**'` rather than `lint ::`. Also, `--changed-since=<sha>` will not cause "
+            "`regex-lint` to run, same as it how `validate` worked. We are exploring how to remove "
+            "both these gotchas."
+        ),
+    )
     multi_matcher = regex_lint_subsystem.get_multi_matcher()
     if multi_matcher is None:
         logger.error(
@@ -365,5 +395,62 @@ async def validate(
     return Validate(exit_code)
 
 
+class RegexLintRequest(LintFilesRequest):
+    name = "regex-lint"
+
+
+@rule(desc="Lint with regex patterns", level=LogLevel.DEBUG)
+async def lint_with_regex_patterns(
+    request: RegexLintRequest,
+    validate_subsystem: ValidateSubsystem,
+    regex_lint_subsystem: RegexLintSubsystem,
+) -> LintResults:
+    multi_matcher = regex_lint_subsystem.get_multi_matcher()
+    if multi_matcher is None:
+        return LintResults((), linter_name=request.name)
+
+    digest_contents = await Get(DigestContents, PathGlobs(request.file_paths))
+    regex_match_results = RegexMatchResults(
+        multi_matcher.check_source_file(file_content.path, file_content.content)
+        for file_content in sorted(digest_contents, key=lambda fc: fc.path)
+    )
+
+    stdout = ""
+    detail_level = regex_lint_subsystem.detail_level(validate_subsystem)
+    num_matched_all = 0
+    num_nonmatched_some = 0
+    for rmr in regex_match_results:
+        if not rmr.matching and not rmr.nonmatching:
+            continue
+        if detail_level == DetailLevel.names:
+            if rmr.nonmatching:
+                stdout += f"{rmr.path}\n"
+            continue
+
+        if rmr.nonmatching:
+            icon = "X"
+            num_nonmatched_some += 1
+        else:
+            icon = "V"
+            num_matched_all += 1
+        matched_msg = " Matched: {}".format(",".join(rmr.matching)) if rmr.matching else ""
+        nonmatched_msg = (
+            " Didn't match: {}".format(",".join(rmr.nonmatching)) if rmr.nonmatching else ""
+        )
+        if detail_level == DetailLevel.all or (
+            detail_level == DetailLevel.nonmatching and nonmatched_msg
+        ):
+            stdout += f"{icon} {rmr.path}:{matched_msg}{nonmatched_msg}\n"
+
+    if detail_level not in (DetailLevel.none, DetailLevel.names):
+        if stdout:
+            stdout += "\n"
+        stdout += f"{num_matched_all} files matched all required patterns.\n"
+        stdout += f"{num_nonmatched_some} files failed to match at least one required pattern."
+
+    exit_code = PANTS_FAILED_EXIT_CODE if num_nonmatched_some else PANTS_SUCCEEDED_EXIT_CODE
+    return LintResults((LintResult(exit_code, stdout, ""),), linter_name=request.name)
+
+
 def rules():
-    return collect_rules()
+    return (*collect_rules(), UnionRule(LintFilesRequest, RegexLintRequest))
diff --git a/src/python/pants/core/goals/lint.py b/src/python/pants/core/goals/lint.py
index 0c7255af0ed..f8ea86c07df 100644
--- a/src/python/pants/core/goals/lint.py
+++ b/src/python/pants/core/goals/lint.py
@@ -6,13 +6,13 @@
 import itertools
 import logging
 from dataclasses import dataclass
-from typing import Any, Iterable, cast
+from typing import Any, ClassVar, Iterable, cast
 
 from pants.core.goals.style_request import StyleRequest, style_batch_size_help, write_reports
 from pants.core.util_rules.distdir import DistDir
 from pants.engine.console import Console
-from pants.engine.engine_aware import EngineAwareReturnType
-from pants.engine.fs import EMPTY_DIGEST, Digest, Workspace
+from pants.engine.engine_aware import EngineAwareParameter, EngineAwareReturnType
+from pants.engine.fs import EMPTY_DIGEST, Digest, SpecsSnapshot, Workspace
 from pants.engine.goal import Goal, GoalSubsystem
 from pants.engine.process import FallibleProcessResult
 from pants.engine.rules import Get, MultiGet, collect_rules, goal_rule
@@ -134,6 +134,19 @@ class LintRequest(StyleRequest):
     """
 
 
+@union
+@dataclass(frozen=True)
+class LintFilesRequest(EngineAwareParameter):
+    """The entry point for linters that do not use targets."""
+
+    name: ClassVar[str]
+
+    file_paths: tuple[str, ...]
+
+    def debug_hint(self) -> str:
+        return self.name
+
+
 # If a user wants linter reports to show up in dist/ they must ensure that the reports
 # are written under this directory. E.g.,
 # ./pants --flake8-args="--output-file=reports/report.txt" lint <target>
@@ -199,43 +212,63 @@ async def lint(
     console: Console,
     workspace: Workspace,
     targets: Targets,
+    specs_snapshot: SpecsSnapshot,
     lint_subsystem: LintSubsystem,
     union_membership: UnionMembership,
     dist_dir: DistDir,
 ) -> Lint:
-    request_types = cast("Iterable[type[LintRequest]]", union_membership[LintRequest])
-    requests = tuple(
+    target_request_types = cast("Iterable[type[LintRequest]]", union_membership[LintRequest])
+    target_requests = tuple(
         request_type(
             request_type.field_set_type.create(target)
             for target in targets
             if request_type.field_set_type.is_applicable(target)
         )
-        for request_type in request_types
+        for request_type in target_request_types
+    )
+    file_requests = tuple(
+        request_type(specs_snapshot.snapshot.files)
+        for request_type in union_membership[LintFilesRequest]
     )
 
     if lint_subsystem.per_file_caching:
-        all_batch_results = await MultiGet(
-            Get(LintResults, LintRequest, request.__class__([field_set]))
-            for request in requests
-            if request.field_sets
-            for field_set in request.field_sets
-        )
+        all_requests = [
+            *(
+                Get(LintResults, LintRequest, request.__class__([field_set]))
+                for request in target_requests
+                if request.field_sets
+                for field_set in request.field_sets
+            ),
+            *(
+                Get(LintResults, LintFilesRequest, request.__class__((fp,)))
+                for request in file_requests
+                for fp in request.file_paths
+            ),
+        ]
     else:
 
         def address_str(fs: FieldSet) -> str:
             return fs.address.spec
 
-        all_batch_results = await MultiGet(
-            Get(LintResults, LintRequest, request.__class__(field_set_batch))
-            for request in requests
-            if request.field_sets
-            for field_set_batch in partition_sequentially(
-                request.field_sets,
-                key=address_str,
-                size_target=lint_subsystem.batch_size,
-                size_max=4 * lint_subsystem.batch_size,
-            )
-        )
+        all_requests = [
+            *(
+                Get(LintResults, LintRequest, request.__class__(field_set_batch))
+                for request in target_requests
+                if request.field_sets
+                for field_set_batch in partition_sequentially(
+                    request.field_sets,
+                    key=address_str,
+                    size_target=lint_subsystem.batch_size,
+                    size_max=4 * lint_subsystem.batch_size,
+                )
+            ),
+            *(Get(LintResults, LintFilesRequest, request) for request in file_requests),
+        ]
+
+    all_batch_results = cast(
+        "tuple[LintResults, ...]",
+        await MultiGet(all_requests),  # type: ignore[arg-type]
+    )
 
     def key_fn(results: LintResults):
         return results.linter_name
diff --git a/src/python/pants/core/goals/lint_test.py b/src/python/pants/core/goals/lint_test.py
index f21dd6caae3..f180f444352 100644
--- a/src/python/pants/core/goals/lint_test.py
+++ b/src/python/pants/core/goals/lint_test.py
@@ -8,10 +8,18 @@
 
 import pytest
 
-from pants.core.goals.lint import Lint, LintRequest, LintResult, LintResults, LintSubsystem, lint
+from pants.core.goals.lint import (
+    Lint,
+    LintFilesRequest,
+    LintRequest,
+    LintResult,
+    LintResults,
+    LintSubsystem,
+    lint,
+)
 from pants.core.util_rules.distdir import DistDir
 from pants.engine.addresses import Address
-from pants.engine.fs import Workspace
+from pants.engine.fs import SpecsSnapshot, Workspace
 from pants.engine.target import FieldSet, MultipleSourcesField, Target, Targets
 from pants.engine.unions import UnionMembership
 from pants.testutil.option_util import create_goal_subsystem
@@ -97,6 +105,14 @@ def exit_code(_: Iterable[Address]) -> int:
         return -1
 
 
+class MockFilesRequest(LintFilesRequest):
+    name = "FilesLinter"
+
+    @property
+    def lint_results(self) -> LintResults:
+        return LintResults([LintResult(0, "", "")], linter_name=self.name)
+
+
 @pytest.fixture
 def rule_runner() -> RuleRunner:
     return RuleRunner()
@@ -111,22 +127,31 @@ def run_lint_rule(
     *,
     lint_request_types: List[Type[LintRequest]],
     targets: List[Target],
+    run_files_linter: bool = False,
     per_file_caching: bool = False,
     batch_size: int = 128,
 ) -> Tuple[int, str]:
+    union_membership = UnionMembership(
+        {
+            LintRequest: lint_request_types,
+            LintFilesRequest: [MockFilesRequest] if run_files_linter else [],
+        }
+    )
+    lint_subsystem = create_goal_subsystem(
+        LintSubsystem,
+        per_file_caching=per_file_caching,
+        batch_size=batch_size,
+    )
+    specs_snapshot = SpecsSnapshot(rule_runner.make_snapshot_of_empty_files(["f.txt"]))
     with mock_console(rule_runner.options_bootstrapper) as (console, stdio_reader):
-        union_membership = UnionMembership({LintRequest: lint_request_types})
         result: Lint = run_rule_with_mocks(
             lint,
             rule_args=[
                 console,
                 Workspace(rule_runner.scheduler, _enforce_effects=False),
                 Targets(targets),
-                create_goal_subsystem(
-                    LintSubsystem,
-                    per_file_caching=per_file_caching,
-                    batch_size=batch_size,
-                ),
+                specs_snapshot,
+                lint_subsystem,
                 union_membership,
                 DistDir(relpath=Path("dist")),
             ],
@@ -134,8 +159,13 @@ def run_lint_rule(
                 MockGet(
                     output_type=LintResults,
                     input_type=LintRequest,
-                    mock=lambda field_set_collection: field_set_collection.lint_results,
-                )
+                    mock=lambda mock_request: mock_request.lint_results,
+                ),
+                MockGet(
+                    output_type=LintResults,
+                    input_type=LintFilesRequest,
+                    mock=lambda mock_request: mock_request.lint_results,
+                ),
             ],
             union_membership=union_membership,
         )
@@ -176,6 +206,7 @@ def test_summary(rule_runner: RuleRunner, per_file_caching: bool) -> None:
         ],
         targets=[make_target(good_address), make_target(bad_address)],
         per_file_caching=per_file_caching,
+        run_files_linter=True,
     )
     assert exit_code == FailingRequest.exit_code([bad_address])
     assert stderr == dedent(
@@ -183,6 +214,7 @@ def test_summary(rule_runner: RuleRunner, per_file_caching: bool) -> None:
 
         𐄂 ConditionallySucceedsLinter failed.
         𐄂 FailingLinter failed.
+        ✓ FilesLinter succeeded.
         ✓ SuccessfulLinter succeeded.
         """
     )
diff --git a/src/python/pants/help/help_integration_test.py b/src/python/pants/help/help_integration_test.py
index b5b686ace34..9f55f2ce1c4 100644
--- a/src/python/pants/help/help_integration_test.py
+++ b/src/python/pants/help/help_integration_test.py
@@ -92,7 +92,7 @@ def test_help_goals() -> None:
 def test_help_goals_only_show_implemented() -> None:
     # Some core goals, such as `./pants test`, require downstream implementations to work
     # properly. We should only show those goals when an implementation is provided.
-    goals_that_need_implementation = ["binary", "fmt", "lint", "run", "test"]
+    goals_that_need_implementation = ["fmt", "test"]
     command = ["--pants-config-files=[]", "help", "goals"]
 
     not_implemented_run = run_pants(["--backend-packages=[]", *command])