Use URLs as default canonical IDs in common repo rules
RELNOTES[INC]: The `--experimental_repository_cache_urls_as_default_canonical_id` flag is no longer available. Instead, the `http_archive`, `http_file`, `http_jar`, `jvm_maven_import_external`, and `jvm_import_external` repository rules now use the URLs as the canonical ID if none is provided explicitly. If this behavior is not desired, it can be disabled via `--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0`.
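For illustration, a minimal sketch of what the new default means for a `WORKSPACE` file using `http_archive` (the repository names, URLs, and checksums below are placeholders, not taken from this change):

```starlark
load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")

# No canonical_id given: with this change the URL list itself becomes the
# canonical ID, so editing `urls` forces a re-download even though an entry
# with the same sha256 already exists in the repository cache.
http_archive(
    name = "my_dep",
    urls = ["https://example.com/my_dep-1.0.tar.gz"],
    sha256 = "0000000000000000000000000000000000000000000000000000000000000000",
)

# Explicit canonical_id: the cache entry stays valid across URL edits,
# matching the pre-change behavior for this one repository.
http_archive(
    name = "my_dep_pinned",
    urls = ["https://mirror.example.com/my_dep-1.0.tar.gz"],
    sha256 = "0000000000000000000000000000000000000000000000000000000000000000",
    canonical_id = "my_dep-1.0",
)
```

To opt out globally rather than per repository, the release note's `--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0` can be added to `.bazelrc`.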

Fixes bazelbuild#19749

Closes bazelbuild#20047.

PiperOrigin-RevId: 580826938
Change-Id: I32bfb04084ce75b74f664794e1924d3188f58805
fmeum committed Nov 9, 2023
1 parent b4c199e commit 99bf3ee
Showing 15 changed files with 145 additions and 65 deletions.
12 changes: 6 additions & 6 deletions MODULE.bazel.lock


3 changes: 3 additions & 0 deletions scripts/bootstrap/bootstrap.sh
@@ -31,11 +31,14 @@ fi

: ${JAVA_VERSION:="11"}

# TODO: remove `--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0` once all dependencies are
# mirrored. See https://github.com/bazelbuild/bazel/pull/19549 for more context.
_BAZEL_ARGS="--spawn_strategy=standalone \
--nojava_header_compilation \
--strategy=Javac=worker --worker_quit_after_build --ignore_unsupported_sandboxing \
--compilation_mode=opt \
--repository_cache=derived/repository_cache \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
--extra_toolchains=//scripts/bootstrap:all \
--extra_toolchains=@bazel_tools//tools/python:autodetecting_toolchain \
--enable_bzlmod \
@@ -345,7 +345,6 @@ public void beforeCommand(CommandEnvironment env) throws AbruptExitException {
if (repoOptions.repositoryDownloaderRetries >= 0) {
downloadManager.setRetries(repoOptions.repositoryDownloaderRetries);
}
downloadManager.setUrlsAsDefaultCanonicalId(repoOptions.urlsAsDefaultCanonicalId);

repositoryCache.setHardlink(repoOptions.useHardlinks);
if (repoOptions.experimentalScaleTimeouts > 0.0) {
@@ -264,15 +264,15 @@ public Converter() {

@Option(
name = "experimental_repository_cache_urls_as_default_canonical_id",
defaultValue = "false",
documentationCategory = OptionDocumentationCategory.BAZEL_CLIENT_OPTIONS,
effectTags = {OptionEffectTag.LOADING_AND_ANALYSIS},
metadataTags = {OptionMetadataTag.EXPERIMENTAL},
help =
"If true, use a string derived from the URLs of repository downloads as the canonical_id "
+ "if not specified. This causes a change in the URLs to result in a redownload even "
+ "if the cache contains a download with the same hash. This can be used to verify "
+ "that URL changes don't result in broken repositories being masked by the cache.")
defaultValue = "true",
documentationCategory = OptionDocumentationCategory.UNDOCUMENTED,
metadataTags = OptionMetadataTag.DEPRECATED,
effectTags = {OptionEffectTag.NO_OP},
deprecationWarning =
"This behavior is enabled by default for http_* and jvm_* rules and no "
+ "longer controlled by this flag. Use "
+ "--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 to disable it instead.",
help = "No-op.")
public boolean urlsAsDefaultCanonicalId;

@Option(
@@ -42,7 +42,6 @@
import java.util.Map;
import java.util.Map.Entry;
import java.util.Optional;
import java.util.stream.Collectors;
import javax.annotation.Nullable;

/**
@@ -59,7 +58,6 @@ public class DownloadManager {
private final Downloader downloader;
private boolean disableDownload = false;
private int retries = 0;
private boolean urlsAsDefaultCanonicalId;
@Nullable private Credentials netrcCreds;
private CredentialFactory credentialFactory = StaticCredentials::new;

@@ -90,10 +88,6 @@ public void setRetries(int retries) {
this.retries = retries;
}

public void setUrlsAsDefaultCanonicalId(boolean urlsAsDefaultCanonicalId) {
this.urlsAsDefaultCanonicalId = urlsAsDefaultCanonicalId;
}

public void setNetrcCreds(Credentials netrcCreds) {
this.netrcCreds = netrcCreds;
}
@@ -134,9 +128,6 @@ public Path download(
if (Thread.interrupted()) {
throw new InterruptedException();
}
if (Strings.isNullOrEmpty(canonicalId) && urlsAsDefaultCanonicalId) {
canonicalId = originalUrls.stream().map(URL::toExternalForm).collect(Collectors.joining(" "));
}

// TODO(andreisolo): This code path is inconsistent as the authHeaders are fetched from a
// .netrc only if it comes from a http_{archive,file,jar} - and it is handled directly
@@ -78,6 +78,9 @@ public void setup(BlackBoxTestContext context) throws IOException {
String sharedRepoCache = System.getenv("REPOSITORY_CACHE");
if (sharedRepoCache != null) {
lines.add("common --repository_cache=" + sharedRepoCache);
// TODO: Remove this flag once all dependencies are mirrored.
// See https://github.com/bazelbuild/bazel/pull/19549 for more context.
lines.add("common --repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0");
if (OS.getCurrent() == OS.DARWIN) {
// For reducing SSD usage on our physical Mac machines.
lines.add("common --experimental_repository_cache_hardlinks");
4 changes: 4 additions & 0 deletions src/test/py/bazel/bzlmod/bazel_fetch_test.py
@@ -61,6 +61,10 @@ def generatBuiltinModules(self):
self.ScratchFile('tools_mock/WORKSPACE')
self.ScratchFile('tools_mock/MODULE.bazel', ['module(name="bazel_tools")'])
self.ScratchFile('tools_mock/tools/build_defs/repo/BUILD')
self.CopyFile(
self.Rlocation('io_bazel/tools/build_defs/repo/cache.bzl'),
'tools_mock/tools/build_defs/repo/cache.bzl',
)
self.CopyFile(
self.Rlocation('io_bazel/tools/build_defs/repo/http.bzl'),
'tools_mock/tools/build_defs/repo/http.bzl',
6 changes: 6 additions & 0 deletions src/test/py/bazel/test_base.py
@@ -127,6 +127,12 @@ def setUp(self):
shared_repo_cache = os.environ.get('REPOSITORY_CACHE')
if shared_repo_cache:
f.write('common --repository_cache={}\n'.format(shared_repo_cache))
# TODO(pcloudy): Remove this flag once all dependencies are mirrored.
# See https://github.com/bazelbuild/bazel/pull/19549 for more context.
f.write(
'common'
' --repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0\n'
)
if TestBase.IsDarwin():
# For reducing SSD usage on our physical Mac machines.
f.write('common --experimental_repository_cache_hardlinks\n')
41 changes: 26 additions & 15 deletions src/test/shell/bazel/bazel_repository_cache_test.sh
@@ -221,6 +221,9 @@ function test_fetch_value_with_existing_cache_and_no_network() {
cache_entry="$repo_cache_dir/content_addressable/sha256/$sha256"
mkdir -p "$cache_entry"
cp "$repo2_zip" "$cache_entry/file" # Artifacts are named uniformly as "file" in the cache
http_archive_url="http://localhost:$nc_port/bleh"
canonical_id_hash=$(printf "$http_archive_url" | sha256sum | cut -f 1 -d ' ')
touch "$cache_entry/id-$canonical_id_hash"

# Fetch without a server
shutdown_server
@@ -271,6 +274,7 @@ EOF
# to do without checksum. But we can safely do so, as the loopback device
# is reasonably safe against man-in-the-middle attacks.
bazel fetch --repository_cache="$repo_cache_dir" \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
//zoo:breeding-program >& $TEST_log \
|| fail "expected fetch to succeed"

@@ -283,6 +287,7 @@

# As we don't have a predicted cache, we expect fetching to fail now.
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
&& fail "expected failure" || :

# However, if we add the hash, the value is taken from cache
@@ -298,6 +303,7 @@
)
EOF
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
|| fail "expected fetch to succeed"
}

@@ -465,15 +471,19 @@
expect_log "Error downloading"
}

function test_break_url() {
function test_http_archive_no_default_canonical_id() {
setup_repository

bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
bazel fetch --repository_cache="$repo_cache_dir" \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
//zoo:breeding-program >& $TEST_log \
|| echo "Expected fetch to succeed"

shutdown_server

bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
bazel fetch --repository_cache="$repo_cache_dir" \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
//zoo:breeding-program >& $TEST_log \
|| echo "Expected fetch to succeed"

# Break url in WORKSPACE
@@ -489,24 +499,27 @@
)
EOF

# By default, cache entry will still match by sha256, even if url is changed.
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
# Without the default canonical id, cache entry will still match by sha256, even if url is
# changed.
bazel fetch --repository_cache="$repo_cache_dir" \
--repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0 \
//zoo:breeding-program >& $TEST_log \
|| echo "Expected fetch to succeed"
}

function test_experimental_repository_cache_urls_as_default_canonical_id() {

function test_http_archive_urls_as_default_canonical_id() {
# TODO: Remove when the integration test setup itself no longer relies on this.
add_to_bazelrc "common --repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=1"

setup_repository

bazel fetch --repository_cache="$repo_cache_dir" \
--experimental_repository_cache_urls_as_default_canonical_id \
//zoo:breeding-program >& $TEST_log \
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
|| echo "Expected fetch to succeed"

shutdown_server

bazel fetch --repository_cache="$repo_cache_dir" \
--experimental_repository_cache_urls_as_default_canonical_id \
//zoo:breeding-program >& $TEST_log \
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
|| echo "Expected fetch to succeed"

# Break url in WORKSPACE
@@ -523,9 +536,7 @@
EOF

# As repository cache key should depend on urls, we expect fetching to fail now.
bazel fetch --repository_cache="$repo_cache_dir" \
--experimental_repository_cache_urls_as_default_canonical_id \
//zoo:breeding-program >& $TEST_log \
bazel fetch --repository_cache="$repo_cache_dir" //zoo:breeding-program >& $TEST_log \
&& fail "expected failure" || :
}

3 changes: 3 additions & 0 deletions src/test/shell/testenv.sh.tmpl
@@ -327,6 +327,9 @@ EOF
if [[ -n ${REPOSITORY_CACHE:-} ]]; then
echo "testenv.sh: Using repository cache at $REPOSITORY_CACHE."
echo "common --repository_cache=$REPOSITORY_CACHE" >> $TEST_TMPDIR/bazelrc
# TODO: Remove this flag once all dependencies are mirrored.
# See https://github.com/bazelbuild/bazel/pull/19549 for more context.
echo "common --repo_env=BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID=0" >> $TEST_TMPDIR/bazelrc
if is_darwin; then
# For reducing SSD usage on our physical Mac machines.
echo "testenv.sh: Enabling --experimental_repository_cache_hardlinks"
8 changes: 4 additions & 4 deletions src/test/tools/bzlmod/MODULE.bazel.lock


1 change: 1 addition & 0 deletions tools/build_defs/repo/BUILD
@@ -15,6 +15,7 @@ filegroup(
filegroup(
name = "http_src",
srcs = [
"cache.bzl",
"http.bzl",
"utils.bzl",
],
50 changes: 50 additions & 0 deletions tools/build_defs/repo/cache.bzl
@@ -0,0 +1,50 @@
# Copyright 2023 The Bazel Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# WARNING:
# https://github.com/bazelbuild/bazel/issues/17713
# .bzl files in this package (tools/build_defs/repo) are evaluated
# in a Starlark environment without "@_builtins" injection, and must not refer
# to symbols associated with build/workspace .bzl files

"""Returns the default canonical id to use for downloads."""

visibility("private")

DEFAULT_CANONICAL_ID_ENV = "BAZEL_HTTP_RULES_URLS_AS_DEFAULT_CANONICAL_ID"

CANONICAL_ID_DOC = """A canonical ID of the file downloaded.
If specified and non-empty, Bazel will not take the file from cache, unless it
was added to the cache by a request with the same canonical ID.
If unspecified or empty, Bazel by default uses the URLs of the file as the
canonical ID. This helps catch the common mistake of updating the URLs without
also updating the hash, resulting in builds that succeed locally but fail on
machines without the file in the cache. This behavior can be disabled with
--repo_env={env}=0.
""".format(env = DEFAULT_CANONICAL_ID_ENV)

def get_default_canonical_id(repository_ctx, urls):
"""Returns the default canonical id to use for downloads."""
if repository_ctx.os.environ.get(DEFAULT_CANONICAL_ID_ENV) == "0":
return ""

# Do not sort URLs to prevent the following scenario:
# 1. http_archive with urls = [B, A] created.
# 2. Successful fetch from B results in canonical ID "A B".
# 3. Order of urls is flipped to [A, B].
# 4. Fetch would reuse cache entry for "A B", even though A may be broken (it has never been
# fetched before).
return " ".join(urls)