Skip to content

Commit

Permalink
Fix ctx.actions.write to write as ISO 8859-1 so it is symetric with B…
Browse files Browse the repository at this point in the history
…UILD and .bzl input parsing.

Internally BUILD and .bzl files are read as uninterpreted raw byte strings and
not as UTF-8, so we should write content back out the same way.  For example,
if a BUILD file contains:

```
...
notice = "Copyright © 2019 Acme LLC",
```

Bazel will store the copyright symbol as two distinct octets ([0xc2, 0xa9]).
When writing that with an action, the same two octets should be emitted.
The behavior fixed by this change is to not interpret each character as
a standalone Unicode code point to be encoded as UTF-8.

While this change is appropriate for strings read from BUILD files, there is still a discrepancy in handling file paths with non Latin1 characters on Windows file systems.  That is described in #11602.

While this change does produce an observable behavior change, I am considering it a bug fix rather than an incompatible change because the previous behavior was both wrong and also (as described in #11602) inconsistent across different OSes.

Closes #10174

RELNOTES:
Fix behavior of ctx.actions.write so content is written without an incorrect encoding to UTF-8.
See #10174 for details.
PiperOrigin-RevId: 318056290
  • Loading branch information
aiuto authored and copybara-github committed Jun 24, 2020
1 parent 1a38fa0 commit 27e1103
Show file tree
Hide file tree
Showing 2 changed files with 107 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

package com.google.devtools.build.lib.analysis.actions;

import static java.nio.charset.StandardCharsets.UTF_8;
import static java.nio.charset.StandardCharsets.ISO_8859_1;

import com.google.common.annotations.VisibleForTesting;
import com.google.devtools.build.lib.actions.ActionExecutionContext;
Expand All @@ -39,7 +39,12 @@
/**
* Action to write a file whose contents are known at analysis time.
*
* <p>The output is always UTF-8 encoded.
* <p>The output file is generally encoded as UTF-8, but by an unusual path. BUILD files and
* directory entries, which are actually UTF-8, are misinterpreted by Bazel as Latin1, so that most
* Strings within the build language use this unusual representation. FileWriteAction writes those
* Strings out again as Latin1.
*
* <p>TODO(b/146554973): Change this implementation when that is addressed.
*
* <p>TODO(bazel-team): Choose a better name to distinguish this class from {@link
* BinaryFileWriteAction}.
Expand Down Expand Up @@ -171,7 +176,7 @@ private static final class CompressedString extends LazyString {
final int uncompressedSize;

CompressedString(String chars) {
byte[] dataToCompress = chars.getBytes(UTF_8);
byte[] dataToCompress = chars.getBytes(ISO_8859_1);
ByteArrayOutputStream byteStream = new ByteArrayOutputStream(dataToCompress.length);
try (GZIPOutputStream zipStream = new GZIPOutputStream(byteStream)) {
zipStream.write(dataToCompress);
Expand Down Expand Up @@ -202,7 +207,7 @@ public String toString() {
// This should be impossible since we're reading from a byte array.
throw new RuntimeException(e);
}
return new String(uncompressedBytes, UTF_8);
return new String(uncompressedBytes, ISO_8859_1);
}
}

Expand All @@ -216,6 +221,11 @@ boolean usesCompression() {
*
* <p>Note that if the string is lazily computed or compressed, calling this method will force its
* computation or decompression. No attempt is made by FileWriteAction to cache the result.
*
* <p>Note that the content is a not a normal Java String. When Bazel parses BUILD files, it
* misinterprets the bytes as Latin1, so a code point with a 3-byte UTF-8 encoding will take 3
* chars internally. To reverse this process, you must encode this string as Latin1, giving you
* back the correct UTF-8 encoding of the original input.
*/
public String getFileContents() {
return fileContents.toString();
Expand All @@ -235,7 +245,7 @@ public DeterministicWriter newDeterministicWriter(ActionExecutionContext ctx) {
return new DeterministicWriter() {
@Override
public void writeOutputFile(OutputStream out) throws IOException {
out.write(getFileContents().getBytes(UTF_8));
out.write(getFileContents().getBytes(ISO_8859_1));
}
};
}
Expand Down
92 changes: 92 additions & 0 deletions src/test/shell/integration/loading_phase_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -498,4 +498,96 @@ EOF
done
}

# Test that actions.write correctly emits a UTF-8 encoded attribute value as
# UTF-8.
function test_actions_write_utf8_attribute() {
local -r pkg="${FUNCNAME}"
mkdir -p "$pkg" || fail "could not create \"$pkg\""

cat >"${pkg}/def.bzl" <<'EOF'
def _write_attribute_impl(ctx):
ctx.actions.write(
output = ctx.outputs.out,
# adding a NL at the end to make the diff below easier
content = ctx.attr.text + '\n',
)
return []
write_attribute = rule(
implementation = _write_attribute_impl,
attrs = {
"text": attr.string(),
"out": attr.output(),
},
)
EOF

cat >"${pkg}/BUILD" <<'EOF'
load(":def.bzl", "write_attribute")
write_attribute(
name = "text_with_non_latin1_chars",
# U+41, U+2117, U+4E16, U+1F63F (1,2,3,4-byte UTF-8 encodings), 10 bytes.
text = "A©世😿",
out = "out",
)
EOF
bazel build "${pkg}:text_with_non_latin1_chars" || fail "Expected build to succeed"
diff $(bazel info "${PRODUCT_NAME}-bin")/$pkg/out <(echo 'A©世😿') || fail 'diff failed'
}

# Test that actions.write emits a file name containing non-Latin1 characters as
# a UTF-8 encoded string.
function test_actions_write_not_latin1_path() {
# TODO(https://github.com/bazelbuild/bazel/issues/11602): Enable after that is fixed.
if $is_windows ; then
echo 'Skipping test_actions_write_not_latin1_path on Windows. See #11602'
return
fi

local -r pkg="${FUNCNAME}"
mkdir -p "$pkg" || fail "could not create \"$pkg\""

filename='A©世😿.file' # see above for an explanation.
echo hello >"${pkg}/${filename}"

cat >"${pkg}/def.bzl" <<'EOF'
def _write_paths_impl(ctx):
# srcs is a list, but we only expect one entry.
if len(ctx.attr.srcs) != 1:
fail('expected exactly 1 file for srcs. got %d' % len(ctx.attr.srcs))
file_name = ctx.attr.srcs[0].label.name
ctx.actions.write(
output = ctx.outputs.out,
content = file_name,
)
return []
write_paths = rule(
implementation = _write_paths_impl,
attrs = {
"srcs": attr.label_list(allow_files=True),
"out": attr.output(),
},
)
EOF

cat >"${pkg}/BUILD" <<'EOF'
load(":def.bzl", "write_paths")
write_paths(
name = "path_with_non_latin1",
# Use a glob to ensure that the value is read from the file system and not
# out of BUILD.
srcs = glob(["*.file"]),
out = "paths.txt",
)
EOF

bazel build "${pkg}:path_with_non_latin1" >output 2>&1 || (
echo '== build output'
cat output
fail "Expected build to succeed"
)
assert_contains "^${filename}$" $(bazel info "${PRODUCT_NAME}-bin")/$pkg/paths.txt
}

run_suite "Integration tests of ${PRODUCT_NAME} using loading/analysis phases."

0 comments on commit 27e1103

Please sign in to comment.