-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Limit repr of arrays containing long strings #3900
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,6 +4,7 @@ | |
import functools | ||
from datetime import datetime, timedelta | ||
from itertools import zip_longest | ||
from typing import Hashable | ||
|
||
import numpy as np | ||
import pandas as pd | ||
|
@@ -14,7 +15,7 @@ | |
from .pycompat import dask_array_type, sparse_array_type | ||
|
||
|
||
def pretty_print(x, numchars): | ||
def pretty_print(x, numchars: int): | ||
"""Given an object `x`, call `str(x)` and format the returned string so | ||
that it is numchars long, padding with trailing spaces or truncating with | ||
ellipses as necessary | ||
|
@@ -163,7 +164,7 @@ def format_items(x): | |
return formatted | ||
|
||
|
||
def format_array_flat(array, max_width): | ||
def format_array_flat(array, max_width: int): | ||
"""Return a formatted string for as many items in the flattened version of | ||
array that will fit within max_width characters. | ||
""" | ||
|
@@ -198,11 +199,20 @@ def format_array_flat(array, max_width): | |
num_back = count - num_front | ||
# note that num_back is 0 <--> array.size is 0 or 1 | ||
# <--> relevant_back_items is [] | ||
pprint_str = ( | ||
" ".join(relevant_front_items[:num_front]) | ||
+ padding | ||
+ " ".join(relevant_back_items[-num_back:]) | ||
pprint_str = "".join( | ||
[ | ||
" ".join(relevant_front_items[:num_front]), | ||
padding, | ||
" ".join(relevant_back_items[-num_back:]), | ||
] | ||
) | ||
|
||
# As a final check, if it's still too long even with the limit in values, | ||
# replace the end with an ellipsis | ||
# NB: this will still returns a full 3-character ellipsis when max_width < 3 | ||
if len(pprint_str) > max_width: | ||
pprint_str = pprint_str[: max(max_width - 3, 0)] + "..." | ||
|
||
return pprint_str | ||
|
||
|
||
|
@@ -258,10 +268,16 @@ def inline_variable_array_repr(var, max_width): | |
return "..." | ||
|
||
|
||
def summarize_variable(name, var, col_width, marker=" ", max_width=None): | ||
def summarize_variable( | ||
name: Hashable, var, col_width: int, marker: str = " ", max_width: int = None | ||
): | ||
"""Summarize a variable in one line, e.g., for the Dataset.__repr__.""" | ||
if max_width is None: | ||
max_width = OPTIONS["display_width"] | ||
max_width_options = OPTIONS["display_width"] | ||
if not isinstance(max_width_options, int): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this a reasonable way to enforce types for objects coming from dicts? Or too much cruft and we should just use type hints? |
||
raise TypeError(f"`max_width` value of `{max_width}` is not a valid int") | ||
else: | ||
max_width = max_width_options | ||
first_col = pretty_print(f" {marker} {name} ", col_width) | ||
if var.dims: | ||
dims_str = "({}) ".format(", ".join(map(str, var.dims))) | ||
|
@@ -295,7 +311,7 @@ def summarize_datavar(name, var, col_width): | |
return summarize_variable(name, var.variable, col_width) | ||
|
||
|
||
def summarize_coord(name, var, col_width): | ||
def summarize_coord(name: Hashable, var, col_width: int): | ||
is_index = name in var.dims | ||
marker = "*" if is_index else " " | ||
if is_index: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -115,7 +115,7 @@ def test_format_items(self): | |
|
||
def test_format_array_flat(self): | ||
actual = formatting.format_array_flat(np.arange(100), 2) | ||
expected = "0 ... 99" | ||
expected = "..." | ||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(100), 9) | ||
|
@@ -134,11 +134,13 @@ def test_format_array_flat(self): | |
expected = "0 1 2 ... 98 99" | ||
assert expected == actual | ||
|
||
# NB: Probably not ideal; an alternative would be cutting after the | ||
# first ellipsis | ||
actual = formatting.format_array_flat(np.arange(100.0), 11) | ||
expected = "0.0 ... 99.0" | ||
expected = "0.0 ... ..." | ||
Comment on lines
+137
to
+140
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd cut after the first ellipsis, two of them in a row look strange to me. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, but what's the rule? Backtrack and see whether the previous values were There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. something like that, yes: if len(pprint_str) > max_width:
pprint_str = pprint_str[: max(max_width - 3, 0)].rstrip()
if not pprint_str.endswith("..."):
pprint_str = pprint_str + "..." Does that seem reasonable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Something like this is very reasonable, for sure. Just a tradeoff of complexity vs. results. For example, this case wouldn't change as a result of the function's change, because it ends with a space... We could add that logic in though There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. true, but we could also run My suggestion might result in something like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes 100%, I agree that's better. I'm probably a bit less motivated to make these stat as I feel like the egregious tails have been limited, which was the pressing issue. But can come back to it in the future if no one else takes it up. |
||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(100.0), 1) | ||
actual = formatting.format_array_flat(np.arange(100.0), 12) | ||
expected = "0.0 ... 99.0" | ||
assert expected == actual | ||
|
||
|
@@ -154,16 +156,25 @@ def test_format_array_flat(self): | |
expected = "" | ||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(1), 0) | ||
actual = formatting.format_array_flat(np.arange(1), 1) | ||
expected = "0" | ||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(2), 0) | ||
actual = formatting.format_array_flat(np.arange(2), 3) | ||
expected = "0 1" | ||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(4), 0) | ||
expected = "0 ... 3" | ||
actual = formatting.format_array_flat(np.arange(4), 7) | ||
expected = "0 1 2 3" | ||
assert expected == actual | ||
|
||
actual = formatting.format_array_flat(np.arange(5), 7) | ||
expected = "0 ... 4" | ||
assert expected == actual | ||
|
||
long_str = [" ".join(["hello world" for _ in range(100)])] | ||
actual = formatting.format_array_flat(np.asarray([long_str]), 21) | ||
expected = "'hello world hello..." | ||
assert expected == actual | ||
|
||
def test_pretty_print(self): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can't seem to accept this, maybe because the PR is closed...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, we'd need a new PR for that.