Skip to content

Commit

Permalink
Merge branch 'pandas-dev:main' into Fix#58748
Browse files Browse the repository at this point in the history
  • Loading branch information
SiddheshBangar authored May 28, 2024
2 parents 520fafe + b162331 commit 181a1d1
Show file tree
Hide file tree
Showing 12 changed files with 94 additions and 50 deletions.
10 changes: 8 additions & 2 deletions doc/source/development/maintaining.rst
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ Here's a typical workflow for triaging a newly opened issue.
example. See https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports
for a good explanation. If the example is not reproducible, or if it's
*clearly* not minimal, feel free to ask the reporter if they can provide
an example or simplify the provided one. Do acknowledge that writing
an example or simplify the provided one. Do acknowledge that writing
minimal reproducible examples is hard work. If the reporter is struggling,
you can try to write one yourself and we'll edit the original post to include it.

Expand All @@ -93,6 +93,9 @@ Here's a typical workflow for triaging a newly opened issue.
If a reproducible example is provided, but you see a simplification,
edit the original post with your simpler reproducible example.

If this is a regression report, post the result of a ``git bisect`` run.
More info on this can be found in the :ref:`maintaining.regressions` section.

Ensure the issue exists on the main branch and that it has the "Needs Triage" tag
until all steps have been completed. Add a comment to the issue once you have
verified it exists on the main branch, so others know it has been confirmed.
Expand Down Expand Up @@ -125,7 +128,10 @@ Here's a typical workflow for triaging a newly opened issue.
If the issue is clearly defined and the fix seems relatively straightforward,
label the issue as "Good first issue".

Once you have completed the above, make sure to remove the "Needs Triage" label.
If the issue is a regression report, add the "Regression" label and the next patch
release milestone.

Once you have completed the above, make sure to remove the "Needs Triage" label.

.. _maintaining.regressions:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,7 @@ Missing
MultiIndex
^^^^^^^^^^
- :func:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
- :meth:`DataFrame.melt` would not accept multiple names in ``var_name`` when the columns were a :class:`MultiIndex` (:issue:`58033`)
-

I/O
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -664,7 +664,7 @@ def _apply_str(self, obj, func: str, *args, **kwargs):
# people may aggregate on a non-callable attribute
# but don't let them think they can pass args to it
assert len(args) == 0
assert len([kwarg for kwarg in kwargs if kwarg not in ["axis"]]) == 0
assert not any(kwarg == "axis" for kwarg in kwargs)
return f
elif hasattr(np, func) and hasattr(obj, "__array__"):
# in particular exclude Window
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/computation/eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,8 @@ def eval(
corresponding bitwise operators. :class:`~pandas.Series` and
:class:`~pandas.DataFrame` objects are supported and behave as they would
with plain ol' Python evaluation.
`eval` can run arbitrary code which can make you vulnerable to code
injection if you pass user input to this function.
Parameters
----------
Expand Down
3 changes: 3 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4472,6 +4472,9 @@ def query(self, expr: str, *, inplace: bool = False, **kwargs) -> DataFrame | No
"""
Query the columns of a DataFrame with a boolean expression.
This method can run arbitrary code which can make you vulnerable to code
injection if you pass user input to this function.
Parameters
----------
expr : str
Expand Down
12 changes: 9 additions & 3 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1750,19 +1750,25 @@ def _get_label_or_level_values(self, key: Level, axis: AxisInt = 0) -> ArrayLike
if `key` matches multiple labels
"""
axis = self._get_axis_number(axis)
other_axes = [ax for ax in range(self._AXIS_LEN) if ax != axis]
first_other_axes = next(
(ax for ax in range(self._AXIS_LEN) if ax != axis), None
)

if self._is_label_reference(key, axis=axis):
self._check_label_or_level_ambiguity(key, axis=axis)
values = self.xs(key, axis=other_axes[0])._values
if first_other_axes is None:
raise ValueError("axis matched all axes")
values = self.xs(key, axis=first_other_axes)._values
elif self._is_level_reference(key, axis=axis):
values = self.axes[axis].get_level_values(key)._values
else:
raise KeyError(key)

# Check for duplicates
if values.ndim > 1:
if other_axes and isinstance(self._get_axis(other_axes[0]), MultiIndex):
if first_other_axes is not None and isinstance(
self._get_axis(first_other_axes), MultiIndex
):
multi_message = (
"\n"
"For a multi-index, the label must be a "
Expand Down
21 changes: 17 additions & 4 deletions pandas/core/reshape/melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@

import numpy as np

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.common import (
is_iterator,
is_list_like,
)
from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.missing import notna

Expand Down Expand Up @@ -64,9 +67,10 @@ def melt(
value_vars : scalar, tuple, list, or ndarray, optional
Column(s) to unpivot. If not specified, uses all columns that
are not set as `id_vars`.
var_name : scalar, default None
var_name : scalar, tuple, list, or ndarray, optional
Name to use for the 'variable' column. If None it uses
``frame.columns.name`` or 'variable'.
``frame.columns.name`` or 'variable'. Must be a scalar if the columns are
not a MultiIndex.
value_name : scalar, default 'value'
Name to use for the 'value' column, can't be an existing column label.
col_level : scalar, optional
Expand Down Expand Up @@ -217,7 +221,16 @@ def melt(
frame.columns.name if frame.columns.name is not None else "variable"
]
elif is_list_like(var_name):
raise ValueError(f"{var_name=} must be a scalar.")
if isinstance(frame.columns, MultiIndex):
if is_iterator(var_name):
var_name = list(var_name)
if len(var_name) > len(frame.columns):
raise ValueError(
f"{var_name=} has {len(var_name)} items, "
f"but the dataframe columns only have {len(frame.columns)} levels."
)
else:
raise ValueError(f"{var_name=} must be a scalar.")
else:
var_name = [var_name]

Expand Down
19 changes: 9 additions & 10 deletions pandas/io/excel/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -857,24 +857,23 @@ def _parse_sheet(
# a row containing just the index name(s)
has_index_names = False
if is_list_header and not is_len_one_list_header and index_col is not None:
index_col_list: Sequence[int]
index_col_set: set[int]
if isinstance(index_col, int):
index_col_list = [index_col]
index_col_set = {index_col}
else:
assert isinstance(index_col, Sequence)
index_col_list = index_col
index_col_set = set(index_col)

# We have to handle mi without names. If any of the entries in the data
# columns are not empty, this is a regular row
assert isinstance(header, Sequence)
if len(header) < len(data):
potential_index_names = data[len(header)]
potential_data = [
x
has_index_names = all(
x == "" or x is None
for i, x in enumerate(potential_index_names)
if not control_row[i] and i not in index_col_list
]
has_index_names = all(x == "" or x is None for x in potential_data)
if not control_row[i] and i not in index_col_set
)

if is_list_like(index_col):
# Forward fill values for MultiIndex index.
Expand Down Expand Up @@ -1457,9 +1456,9 @@ def inspect_excel_format(
with zipfile.ZipFile(stream) as zf:
# Workaround for some third party files that use forward slashes and
# lower case names.
component_names = [
component_names = {
name.replace("\\", "/").lower() for name in zf.namelist()
]
}

if "xl/workbook.xml" in component_names:
return "xlsx"
Expand Down
36 changes: 16 additions & 20 deletions pandas/io/excel/_odfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,29 +122,25 @@ def get_sheet_data(
table: list[list[Scalar | NaTType]] = []

for sheet_row in sheet_rows:
sheet_cells = [
x
for x in sheet_row.childNodes
if hasattr(x, "qname") and x.qname in cell_names
]
empty_cells = 0
table_row: list[Scalar | NaTType] = []

for sheet_cell in sheet_cells:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)
for sheet_cell in sheet_row.childNodes:
if hasattr(sheet_cell, "qname") and sheet_cell.qname in cell_names:
if sheet_cell.qname == table_cell_name:
value = self._get_cell_value(sheet_cell)
else:
value = self.empty_value

column_repeat = self._get_column_repeat(sheet_cell)

# Queue up empty values, writing only if content succeeds them
if value == self.empty_value:
empty_cells += column_repeat
else:
table_row.extend([self.empty_value] * empty_cells)
empty_cells = 0
table_row.extend([value] * column_repeat)

if max_row_len < len(table_row):
max_row_len = len(table_row)
Expand Down
11 changes: 4 additions & 7 deletions pandas/io/excel/_xlrd.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,13 @@ def _parse_cell(cell_contents, cell_typ):
cell_contents = val
return cell_contents

data = []

nrows = sheet.nrows
if file_rows_needed is not None:
nrows = min(nrows, file_rows_needed)
for i in range(nrows):
row = [
return [
[
_parse_cell(value, typ)
for value, typ in zip(sheet.row_values(i), sheet.row_types(i))
]
data.append(row)

return data
for i in range(nrows)
]
7 changes: 4 additions & 3 deletions pandas/io/sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ def _convert_arrays_to_dataframe(
dtype_backend: DtypeBackend | Literal["numpy"] = "numpy",
) -> DataFrame:
content = lib.to_object_array_tuples(data)
idx_len = content.shape[0]
arrays = convert_object_array(
list(content.T),
dtype=None,
Expand All @@ -177,9 +178,9 @@ def _convert_arrays_to_dataframe(
result_arrays.append(ArrowExtensionArray(pa_array))
arrays = result_arrays # type: ignore[assignment]
if arrays:
df = DataFrame(dict(zip(range(len(columns)), arrays)))
df.columns = columns
return df
return DataFrame._from_arrays(
arrays, columns=columns, index=range(idx_len), verify_integrity=False
)
else:
return DataFrame(columns=columns)

Expand Down
20 changes: 20 additions & 0 deletions pandas/tests/reshape/test_melt.py
Original file line number Diff line number Diff line change
Expand Up @@ -533,6 +533,26 @@ def test_melt_non_scalar_var_name_raises(self):
with pytest.raises(ValueError, match=r".* must be a scalar."):
df.melt(id_vars=["a"], var_name=[1, 2])

def test_melt_multiindex_columns_var_name(self):
    # GH 58033: melt accepts a list-like var_name when the columns
    # are a MultiIndex, producing one name column per listed level.
    frame = DataFrame({("A", "a"): [1], ("A", "b"): [2]})

    expected = DataFrame(
        [("A", "a", 1), ("A", "b", 2)], columns=["first", "second", "value"]
    )

    # Naming every level yields one column per level plus the value column.
    result_full = frame.melt(var_name=["first", "second"])
    tm.assert_frame_equal(result_full, expected)

    # Naming only the first level keeps just that level's column.
    result_partial = frame.melt(var_name=["first"])
    tm.assert_frame_equal(result_partial, expected[["first", "value"]])

def test_melt_multiindex_columns_var_name_too_many(self):
    # GH 58033: supplying more var_name entries than the columns
    # MultiIndex has levels must raise a ValueError.
    frame = DataFrame({("A", "a"): [1], ("A", "b"): [2]})

    msg = "but the dataframe columns only have 2 levels"
    with pytest.raises(ValueError, match=msg):
        frame.melt(var_name=["first", "second", "third"])


class TestLreshape:
def test_pairs(self):
Expand Down

0 comments on commit 181a1d1

Please sign in to comment.