diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index d3bb28c2aee65..49d16a7b5290f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -495,6 +495,7 @@ Other Enhancements - :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it rather than inserting row by row. ``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`) - :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`) +- :func:`read_html` now reads all ``<tbody>`` elements in a ``<table>``, not just the first. (:issue:`20690`) - :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`) - zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`) - :class:`pandas.tseries.api.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`). diff --git a/pandas/io/html.py b/pandas/io/html.py index ba5da1b4e3a76..8fd876e85889f 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -324,7 +324,7 @@ def _parse_thead(self, table): raise com.AbstractMethodError(self) def _parse_tbody(self, table): - """Return the body of the table. + """Return the list of tbody elements from the parsed table element. Parameters ---------- @@ -333,8 +333,8 @@ def _parse_tbody(self, table): Returns ------- - tbody : node-like - A <tbody>...</tbody> element. + tbodies : list of node-like + A list of <tbody>...</tbody>
elements """ raise com.AbstractMethodError(self) @@ -388,13 +388,17 @@ def _parse_raw_tfoot(self, table): np.array(res).squeeze()) if res and len(res) == 1 else res def _parse_raw_tbody(self, table): - tbody = self._parse_tbody(table) + tbodies = self._parse_tbody(table) - try: - res = self._parse_tr(tbody[0]) - except IndexError: - res = self._parse_tr(table) - return self._parse_raw_data(res) + raw_data = [] + + if tbodies: + for tbody in tbodies: + raw_data.extend(self._parse_tr(tbody)) + else: + raw_data.extend(self._parse_tr(table)) + + return self._parse_raw_data(raw_data) def _handle_hidden_tables(self, tbl_list, attr_name): """Returns list of tables, potentially removing hidden elements diff --git a/pandas/tests/io/test_html.py b/pandas/tests/io/test_html.py index 078b5f8448d46..a56946b82b027 100644 --- a/pandas/tests/io/test_html.py +++ b/pandas/tests/io/test_html.py @@ -396,6 +396,33 @@ def test_empty_tables(self): res2 = self.read_html(StringIO(data2)) assert_framelist_equal(res1, res2) + def test_multiple_tbody(self): + # GH-20690 + # Read all tbody tags within a single table. + data = '''<table> + <thead> + <tr> + <th>A</th> + <th>B</th> + </tr> + </thead> + <tbody> + <tr> + <td>1</td> + <td>2</td> + </tr> + </tbody> + <tbody> + <tr> + <td>3</td> + <td>4</td> + </tr> + </tbody> + </table>''' + expected = DataFrame({'A': [1, 3], 'B': [2, 4]}) + result = self.read_html(StringIO(data))[0] + tm.assert_frame_equal(result, expected) + def test_header_and_one_column(self): """ Don't fail with bs4 when there is a header and only one column