Skip to content

Commit

Permalink
Read from multiple <tbody> within a <table> (#20891)
Browse files Browse the repository at this point in the history
* Read from multiple <tbody> within a <table>

refs #20690
  • Loading branch information
adamhooper authored and TomAugspurger committed May 1, 2018
1 parent 7b683f4 commit 926f241
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 9 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.23.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -495,6 +495,7 @@ Other Enhancements
- :meth:`DataFrame.to_sql` now performs a multivalue insert if the underlying connection supports it, rather than inserting row by row.
``SQLAlchemy`` dialects supporting multivalue inserts include: ``mysql``, ``postgresql``, ``sqlite`` and any dialect with ``supports_multivalues_insert``. (:issue:`14315`, :issue:`8953`)
- :func:`read_html` now accepts a ``displayed_only`` keyword argument to control whether or not hidden elements are parsed (``True`` by default) (:issue:`20027`)
- :func:`read_html` now reads all ``<tbody>`` elements in a ``<table>``, not just the first. (:issue:`20690`)
- :meth:`~pandas.core.window.Rolling.quantile` and :meth:`~pandas.core.window.Expanding.quantile` now accept the ``interpolation`` keyword, ``linear`` by default (:issue:`20497`)
- zip compression is supported via ``compression=zip`` in :func:`DataFrame.to_pickle`, :func:`Series.to_pickle`, :func:`DataFrame.to_csv`, :func:`Series.to_csv`, :func:`DataFrame.to_json`, :func:`Series.to_json`. (:issue:`17778`)
- :class:`pandas.tseries.api.offsets.WeekOfMonth` constructor now supports ``n=0`` (:issue:`20517`).
Expand Down
22 changes: 13 additions & 9 deletions pandas/io/html.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,7 +324,7 @@ def _parse_thead(self, table):
raise com.AbstractMethodError(self)

def _parse_tbody(self, table):
"""Return the body of the table.
"""Return the list of tbody elements from the parsed table element.
Parameters
----------
Expand All @@ -333,8 +333,8 @@ def _parse_tbody(self, table):
Returns
-------
tbody : node-like
A <tbody>...</tbody> element.
tbodys : list of node-like
A list of <tbody>...</tbody> elements
"""
raise com.AbstractMethodError(self)

Expand Down Expand Up @@ -388,13 +388,17 @@ def _parse_raw_tfoot(self, table):
np.array(res).squeeze()) if res and len(res) == 1 else res

def _parse_raw_tbody(self, table):
tbody = self._parse_tbody(table)
tbodies = self._parse_tbody(table)

try:
res = self._parse_tr(tbody[0])
except IndexError:
res = self._parse_tr(table)
return self._parse_raw_data(res)
raw_data = []

if tbodies:
for tbody in tbodies:
raw_data.extend(self._parse_tr(tbody))
else:
raw_data.extend(self._parse_tr(table))

return self._parse_raw_data(raw_data)

def _handle_hidden_tables(self, tbl_list, attr_name):
"""Returns list of tables, potentially removing hidden elements
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/test_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,33 @@ def test_empty_tables(self):
res2 = self.read_html(StringIO(data2))
assert_framelist_equal(res1, res2)

def test_multiple_tbody(self):
# GH-20690
# Read all tbody tags within a single table.
data = '''<table>
<thead>
<tr>
<th>A</th>
<th>B</th>
</tr>
</thead>
<tbody>
<tr>
<td>1</td>
<td>2</td>
</tr>
</tbody>
<tbody>
<tr>
<td>3</td>
<td>4</td>
</tr>
</tbody>
</table>'''
expected = DataFrame({'A': [1, 3], 'B': [2, 4]})
result = self.read_html(StringIO(data))[0]
tm.assert_frame_equal(result, expected)

def test_header_and_one_column(self):
"""
Don't fail with bs4 when there is a header and only one column
Expand Down

0 comments on commit 926f241

Please sign in to comment.