From f6309d93daf6df0a49ed27f848131b5685e38bbc Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 13 Jan 2019 11:13:05 +0000 Subject: [PATCH 1/6] Split test_excel.py (#24472) --- pandas/tests/io/excel/__init__.py | 0 pandas/tests/io/excel/base.py | 2314 +++++++++++++++++++++++++++++ pandas/tests/io/excel/xlrd.py | 352 +++++ 3 files changed, 2666 insertions(+) create mode 100644 pandas/tests/io/excel/__init__.py create mode 100644 pandas/tests/io/excel/base.py create mode 100644 pandas/tests/io/excel/xlrd.py diff --git a/pandas/tests/io/excel/__init__.py b/pandas/tests/io/excel/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/pandas/tests/io/excel/base.py b/pandas/tests/io/excel/base.py new file mode 100644 index 0000000000000..636855711100b --- /dev/null +++ b/pandas/tests/io/excel/base.py @@ -0,0 +1,2314 @@ +from collections import OrderedDict +from datetime import date, datetime, time, timedelta +from distutils.version import LooseVersion +from functools import partial +import os +import warnings +from warnings import catch_warnings + +import numpy as np +from numpy import nan +import pytest + +from pandas.compat import PY36, BytesIO, iteritems, map, range, u +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +from pandas.core.config import get_option, set_option +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf + +from pandas.io.common import URLError +from pandas.io.excel import ( + ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, + read_excel, register_writer) +from pandas.io.formats.excel import ExcelFormatter +from pandas.io.parsers import read_csv + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() +_frame = DataFrame(_seriesd)[:10] +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] +_tsframe = tm.makeTimeDataFrame()[:5] +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + + +class SharedItems(object): + + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def get_csv_refdf(self, basename): + """ + Obtain the reference data from read_csv with the Python engine. + + Parameters + ---------- + + basename : str + File base name, excluding file extension. + + Returns + ------- + + dfref : DataFrame + """ + pref = os.path.join(self.dirpath, basename + '.csv') + dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') + return dfref + + +class ReadingTestsBase(SharedItems): + # This is based on ExcelWriterBase + + @pytest.fixture(autouse=True, params=[None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + + @pytest.mark.parametrize("usecols", [ + [0, 1, 3], [0, 3, 1], + [1, 0, 3], [1, 3, 0], + [3, 0, 1], [3, 1, 0], + ]) + + def test_usecols_diff_positional_int_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["A", "C"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.parametrize("usecols", [ + ["B", "D"], ["D", "B"] + ]) + + def test_usecols_diff_positional_str_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["B", "D"]] + expected.index = range(len(expected)) + + result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_read_excel_without_slicing(self, ext): + expected = self.get_csv_refdf("test1") + result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str(self, ext): + expected = self.get_csv_refdf("test1")[["C", "D"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols="A,D:E") + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str_invalid(self, ext): + msg = "Invalid column name: E1" + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, ext): + msg = "list indices must be integers.*, not str" + + with pytest.raises(TypeError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], + usecols=["A", "C"]) + + def test_index_col_empty(self, ext): + # see gh-9208 + result = self.get_exceldf("test1", ext, "Sheet3", + index_col=["A", "B", "C"]) + expected = DataFrame(columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, + codes=[[]] * 3, + names=["A", "B", "C"])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_col", [None, 2]) + def test_index_col_with_unnamed(self, ext, index_col): + # see gh-18792 + result = self.get_exceldf("test1", ext, "Sheet4", + index_col=index_col) + expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], + columns=["Unnamed: 0", "col1", "col2"]) + if index_col: + expected = expected.set_index(expected.columns[index_col]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_pass_non_existent_column(self, ext): + msg = ("Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E"]) + + def test_usecols_wrong_type(self, ext): + msg = ("'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable.") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E1", 0]) + + def test_excel_stop_iterator(self, ext): + + parsed = self.get_exceldf('test2', ext, 'Sheet1') + expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self, ext): + + parsed = self.get_exceldf('test3', ext, 'Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_passes_na(self, ext): + + excel = self.get_excelfile('test4', ext) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + # 13967 + excel = self.get_excelfile('test5', ext) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_table(self, ext): + + dfref = self.get_csv_refdf('test1') + + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0) + # TODO add index to file + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_reader_special_dtypes(self, ext): + + expected = DataFrame.from_dict(OrderedDict([ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), + datetime(1905, 1, 1), datetime(2013, 12, 14), + datetime(2015, 3, 14)]) + ])) + basename = 'test_types' + + # should read in correctly and infer types + actual = self.get_exceldf(basename, ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected["IntCol"] = float_expected["IntCol"].astype(float) + float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) + tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + + # convert_float and converters should be different but both accepted + expected["StrCol"] = expected["StrCol"].apply(str) + actual = self.get_exceldf( + basename, ext, 'Sheet1', converters={"StrCol": str}) + tm.assert_frame_equal(actual, expected) + + no_convert_float = float_expected.copy() + no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, + converters={"StrCol": str}) + tm.assert_frame_equal(actual, no_convert_float) + + # GH8212 - support for converters and missing values + def test_reader_converters(self, ext): + + basename = 'test_converters' + + expected = DataFrame.from_dict(OrderedDict([ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), + ("StrCol", ['1', np.nan, '3', '4', '5']), + ])) + + converters = {'IntCol': lambda x: int(x) if x != '' else -1000, + 'FloatCol': lambda x: 10 * x if x else np.nan, + 2: lambda x: 'Found' if x != '' else 'Not found', + 3: lambda x: str(x) if x else '', + } + + # should read in correctly and set types of single cells (not array + # dtypes) + actual = self.get_exceldf(basename, ext, 'Sheet1', + converters=converters) + tm.assert_frame_equal(actual, expected) + + def test_reader_dtype(self, ext): + # GH 8212 + basename = 'testdtype' + actual = self.get_exceldf(basename, ext) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, ext, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + tm.assert_frame_equal(actual, expected) + + with pytest.raises(ValueError): + self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + + @pytest.mark.parametrize("dtype,expected", [ + (None, + DataFrame({ + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0] + })), + ({"a": "float64", + "b": "float32", + "c": str, + "d": str + }, + DataFrame({ + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"] + })), + ]) + def test_reader_dtype_str(self, ext, dtype, expected): + # see gh-20377 + basename = "testdtype" + + actual = self.get_exceldf(basename, ext, dtype=dtype) + tm.assert_frame_equal(actual, expected) + + def test_reading_all_sheets(self, ext): + # Test reading all sheetnames by setting sheetname to None, + # Ensure a dict is returned. + # See PR #9450 + basename = 'test_multisheet' + dfs = self.get_exceldf(basename, ext, sheet_name=None) + # ensure this is not alphabetical to test order preservation + expected_keys = ['Charlie', 'Alpha', 'Beta'] + tm.assert_contains_all(expected_keys, dfs.keys()) + # Issue 9930 + # Ensure sheet order is preserved + assert expected_keys == list(dfs.keys()) + + def test_reading_multiple_specific_sheets(self, ext): + # Test reading specific sheetnames by specifying a mixed list + # of integers and strings, and confirm that duplicated sheet + # references (positions/names) are removed properly. + # Ensure a dict is returned + # See PR #9450 + basename = 'test_multisheet' + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, 'Charlie', 'Charlie'] + dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) + expected_keys = list(set(expected_keys)) + tm.assert_contains_all(expected_keys, dfs.keys()) + assert len(expected_keys) == len(dfs.keys()) + + def test_reading_all_sheets_with_blank(self, ext): + # Test reading all sheetnames by setting sheetname to None, + # In the case where some sheets are blank. + # Issue #11711 + basename = 'blank_with_header' + dfs = self.get_exceldf(basename, ext, sheet_name=None) + expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] + tm.assert_contains_all(expected_keys, dfs.keys()) + + # GH6403 + def test_read_excel_blank(self, ext): + actual = self.get_exceldf('blank', ext, 'Sheet1') + tm.assert_frame_equal(actual, DataFrame()) + + def test_read_excel_blank_with_header(self, ext): + expected = DataFrame(columns=['col_1', 'col_2']) + actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, 'with_header', index=False, header=True) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], + [3, 'baz']], columns=['a', 'b']) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, 'Data_no_head', + header=False, index=False) + refdf.to_excel(writer, 'Data_with_head', index=False) + + refdf.columns = ['A', 'B'] + + with ExcelFile(pth) as reader: + xlsdf_no_head = read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = read_excel(reader, 'Data_with_head', + index_col=None, names=['A', 'B']) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + def test_date_conversion_overflow(self, ext): + # GH 10001 : pandas.ExcelFile ignore parse_dates=False + expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], + [pd.Timestamp('2016-03-16'), 'Jack Black'], + [1e+20, 'Timothy Brown']], + columns=['DateColWithBigInt', 'StringCol']) + + result = self.get_exceldf('testdateoverflow', ext) + tm.assert_frame_equal(result, expected) + + def test_sheet_name_both_raises(self, ext): + with pytest.raises(TypeError, match="Cannot specify both"): + self.get_exceldf('test1', ext, sheetname='Sheet1', + sheet_name='Sheet1') + + excel = self.get_excelfile('test1', ext) + with pytest.raises(TypeError, match="Cannot specify both"): + excel.parse(sheetname='Sheet1', + sheet_name='Sheet1') + + def test_excel_read_buffer(self, ext): + + pth = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(pth, 'Sheet1', index_col=0) + with open(pth, 'rb') as f: + actual = read_excel(f, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) + + with open(pth, 'rb') as f: + xls = ExcelFile(f) + actual = read_excel(xls, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) + + def test_bad_engine_raises(self, ext): + bad_engine = 'foo' + with pytest.raises(ValueError, message="Unknown engine: foo"): + read_excel('', engine=bad_engine) + + @tm.network + def test_read_from_http_url(self, ext): + url = ('https://raw.github.com/pandas-dev/pandas/master/' + 'pandas/tests/io/data/test1' + ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1', ext) + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_not_us_locale + def test_read_from_s3_url(self, ext, s3_resource): + # Bucket "pandas-test" created in tests/io/conftest.py + file_name = os.path.join(self.dirpath, 'test1' + ext) + + with open(file_name, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, + Body=f) + + url = ('s3://pandas-test/test1' + ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1', ext) + tm.assert_frame_equal(url_table, local_table) + + @pytest.mark.slow + # ignore warning from old xlrd + @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") + def test_read_from_file_url(self, ext): + + # FILE + localtable = os.path.join(self.dirpath, 'test1' + ext) + local_table = read_excel(localtable) + + try: + url_table = read_excel('file://localhost/' + localtable) + except URLError: + # fails on some systems + import platform + pytest.skip("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_no('pathlib') + def test_read_from_pathlib_path(self, ext): + + # GH12655 + from pathlib import Path + + str_path = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(str_path, 'Sheet1', index_col=0) + + path_obj = Path(self.dirpath, 'test1' + ext) + actual = read_excel(path_obj, 'Sheet1', index_col=0) + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no('py.path') + def test_read_from_py_localpath(self, ext): + + # GH12655 + from py.path import local as LocalPath + + str_path = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(str_path, 'Sheet1', index_col=0) + + abs_dir = os.path.abspath(self.dirpath) + path_obj = LocalPath(abs_dir).join('test1' + ext) + actual = read_excel(path_obj, 'Sheet1', index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_reader_closes_file(self, ext): + + pth = os.path.join(self.dirpath, 'test1' + ext) + f = open(pth, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + read_excel(xlsx, 'Sheet1', index_col=0) + + assert f.closed + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in iteritems(dfs): + df.to_excel(ew, sheetname) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + def test_reader_seconds(self, ext): + + # Test reading times with and without milliseconds. GH5945. + expected = DataFrame.from_dict({"Time": [time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54)]}) + + actual = self.get_exceldf('times_1900', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf('times_1904', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex(self, ext): + # see gh-4679 + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + + # "mi_column" sheet + expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True]], + columns=mi) + + actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # "mi_index" sheet + expected.index = mi + expected.columns = ["a", "b", "c", "d"] + + actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "both" sheet + expected.columns = mi + + actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "mi_column_name" sheet + expected.index = list(range(4)) + expected.columns = mi.set_names(["c1", "c2"]) + actual = read_excel(mi_file, "mi_column_name", + header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # see gh-11317 + # "name_with_int" sheet + expected.columns = mi.set_levels( + [1, 2], level=1).set_names(["c1", "c2"]) + + actual = read_excel(mi_file, "name_with_int", + index_col=0, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "both_name", + index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_skiprows" sheet + actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], + header=[0, 1], skiprows=2) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex_header_only(self, ext): + # see gh-11733. + # + # Don't try to parse a header name if there isn't one. + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + result = read_excel(mi_file, "index_col_none", header=[0, 1]) + + exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) + expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with ensure_clean(ext) as path: + df = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} + }) + + expected = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + expected = pd.DataFrame({ + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + def test_excel_old_index_format(self, ext): + # see gh-4679 + filename = "test_index_name_pre17" + ext + in_file = os.path.join(self.dirpath, filename) + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array([[None, None, None, None, None], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", + "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", + "R_l1_g2", "R_l1_g3", "R_l1_g4"]], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None]) + si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, "single_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. + data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", + "R_l1_g3", "R_l1_g4"]], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None]) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, "single_no_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + def test_read_excel_bool_header_arg(self, ext): + # GH 6114 + for arg in [True, False]: + with pytest.raises(TypeError): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + header=arg) + + def test_read_excel_chunksize(self, ext): + # GH 8011 + with pytest.raises(NotImplementedError): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + chunksize=100) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) + tm.assert_frame_equal(df, res) + + def test_read_excel_skiprows_list(self, ext): + # GH 4903 + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + ext), + 'skiprows_list', skiprows=[0, 2]) + expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], + [2, 3.5, pd.Timestamp('2015-01-02'), False], + [3, 4.5, pd.Timestamp('2015-01-03'), False], + [4, 5.5, pd.Timestamp('2015-01-04'), True]], + columns=['a', 'b', 'c', 'd']) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + ext), + 'skiprows_list', skiprows=np.array([0, 2])) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows(self, ext): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows=num_rows_to_pull) + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + ext)) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): + # GH 16645 + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + ext)) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self, ext): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with pytest.raises(ValueError, match=msg): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows='5') + + def test_read_excel_squeeze(self, ext): + # GH 12157 + f = os.path.join(self.dirpath, 'test_squeeze' + ext) + + actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) + expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') + expected.index.name = 'a' + tm.assert_series_equal(actual, expected) + + actual = pd.read_excel(f, 'two_columns', squeeze=True) + expected = pd.DataFrame({'a': [4, 5, 6], + 'b': [2, 3, 4]}) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(f, 'one_column', squeeze=True) + expected = pd.Series([1, 2, 3], name='a') + tm.assert_series_equal(actual, expected) + + +class _WriterBase(SharedItems): + + @pytest.fixture(autouse=True) + def set_engine_and_path(self, request, merge_cells, engine, ext): + """Fixture to set engine and open file for use in each test case + + Rather than requiring `engine=...` to be provided explicitly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. + + It also uses a context manager to open a temporary excel file for + the function to write to, accessible via `self.path` + + Notes + ----- + This fixture will run as part of each test method defined in the + class and any subclasses, on account of the `autouse=True` + argument + """ + option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) + prev_engine = get_option(option_name) + set_option(option_name, engine) + with ensure_clean(ext) as path: + self.path = path + yield + set_option(option_name, prev_engine) # Roll back option change + + +@pytest.mark.parametrize("merge_cells", [True, False]) +@pytest.mark.parametrize("engine,ext", [ + pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( + not td.safe_import('xlwt'), reason='No xlwt')), + pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) +]) + + +class TestExcelWriter(_WriterBase): + # Base class for test cases to run with different Excel writers. + + def test_excel_writer_context_manager(self, *_): + with ExcelWriter(self.path) as writer: + self.frame.to_excel(writer, "Data1") + self.frame2.to_excel(writer, "Data2") + + with ExcelFile(self.path) as reader: + found_df = read_excel(reader, "Data1", index_col=0) + found_df2 = read_excel(reader, "Data2", index_col=0) + + tm.assert_frame_equal(found_df, self.frame) + tm.assert_frame_equal(found_df2, self.frame2) + + def test_roundtrip(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # test roundtrip + self.frame.to_excel(self.path, 'test1') + recons = read_excel(self.path, 'test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', index=False) + recons = read_excel(self.path, 'test1', index_col=None) + recons.index = self.frame.index + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', na_rep='NA') + recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA']) + tm.assert_frame_equal(self.frame, recons) + + # GH 3611 + self.frame.to_excel(self.path, 'test1', na_rep='88') + recons = read_excel(self.path, 'test1', index_col=0, na_values=['88']) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, 'test1', na_rep='88') + recons = read_excel(self.path, 'test1', index_col=0, + na_values=[88, 88.0]) + tm.assert_frame_equal(self.frame, recons) + + # GH 6573 + self.frame.to_excel(self.path, 'Sheet1') + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + self.frame.to_excel(self.path, '0') + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(self.frame, recons) + + # GH 8825 Pandas Series should provide to_excel method + s = self.frame["A"] + s.to_excel(self.path) + recons = read_excel(self.path, index_col=0) + tm.assert_frame_equal(s.to_frame(), recons) + + def test_mixed(self, merge_cells, engine, ext): + self.mixed_frame.to_excel(self.path, 'test1') + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=0) + tm.assert_frame_equal(self.mixed_frame, recons) + + def test_ts_frame(self, *_): + df = tm.makeTimeDataFrame()[:5] + + df.to_excel(self.path, "test1") + reader = ExcelFile(self.path) + + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(df, recons) + + def test_basics_with_nan(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + @pytest.mark.parametrize("np_type", [ + np.int8, np.int16, np.int32, np.int64]) + def test_int_types(self, merge_cells, engine, ext, np_type): + # Test np.int values read come back as int + # (rather than float which is Excel's format). + frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), + dtype=np_type) + frame.to_excel(self.path, "test1") + + reader = ExcelFile(self.path) + recons = read_excel(reader, "test1", index_col=0) + + int_frame = frame.astype(np.int64) + tm.assert_frame_equal(int_frame, recons) + + recons2 = read_excel(self.path, "test1", index_col=0) + tm.assert_frame_equal(int_frame, recons2) + + # Test with convert_float=False comes back as float. + float_frame = frame.astype(float) + recons = read_excel(self.path, "test1", + convert_float=False, index_col=0) + tm.assert_frame_equal(recons, float_frame, + check_index_type=False, + check_column_type=False) + + @pytest.mark.parametrize("np_type", [ + np.float16, np.float32, np.float64]) + def test_float_types(self, merge_cells, engine, ext, np_type): + # Test np.float values read come back as float. + frame = DataFrame(np.random.random_sample(10), dtype=np_type) + frame.to_excel(self.path, "test1") + + reader = ExcelFile(self.path) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + + tm.assert_frame_equal(frame, recons, check_dtype=False) + + @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) + def test_bool_types(self, merge_cells, engine, ext, np_type): + # Test np.bool values read come back as float. + frame = (DataFrame([1, 0, True, False], dtype=np_type)) + frame.to_excel(self.path, "test1") + + reader = ExcelFile(self.path) + recons = read_excel(reader, "test1", index_col=0).astype(np_type) + + tm.assert_frame_equal(frame, recons) + + def test_inf_roundtrip(self, *_): + frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) + frame.to_excel(self.path, "test1") + + reader = ExcelFile(self.path) + recons = read_excel(reader, "test1", index_col=0) + + tm.assert_frame_equal(frame, recons) + + def test_sheets(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # Test writing to separate sheets + writer = ExcelWriter(self.path) + self.frame.to_excel(writer, 'test1') + self.tsframe.to_excel(writer, 'test2') + writer.save() + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=0) + tm.assert_frame_equal(self.frame, recons) + recons = read_excel(reader, 'test2', index_col=0) + tm.assert_frame_equal(self.tsframe, recons) + assert 2 == len(reader.sheet_names) + assert 'test1' == reader.sheet_names[0] + assert 'test2' == reader.sheet_names[1] + + def test_colaliases(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # column aliases + col_aliases = Index(['AA', 'X', 'Y', 'Z']) + self.frame2.to_excel(self.path, 'test1', header=col_aliases) + reader = ExcelFile(self.path) + rs = read_excel(reader, 'test1', index_col=0) + xp = self.frame2.copy() + xp.columns = col_aliases + tm.assert_frame_equal(xp, rs) + + def test_roundtrip_indexlabels(self, merge_cells, engine, ext): + self.frame['A'][:5] = nan + + self.frame.to_excel(self.path, 'test1') + self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) + self.frame.to_excel(self.path, 'test1', header=False) + self.frame.to_excel(self.path, 'test1', index=False) + + # test index_label + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, 'test1', + index_label=['test'], + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + assert frame.index.names == recons.index.names + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, + 'test1', + index_label=['test', 'dummy', 'dummy2'], + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + assert frame.index.names == recons.index.names + + frame = (DataFrame(np.random.randn(10, 2)) >= 0) + frame.to_excel(self.path, + 'test1', + index_label='test', + merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=0, + ).astype(np.int64) + frame.index.names = ['test'] + tm.assert_frame_equal(frame, recons.astype(bool)) + + self.frame.to_excel(self.path, + 'test1', + columns=['A', 'B', 'C', 'D'], + index=False, merge_cells=merge_cells) + # take 'A' and 'B' as indexes (same row as cols 'C', 'D') + df = self.frame.copy() + df = df.set_index(['A', 'B']) + + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', index_col=[0, 1]) + tm.assert_frame_equal(df, recons, check_less_precise=True) + + def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): + df = DataFrame(np.random.randn(10, 4)) + df.index.name = 'foo' + + df.to_excel(self.path, merge_cells=merge_cells) + + xf = ExcelFile(self.path) + result = read_excel(xf, xf.sheet_names[0], + index_col=0) + + tm.assert_frame_equal(result, df) + assert result.index.name == 'foo' + + def test_excel_roundtrip_datetime(self, merge_cells, *_): + # datetime.date, not sure what to test here exactly + tsf = self.tsframe.copy() + + tsf.index = [x.date() for x in self.tsframe.index] + tsf.to_excel(self.path, "test1", merge_cells=merge_cells) + + reader = ExcelFile(self.path) + recons = read_excel(reader, "test1", index_col=0) + + tm.assert_frame_equal(self.tsframe, recons) + + def test_excel_date_datetime_format(self, merge_cells, engine, ext): + # see gh-4133 + # + # Excel output format strings + df = DataFrame([[date(2014, 1, 31), + date(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), + datetime(2014, 2, 28, 13, 5, 13)]], + index=["DATE", "DATETIME"], columns=["X", "Y"]) + df_expected = DataFrame([[datetime(2014, 1, 31), + datetime(1999, 9, 24)], + [datetime(1998, 5, 26, 23, 33, 4), + datetime(2014, 2, 28, 13, 5, 13)]], + index=["DATE", "DATETIME"], columns=["X", "Y"]) + + with ensure_clean(ext) as filename2: + writer1 = ExcelWriter(self.path) + writer2 = ExcelWriter(filename2, + date_format="DD.MM.YYYY", + datetime_format="DD.MM.YYYY HH-MM-SS") + + df.to_excel(writer1, "test1") + df.to_excel(writer2, "test1") + + writer1.close() + writer2.close() + + reader1 = ExcelFile(self.path) + reader2 = ExcelFile(filename2) + + rs1 = read_excel(reader1, "test1", index_col=0) + rs2 = read_excel(reader2, "test1", index_col=0) + + tm.assert_frame_equal(rs1, rs2) + + # Since the reader returns a datetime object for dates, + # we need to use df_expected to check the result. + tm.assert_frame_equal(rs2, df_expected) + + def test_to_excel_interval_no_labels(self, *_): + # see gh-19242 + # + # Test writing Interval without labels. + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = frame.copy() + + frame["new"] = pd.cut(frame[0], 10) + expected["new"] = pd.cut(expected[0], 10).astype(str) + + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) + + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_interval_labels(self, *_): + # see gh-19242 + # + # Test writing Interval with labels. + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + dtype=np.int64) + expected = frame.copy() + intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", + "F", "G", "H", "I", "J"]) + frame["new"] = intervals + expected["new"] = pd.Series(list(intervals)) + + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) + + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_timedelta(self, *_): + # see gh-19242, gh-9155 + # + # Test writing timedelta to xls. + frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), + columns=["A"], dtype=np.int64) + expected = frame.copy() + + frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) + expected["new"] = expected["A"].apply( + lambda x: timedelta(seconds=x).total_seconds() / float(86400)) + + frame.to_excel(self.path, "test1") + reader = ExcelFile(self.path) + + recons = read_excel(reader, "test1", index_col=0) + tm.assert_frame_equal(expected, recons) + + def test_to_excel_periodindex(self, merge_cells, engine, ext): + frame = self.tsframe + xp = frame.resample('M', kind='period').mean() + + xp.to_excel(self.path, 'sht1') + + reader = ExcelFile(self.path) + rs = read_excel(reader, 'sht1', index_col=0) + tm.assert_frame_equal(xp, rs.to_period('M')) + + def test_to_excel_multiindex(self, merge_cells, engine, ext): + frame = self.frame + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, + names=['first', 'second']) + frame.index = new_index + + frame.to_excel(self.path, 'test1', header=False) + frame.to_excel(self.path, 'test1', columns=['A', 'B']) + + # round trip + frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + df = read_excel(reader, 'test1', index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + + # GH13511 + def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): + frame = pd.DataFrame({'A': [None, 2, 3], + 'B': [10, 20, 30], + 'C': np.random.sample(3)}) + frame = frame.set_index(['A', 'B']) + + frame.to_excel(self.path, merge_cells=merge_cells) + df = read_excel(self.path, index_col=[0, 1]) + tm.assert_frame_equal(frame, df) + + # Test for Issue 11328. If column indices are integers, make + # sure they are handled correctly for either setting of + # merge_cells + def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): + frame = self.frame + arrays = np.arange(len(frame.index) * 2).reshape(2, -1) + new_index = MultiIndex.from_arrays(arrays, + names=['first', 'second']) + frame.index = new_index + + new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), + (50, 1), (50, 2)]) + frame.columns = new_cols_index + header = [0, 1] + if not merge_cells: + header = 0 + + # round trip + frame.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + df = read_excel(reader, 'test1', header=header, + index_col=[0, 1]) + if not merge_cells: + fm = frame.columns.format(sparsify=False, + adjoin=False, names=False) + frame.columns = [".".join(map(str, q)) for q in zip(*fm)] + tm.assert_frame_equal(frame, df) + + def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): + # try multiindex with dates + tsframe = self.tsframe.copy() + new_index = [tsframe.index, np.arange(len(tsframe.index))] + tsframe.index = MultiIndex.from_arrays(new_index) + + tsframe.index.names = ['time', 'foo'] + tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) + reader = ExcelFile(self.path) + recons = read_excel(reader, 'test1', + index_col=[0, 1]) + + tm.assert_frame_equal(tsframe, recons) + assert recons.index.names == ('time', 'foo') + + def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, + ext): + # Test writing and re-reading a MI witout the index. GH 5616. + + # Initial non-MI frame. + frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) + + # Add a MI. + frame2 = frame1.copy() + multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) + frame2.index = multi_index + + # Write out to Excel without the index. + frame2.to_excel(self.path, 'test1', index=False) + + # Read it back in. + reader = ExcelFile(self.path) + frame3 = read_excel(reader, 'test1') + + # Test that it is the same as the initial frame. + tm.assert_frame_equal(frame1, frame3) + + def test_to_excel_float_format(self, *_): + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(self.path, "test1", float_format="%.2f") + + reader = ExcelFile(self.path) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) + + def test_to_excel_output_encoding(self, merge_cells, engine, ext): + # Avoid mixed inferred_type. + df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"], + [u"\u0195", u"\u0196", u"\u0197"]], + index=[u"A\u0192", u"B"], + columns=[u"X\u0193", u"Y", u"Z"]) + + with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: + df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") + result = read_excel(filename, "TestSheet", + encoding="utf8", index_col=0) + tm.assert_frame_equal(result, df) + + def test_to_excel_unicode_filename(self, merge_cells, engine, ext): + with ensure_clean(u("\u0192u.") + ext) as filename: + try: + f = open(filename, "wb") + except UnicodeEncodeError: + pytest.skip("No unicode file names on this system") + else: + f.close() + + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=["A", "B"], columns=["X", "Y", "Z"]) + df.to_excel(filename, "test1", float_format="%.2f") + + reader = ExcelFile(filename) + result = read_excel(reader, "test1", index_col=0) + + expected = DataFrame([[0.12, 0.23, 0.57], + [12.32, 123123.20, 321321.20]], + index=["A", "B"], columns=["X", "Y", "Z"]) + tm.assert_frame_equal(result, expected) + + # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext): + + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 + # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + + # try: + # import xlwt + # import xlrd + # except ImportError: + # pytest.skip + + # filename = '__tmp_to_excel_header_styling_xls__.xls' + # pdf.to_excel(filename, 'test1') + + # wbk = xlrd.open_workbook(filename, + # formatting_info=True) + # assert ["test1"] == wbk.sheet_names() + # ws = wbk.sheet_by_name('test1') + # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells + # for i in range(0, 2): + # for j in range(0, 7): + # xfx = ws.cell_xf_index(0, 0) + # cell_xf = wbk.xf_list[xfx] + # font = wbk.font_list + # assert 1 == font[cell_xf.font_index].bold + # assert 1 == cell_xf.border.top_line_style + # assert 1 == cell_xf.border.right_line_style + # assert 1 == cell_xf.border.bottom_line_style + # assert 1 == cell_xf.border.left_line_style + # assert 2 == cell_xf.alignment.hor_align + # os.remove(filename) + # def test_to_excel_header_styling_xlsx(self, merge_cells, engine, ext): + # import StringIO + # s = StringIO( + # """Date,ticker,type,value + # 2001-01-01,x,close,12.2 + # 2001-01-01,x,open ,12.1 + # 2001-01-01,y,close,12.2 + # 2001-01-01,y,open ,12.1 + # 2001-02-01,x,close,12.2 + # 2001-02-01,x,open ,12.1 + # 2001-02-01,y,close,12.2 + # 2001-02-01,y,open ,12.1 + # 2001-03-01,x,close,12.2 + # 2001-03-01,x,open ,12.1 + # 2001-03-01,y,close,12.2 + # 2001-03-01,y,open ,12.1""") + # df = read_csv(s, parse_dates=["Date"]) + # pdf = df.pivot_table(values="value", rows=["ticker"], + # cols=["Date", "type"]) + # try: + # import openpyxl + # from openpyxl.cell import get_column_letter + # except ImportError: + # pytest.skip + # if openpyxl.__version__ < '1.6.1': + # pytest.skip + # # test xlsx_styling + # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx' + # pdf.to_excel(filename, 'test1') + # wbk = openpyxl.load_workbook(filename) + # assert ["test1"] == wbk.get_sheet_names() + # ws = wbk.get_sheet_by_name('test1') + # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))] + # xlsaddrs += ["A%s" % i for i in range(1, 6)] + # xlsaddrs += ["B1", "D1", "F1"] + # for xlsaddr in xlsaddrs: + # cell = ws.cell(xlsaddr) + # assert cell.style.font.bold + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.top.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.right.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.bottom.border_style) + # assert (openpyxl.style.Border.BORDER_THIN == + # cell.style.borders.left.border_style) + # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER == + # cell.style.alignment.horizontal) + # mergedcells_addrs = ["C1", "E1", "G1"] + # for maddr in mergedcells_addrs: + # assert ws.cell(maddr).merged + # os.remove(filename) + + @pytest.mark.parametrize("use_headers", [True, False]) + @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) + @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) + def test_excel_010_hemstring(self, merge_cells, engine, ext, + c_idx_nlevels, r_idx_nlevels, use_headers): + + def roundtrip(data, header=True, parser_hdr=0, index=True): + data.to_excel(self.path, header=header, + merge_cells=merge_cells, index=index) + + xf = ExcelFile(self.path) + return read_excel(xf, xf.sheet_names[0], header=parser_hdr) + + # Basic test. + parser_header = 0 if use_headers else None + res = roundtrip(DataFrame([0]), use_headers, parser_header) + + assert res.shape == (1, 2) + assert res.iloc[0, 0] is not np.nan + + # More complex tests with multi-index. + nrows = 5 + ncols = 3 + + from pandas.util.testing import makeCustomDataframe as mkdf + # ensure limited functionality in 0.10 + # override of gh-2370 until sorted out in 0.11 + + df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, + c_idx_nlevels=c_idx_nlevels) + + # This if will be removed once multi-column Excel writing + # is implemented. For now fixing gh-9794. + if c_idx_nlevels > 1: + with pytest.raises(NotImplementedError): + roundtrip(df, use_headers, index=False) + else: + res = roundtrip(df, use_headers) + + if use_headers: + assert res.shape == (nrows, ncols + r_idx_nlevels) + else: + # First row taken as columns. + assert res.shape == (nrows - 1, ncols + r_idx_nlevels) + + # No NaNs. + for r in range(len(res.index)): + for c in range(len(res.columns)): + assert res.iloc[r, c] is not np.nan + + def test_duplicated_columns(self, *_): + # see gh-5235 + df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], + columns=["A", "B", "B"]) + df.to_excel(self.path, "test1") + expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], + columns=["A", "B", "B.1"]) + + # By default, we mangle. + result = read_excel(self.path, "test1", index_col=0) + tm.assert_frame_equal(result, expected) + + # Explicitly, we pass in the parameter. + result = read_excel(self.path, "test1", index_col=0, + mangle_dupe_cols=True) + tm.assert_frame_equal(result, expected) + + # see gh-11007, gh-10970 + df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "A", "B"]) + df.to_excel(self.path, "test1") + + result = read_excel(self.path, "test1", index_col=0) + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=["A", "B", "A.1", "B.1"]) + tm.assert_frame_equal(result, expected) + + # see gh-10982 + df.to_excel(self.path, "test1", index=False, header=False) + result = read_excel(self.path, "test1", header=None) + + expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) + tm.assert_frame_equal(result, expected) + + msg = "Setting mangle_dupe_cols=False is not supported yet" + with pytest.raises(ValueError, match=msg): + read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) + + def test_swapped_columns(self, merge_cells, engine, ext): + # Test for issue #5427. + write_frame = DataFrame({'A': [1, 1, 1], + 'B': [2, 2, 2]}) + write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) + + read_frame = read_excel(self.path, 'test1', header=0) + + tm.assert_series_equal(write_frame['A'], read_frame['A']) + tm.assert_series_equal(write_frame['B'], read_frame['B']) + + def test_invalid_columns(self, *_): + # see gh-10982 + write_frame = DataFrame({"A": [1, 1, 1], + "B": [2, 2, 2]}) + + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + write_frame.to_excel(self.path, "test1", columns=["B", "C"]) + + expected = write_frame.reindex(columns=["B", "C"]) + read_frame = read_excel(self.path, "test1", index_col=0) + tm.assert_frame_equal(expected, read_frame) + + with pytest.raises(KeyError): + write_frame.to_excel(self.path, "test1", columns=["C", "D"]) + + def test_comment_arg(self, *_): + # see gh-18735 + # + # Test the comment argument functionality to read_excel. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") + + # Read file without comment arg. + result1 = read_excel(self.path, "test_c", index_col=0) + + result1.iloc[1, 0] = None + result1.iloc[1, 1] = None + result1.iloc[2, 1] = None + + result2 = read_excel(self.path, "test_c", comment="#", index_col=0) + tm.assert_frame_equal(result1, result2) + + def test_comment_default(self, merge_cells, engine, ext): + # Re issue #18735 + # Test the comment argument default to read_excel + + # Create file to read in + df = DataFrame({'A': ['one', '#one', 'one'], + 'B': ['two', 'two', '#two']}) + df.to_excel(self.path, 'test_c') + + # Read file with default and explicit comment=None + result1 = read_excel(self.path, 'test_c') + result2 = read_excel(self.path, 'test_c', comment=None) + tm.assert_frame_equal(result1, result2) + + def test_comment_used(self, *_): + # see gh-18735 + # + # Test the comment argument is working as expected when used. + + # Create file to read in. + df = DataFrame({"A": ["one", "#one", "one"], + "B": ["two", "two", "#two"]}) + df.to_excel(self.path, "test_c") + + # Test read_frame_comment against manually produced expected output. + expected = DataFrame({"A": ["one", None, "one"], + "B": ["two", None, None]}) + result = read_excel(self.path, "test_c", comment="#", index_col=0) + tm.assert_frame_equal(result, expected) + + def test_comment_empty_line(self, merge_cells, engine, ext): + # Re issue #18735 + # Test that read_excel ignores commented lines at the end of file + + df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) + df.to_excel(self.path, index=False) + + # Test that all-comment lines at EoF are ignored + expected = DataFrame({'a': [1], 'b': [2]}) + result = read_excel(self.path, comment='#') + tm.assert_frame_equal(result, expected) + + def test_datetimes(self, merge_cells, engine, ext): + + # Test writing and reading datetimes. For issue #9139. (xref #9185) + datetimes = [datetime(2013, 1, 13, 1, 2, 3), + datetime(2013, 1, 13, 2, 45, 56), + datetime(2013, 1, 13, 4, 29, 49), + datetime(2013, 1, 13, 6, 13, 42), + datetime(2013, 1, 13, 7, 57, 35), + datetime(2013, 1, 13, 9, 41, 28), + datetime(2013, 1, 13, 11, 25, 21), + datetime(2013, 1, 13, 13, 9, 14), + datetime(2013, 1, 13, 14, 53, 7), + datetime(2013, 1, 13, 16, 37, 0), + datetime(2013, 1, 13, 18, 20, 52)] + + write_frame = DataFrame({'A': datetimes}) + write_frame.to_excel(self.path, 'Sheet1') + read_frame = read_excel(self.path, 'Sheet1', header=0) + + tm.assert_series_equal(write_frame['A'], read_frame['A']) + + def test_bytes_io(self, merge_cells, engine, ext): + # see gh-7074 + bio = BytesIO() + df = DataFrame(np.random.randn(10, 2)) + + # Pass engine explicitly, as there is no file path to infer from. + writer = ExcelWriter(bio, engine=engine) + df.to_excel(writer) + writer.save() + + bio.seek(0) + reread_df = read_excel(bio, index_col=0) + tm.assert_frame_equal(df, reread_df) + + def test_write_lists_dict(self, *_): + # see gh-8188. + df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], + "numeric": [1, 2, 3.0], + "str": ["apple", "banana", "cherry"]}) + df.to_excel(self.path, "Sheet1") + read = read_excel(self.path, "Sheet1", header=0, index_col=0) + + expected = df.copy() + expected.mixed = expected.mixed.apply(str) + expected.numeric = expected.numeric.astype("int64") + + tm.assert_frame_equal(read, expected) + + def test_true_and_false_value_options(self, *_): + # see gh-13347 + df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) + expected = df.replace({"foo": True, "bar": False}) + + df.to_excel(self.path) + read_frame = read_excel(self.path, true_values=["foo"], + false_values=["bar"], index_col=0) + tm.assert_frame_equal(read_frame, expected) + + def test_freeze_panes(self, *_): + # see gh-15160 + expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) + expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) + + result = read_excel(self.path, index_col=0) + tm.assert_frame_equal(result, expected) + + def test_path_path_lib(self, merge_cells, engine, ext): + df = tm.makeDataFrame() + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_pathlib(writer, reader, + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) + + def test_path_local_path(self, merge_cells, engine, ext): + df = tm.makeDataFrame() + writer = partial(df.to_excel, engine=engine) + + reader = partial(pd.read_excel, index_col=0) + result = tm.round_trip_pathlib(writer, reader, + path="foo.{ext}".format(ext=ext)) + tm.assert_frame_equal(result, df) + + +@td.skip_if_no('openpyxl') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xlsx', 'openpyxl')]) +class TestOpenpyxlTests(_WriterBase): + + def test_to_excel_styleconverter(self, merge_cells, ext, engine): + from openpyxl import styles + + hstyle = { + "font": { + "color": '00FF0000', + "bold": True, + }, + "borders": { + "top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin", + }, + "alignment": { + "horizontal": "center", + "vertical": "top", + }, + "fill": { + "patternType": 'solid', + 'fgColor': { + 'rgb': '006666FF', + 'tint': 0.3, + }, + }, + "number_format": { + "format_code": "0.00" + }, + "protection": { + "locked": True, + "hidden": False, + }, + } + + font_color = styles.Color('00FF0000') + font = styles.Font(bold=True, color=font_color) + side = styles.Side(style=styles.borders.BORDER_THIN) + border = styles.Border(top=side, right=side, bottom=side, left=side) + alignment = styles.Alignment(horizontal='center', vertical='top') + fill_color = styles.Color(rgb='006666FF', tint=0.3) + fill = styles.PatternFill(patternType='solid', fgColor=fill_color) + + number_format = '0.00' + + protection = styles.Protection(locked=True, hidden=False) + + kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) + assert kw['font'] == font + assert kw['border'] == border + assert kw['alignment'] == alignment + assert kw['fill'] == fill + assert kw['number_format'] == number_format + assert kw['protection'] == protection + + def test_write_cells_merge_styled(self, merge_cells, ext, engine): + from pandas.io.formats.excel import ExcelCell + + sheet_name = 'merge_styled' + + sty_b1 = {'font': {'color': '00FF0000'}} + sty_a2 = {'font': {'color': '0000FF00'}} + + initial_cells = [ + ExcelCell(col=1, row=0, val=42, style=sty_b1), + ExcelCell(col=0, row=1, val=99, style=sty_a2), + ] + + sty_merged = {'font': {'color': '000000FF', 'bold': True}} + sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) + openpyxl_sty_merged = sty_kwargs['font'] + merge_cells = [ + ExcelCell(col=0, row=0, val='pandas', + mergestart=1, mergeend=1, style=sty_merged), + ] + + with ensure_clean(ext) as path: + writer = _OpenpyxlWriter(path) + writer.write_cells(initial_cells, sheet_name=sheet_name) + writer.write_cells(merge_cells, sheet_name=sheet_name) + + wks = writer.sheets[sheet_name] + xcell_b1 = wks['B1'] + xcell_a2 = wks['A2'] + assert xcell_b1.font == openpyxl_sty_merged + assert xcell_a2.font == openpyxl_sty_merged + + @pytest.mark.parametrize("mode,expected", [ + ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) + def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): + import openpyxl + df = DataFrame([1], columns=['baz']) + + with ensure_clean(ext) as f: + wb = openpyxl.Workbook() + wb.worksheets[0].title = 'foo' + wb.worksheets[0]['A1'].value = 'foo' + wb.create_sheet('bar') + wb.worksheets[1]['A1'].value = 'bar' + wb.save(f) + + writer = ExcelWriter(f, engine=engine, mode=mode) + df.to_excel(writer, sheet_name='baz', index=False) + writer.save() + + wb2 = openpyxl.load_workbook(f) + result = [sheet.title for sheet in wb2.worksheets] + assert result == expected + + for index, cell_value in enumerate(expected): + assert wb2.worksheets[index]['A1'].value == cell_value + + +@td.skip_if_no('xlwt') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xls', 'xlwt')]) +class TestXlwtTests(_WriterBase): + + def test_excel_raise_error_on_multiindex_columns_and_no_index( + self, merge_cells, ext, engine): + # MultiIndex as columns is not yet implemented 9794 + cols = MultiIndex.from_tuples([('site', ''), + ('2014', 'height'), + ('2014', 'weight')]) + df = DataFrame(np.random.randn(10, 3), columns=cols) + with pytest.raises(NotImplementedError): + with ensure_clean(ext) as path: + df.to_excel(path, index=False) + + def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext, + engine): + cols = MultiIndex.from_tuples([('site', ''), + ('2014', 'height'), + ('2014', 'weight')]) + df = pd.DataFrame(np.random.randn(10, 3), columns=cols) + with ensure_clean(ext) as path: + df.to_excel(path, index=True) + + def test_excel_multiindex_index(self, merge_cells, ext, engine): + # MultiIndex as index works so assert no error #9794 + cols = MultiIndex.from_tuples([('site', ''), + ('2014', 'height'), + ('2014', 'weight')]) + df = DataFrame(np.random.randn(3, 10), index=cols) + with ensure_clean(ext) as path: + df.to_excel(path, index=False) + + def test_to_excel_styleconverter(self, merge_cells, ext, engine): + import xlwt + + hstyle = {"font": {"bold": True}, + "borders": {"top": "thin", + "right": "thin", + "bottom": "thin", + "left": "thin"}, + "alignment": {"horizontal": "center", "vertical": "top"}} + + xls_style = _XlwtWriter._convert_to_style(hstyle) + assert xls_style.font.bold + assert xlwt.Borders.THIN == xls_style.borders.top + assert xlwt.Borders.THIN == xls_style.borders.right + assert xlwt.Borders.THIN == xls_style.borders.bottom + assert xlwt.Borders.THIN == xls_style.borders.left + assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz + assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert + + def test_write_append_mode_raises(self, merge_cells, ext, engine): + msg = "Append mode is not supported with xlwt!" + + with ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine=engine, mode='a') + + +@td.skip_if_no('xlsxwriter') +@pytest.mark.parametrize("merge_cells,ext,engine", [ + (None, '.xlsx', 'xlsxwriter')]) +class TestXlsxWriterTests(_WriterBase): + + @td.skip_if_no('openpyxl') + def test_column_format(self, merge_cells, ext, engine): + # Test that column formats are applied to cells. Test for issue #9167. + # Applicable to xlsxwriter only. + with warnings.catch_warnings(): + # Ignore the openpyxl lxml warning. + warnings.simplefilter("ignore") + import openpyxl + + with ensure_clean(ext) as path: + frame = DataFrame({'A': [123456, 123456], + 'B': [123456, 123456]}) + + writer = ExcelWriter(path) + frame.to_excel(writer) + + # Add a number format to col B and ensure it is applied to cells. + num_format = '#,##0' + write_workbook = writer.book + write_worksheet = write_workbook.worksheets()[0] + col_format = write_workbook.add_format({'num_format': num_format}) + write_worksheet.set_column('B:B', None, col_format) + writer.save() + + read_workbook = openpyxl.load_workbook(path) + try: + read_worksheet = read_workbook['Sheet1'] + except TypeError: + # compat + read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') + + # Get the number format from the cell. + try: + cell = read_worksheet['B2'] + except TypeError: + # compat + cell = read_worksheet.cell('B2') + + try: + read_num_format = cell.number_format + except Exception: + read_num_format = cell.style.number_format._format_code + + assert read_num_format == num_format + + def test_write_append_mode_raises(self, merge_cells, ext, engine): + msg = "Append mode is not supported with xlsxwriter!" + + with ensure_clean(ext) as f: + with pytest.raises(ValueError, match=msg): + ExcelWriter(f, engine=engine, mode='a') + + +class TestExcelWriterEngineTests(object): + + @pytest.mark.parametrize('klass,ext', [ + pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('xlsxwriter'), reason='No xlsxwriter')), + pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif( + not td.safe_import('xlwt'), reason='No xlwt')) + ]) + def test_ExcelWriter_dispatch(self, klass, ext): + with ensure_clean(ext) as path: + writer = ExcelWriter(path) + if ext == '.xlsx' and td.safe_import('xlsxwriter'): + # xlsxwriter has preference over openpyxl if both installed + assert isinstance(writer, _XlsxWriter) + else: + assert isinstance(writer, klass) + + def test_ExcelWriter_dispatch_raises(self): + with pytest.raises(ValueError, match='No engine'): + ExcelWriter('nothing') + + @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") + def test_register_writer(self): + # some awkward mocking to test out dispatch and such actually works + called_save = [] + called_write_cells = [] + + class DummyClass(ExcelWriter): + called_save = False + called_write_cells = False + supported_extensions = ['test', 'xlsx', 'xls'] + engine = 'dummy' + + def save(self): + called_save.append(True) + + def write_cells(self, *args, **kwargs): + called_write_cells.append(True) + + def check_called(func): + func() + assert len(called_save) >= 1 + assert len(called_write_cells) >= 1 + del called_save[:] + del called_write_cells[:] + + with pd.option_context('io.excel.xlsx.writer', 'dummy'): + register_writer(DummyClass) + writer = ExcelWriter('something.test') + assert isinstance(writer, DummyClass) + df = tm.makeCustomDataframe(1, 1) + + with catch_warnings(record=True): + panel = tm.makePanel() + func = lambda: df.to_excel('something.test') + check_called(func) + check_called(lambda: panel.to_excel('something.test')) + check_called(lambda: df.to_excel('something.xlsx')) + check_called( + lambda: df.to_excel( + 'something.xls', engine='dummy')) + + +@pytest.mark.parametrize('engine', [ + pytest.param('xlwt', + marks=pytest.mark.xfail(reason='xlwt does not support ' + 'openpyxl-compatible ' + 'style dicts')), + 'xlsxwriter', + 'openpyxl', +]) +def test_styler_to_excel(engine): + def style(df): + # XXX: RGB colors not supported in xlwt + return DataFrame([['font-weight: bold', '', ''], + ['', 'color: blue', ''], + ['', '', 'text-decoration: underline'], + ['border-style: solid', '', ''], + ['', 'font-style: italic', ''], + ['', '', 'text-align: right'], + ['background-color: red', '', ''], + ['number-format: 0%', '', ''], + ['', '', ''], + ['', '', ''], + ['', '', '']], + index=df.index, columns=df.columns) + + def assert_equal_style(cell1, cell2): + # XXX: should find a better way to check equality + assert cell1.alignment.__dict__ == cell2.alignment.__dict__ + assert cell1.border.__dict__ == cell2.border.__dict__ + assert cell1.fill.__dict__ == cell2.fill.__dict__ + assert cell1.font.__dict__ == cell2.font.__dict__ + assert cell1.number_format == cell2.number_format + assert cell1.protection.__dict__ == cell2.protection.__dict__ + + def custom_converter(css): + # use bold iff there is custom style attached to the cell + if css.strip(' \n;'): + return {'font': {'bold': True}} + return {} + + pytest.importorskip('jinja2') + pytest.importorskip(engine) + + # Prepare spreadsheets + + df = DataFrame(np.random.randn(11, 3)) + with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path: + writer = ExcelWriter(path, engine=engine) + df.to_excel(writer, sheet_name='frame') + df.style.to_excel(writer, sheet_name='unstyled') + styled = df.style.apply(style, axis=None) + styled.to_excel(writer, sheet_name='styled') + ExcelFormatter(styled, style_converter=custom_converter).write( + writer, sheet_name='custom') + writer.save() + + if engine not in ('openpyxl', 'xlsxwriter'): + # For other engines, we only smoke test + return + openpyxl = pytest.importorskip('openpyxl') + wb = openpyxl.load_workbook(path) + + # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled + n_cells = 0 + for col1, col2 in zip(wb['frame'].columns, + wb['unstyled'].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + assert cell1.value == cell2.value + assert_equal_style(cell1, cell2) + n_cells += 1 + + # ensure iteration actually happened: + assert n_cells == (11 + 1) * (3 + 1) + + # (2) check styling with default converter + + # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF + alpha = '00' if engine == 'openpyxl' else 'FF' + + n_cells = 0 + for col1, col2 in zip(wb['frame'].columns, + wb['styled'].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + ref = '%s%d' % (cell2.column, cell2.row) + # XXX: this isn't as strong a test as ideal; we should + # confirm that differences are exclusive + if ref == 'B2': + assert not cell1.font.bold + assert cell2.font.bold + elif ref == 'C3': + assert cell1.font.color.rgb != cell2.font.color.rgb + assert cell2.font.color.rgb == alpha + '0000FF' + elif ref == 'D4': + # This fails with engine=xlsxwriter due to + # https://bitbucket.org/openpyxl/openpyxl/issues/800 + if engine == 'xlsxwriter' \ + and (LooseVersion(openpyxl.__version__) < + LooseVersion('2.4.6')): + pass + else: + assert cell1.font.underline != cell2.font.underline + assert cell2.font.underline == 'single' + elif ref == 'B5': + assert not cell1.border.left.style + assert (cell2.border.top.style == + cell2.border.right.style == + cell2.border.bottom.style == + cell2.border.left.style == + 'medium') + elif ref == 'C6': + assert not cell1.font.italic + assert cell2.font.italic + elif ref == 'D7': + assert (cell1.alignment.horizontal != + cell2.alignment.horizontal) + assert cell2.alignment.horizontal == 'right' + elif ref == 'B8': + assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb + assert cell1.fill.patternType != cell2.fill.patternType + assert cell2.fill.fgColor.rgb == alpha + 'FF0000' + assert cell2.fill.patternType == 'solid' + elif ref == 'B9': + assert cell1.number_format == 'General' + assert cell2.number_format == '0%' + else: + assert_equal_style(cell1, cell2) + + assert cell1.value == cell2.value + n_cells += 1 + + assert n_cells == (11 + 1) * (3 + 1) + + # (3) check styling with custom converter + n_cells = 0 + for col1, col2 in zip(wb['frame'].columns, + wb['custom'].columns): + assert len(col1) == len(col2) + for cell1, cell2 in zip(col1, col2): + ref = '%s%d' % (cell2.column, cell2.row) + if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'): + assert not cell1.font.bold + assert cell2.font.bold + else: + assert_equal_style(cell1, cell2) + + assert cell1.value == cell2.value + n_cells += 1 + + assert n_cells == (11 + 1) * (3 + 1) + + +@td.skip_if_no('openpyxl') +@pytest.mark.skipif(not PY36, reason='requires fspath') +class TestFSPath(object): + + def test_excelfile_fspath(self): + with tm.ensure_clean('foo.xlsx') as path: + df = DataFrame({"A": [1, 2]}) + df.to_excel(path) + xl = ExcelFile(path) + result = os.fspath(xl) + assert result == path + + def test_excelwriter_fspath(self): + with tm.ensure_clean('foo.xlsx') as path: + writer = ExcelWriter(path) + assert os.fspath(writer) == str(path) diff --git a/pandas/tests/io/excel/xlrd.py b/pandas/tests/io/excel/xlrd.py new file mode 100644 index 0000000000000..b4b09cbe5e716 --- /dev/null +++ b/pandas/tests/io/excel/xlrd.py @@ -0,0 +1,352 @@ +import contextlib +from functools import partial +import os +import warnings + +import numpy as np +import pytest + +import pandas.util._test_decorators as td + +from pandas import DataFrame +from pandas.core.config import get_option, set_option +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean + +from pandas.io.excel import ExcelFile, read_excel + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() +_frame = DataFrame(_seriesd)[:10] +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] +_tsframe = tm.makeTimeDataFrame()[:5] +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + + +@contextlib.contextmanager +def ignore_xlrd_time_clock_warning(): + """ + Context manager to ignore warnings raised by the xlrd library, + regarding the deprecation of `time.clock` in Python 3.7. + """ + with warnings.catch_warnings(): + warnings.filterwarnings( + action='ignore', + message='time.clock has been deprecated', + category=DeprecationWarning) + yield + + +@td.skip_if_no('xlrd', '1.0.0') +class XlrdSharedItems(object): + + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def get_excelfile(self, basename, ext): + """ + Return test data ExcelFile instance. + + Parameters + ---------- + + basename : str + File base name, excluding file extension. + + Returns + ------- + + excel : io.excel.ExcelFile + """ + return ExcelFile(os.path.join(self.dirpath, basename + ext)) + + def get_exceldf(self, basename, ext, *args, **kwds): + """ + Return test data DataFrame. + + Parameters + ---------- + + basename : str + File base name, excluding file extension. + + Returns + ------- + + df : DataFrame + """ + pth = os.path.join(self.dirpath, basename + ext) + return read_excel(pth, *args, **kwds) + + +class XlrdReadingTestsBase(XlrdSharedItems): + # This is based on ExcelWriterBase + + @pytest.fixture(autouse=True, params=['xlrd']) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + + @pytest.mark.parametrize("usecols", [ + [0, 1, 3], [0, 3, 1], + [1, 0, 3], [1, 3, 0], + [3, 0, 1], [3, 1, 0], + ]) + + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 + def test_usecols_int(self, ext): + + df_ref = self.get_csv_refdf("test1") + df_ref = df_ref.reindex(columns=["A", "B", "C"]) + + # usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df1 = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=3) + + # usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], + index_col=0, usecols=3) + + # parse_cols instead of usecols, usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], + index_col=0, parse_cols=3) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df3, df_ref, check_names=False) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_usecols_list(self, ext): + + dfref = self.get_csv_refdf('test1') + dfref = dfref.reindex(columns=['B', 'C']) + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=[0, 2, 3]) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols=[0, 2, 3]) + + with tm.assert_produces_warning(FutureWarning): + with ignore_xlrd_time_clock_warning(): + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, parse_cols=[0, 2, 3]) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df3, dfref, check_names=False) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_usecols_str(self, ext): + + dfref = self.get_csv_refdf('test1') + + df1 = dfref.reindex(columns=['A', 'B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A:D') + + with tm.assert_produces_warning(FutureWarning): + with ignore_xlrd_time_clock_warning(): + df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, parse_cols='A:D') + + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df4, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A,C,D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C,D') + # TODO add index to xls file + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A,C:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C:D') + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_deprecated_sheetname(self, ext): + # gh-17964 + excel = self.get_excelfile('test1', ext) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + read_excel(excel, sheetname='Sheet1') + + with pytest.raises(TypeError): + read_excel(excel, sheet='Sheet1') + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_excel_table_sheet_by_index(self, ext): + + excel = self.get_excelfile('test1', ext) + dfref = self.get_csv_refdf('test1') + + df1 = read_excel(excel, 0, index_col=0) + df2 = read_excel(excel, 1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df3 = read_excel(excel, 0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df4 = read_excel(excel, 0, index_col=0, skip_footer=1) + tm.assert_frame_equal(df3, df4) + + df3 = excel.parse(0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + import xlrd + with pytest.raises(xlrd.XLRDError): + read_excel(excel, 'asdf') + + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 + def test_sheet_name_and_sheetname(self, ext): + # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" + # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) + # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name + # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different + # behavior for "sheetname" argument + filename = "test1" + sheet_name = "Sheet1" + + df_ref = self.get_csv_refdf(filename) + df1 = self.get_exceldf(filename, ext, + sheet_name=sheet_name, index_col=0) # doc + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf(filename, ext, index_col=0, + sheetname=sheet_name) # backward compat + + excel = self.get_excelfile(filename, ext) + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df2_parse = excel.parse(index_col=0, + sheetname=sheet_name) # backward compat + + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + + +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestXlrdReader(XlrdReadingTestsBase): + """ + This is the base class for the xlrd tests, and 3 different file formats + are supported: xls, xlsx, xlsm + """ + + @td.skip_if_no("xlwt") + def test_read_xlrd_book(self, ext): + import xlrd + df = self.frame + + engine = "xlrd" + sheet_name = "SheetA" + + with ensure_clean(ext) as pth: + df.to_excel(pth, sheet_name) + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine=engine) as xl: + result = read_excel(xl, sheet_name, index_col=0) + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) + tm.assert_frame_equal(df, result) + + +class _XlrdWriterBase(XlrdSharedItems): + + @pytest.fixture(autouse=True) + def set_engine_and_path(self, request, merge_cells, engine, ext): + """Fixture to set engine and open file for use in each test case + + Rather than requiring `engine=...` to be provided explicitly as an + argument in each test, this fixture sets a global option to dictate + which engine should be used to write Excel files. After executing + the test it rolls back said change to the global option. + + It also uses a context manager to open a temporary excel file for + the function to write to, accessible via `self.path` + + Notes + ----- + This fixture will run as part of each test method defined in the + class and any subclasses, on account of the `autouse=True` + argument + """ + option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) + prev_engine = get_option(option_name) + set_option(option_name, engine) + with ensure_clean(ext) as path: + self.path = path + yield + set_option(option_name, prev_engine) # Roll back option change + + +@pytest.mark.parametrize("merge_cells", [True, False]) +@pytest.mark.parametrize("engine,ext", [ + pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( + not td.safe_import('openpyxl'), reason='No openpyxl')), + pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( + not td.safe_import('xlwt'), reason='No xlwt')), + pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( + not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) +]) + + +class XlrdTestExcelWriter(_XlrdWriterBase): + # Base class for test cases to run with different Excel writers. + + def test_excel_sheet_by_name_raise(self, *_): + import xlrd + + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(self.path) + + xl = ExcelFile(self.path) + df = read_excel(xl, 0, index_col=0) + + tm.assert_frame_equal(gt, df) + + with pytest.raises(xlrd.XLRDError): + read_excel(xl, "0") \ No newline at end of file From 0f9d7f67b53b8c5b9f4b6fe6068b423cc308ec53 Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 13 Jan 2019 12:53:48 +0000 Subject: [PATCH 2/6] Split test_excel.py (#24472) --- pandas/tests/io/excel/base.py | 14 +- pandas/tests/io/excel/xlrd.py | 21 +- pandas/tests/io/formats/test_to_excel.py | 2 +- pandas/tests/io/test_excel.py | 2563 ---------------------- 4 files changed, 15 insertions(+), 2585 deletions(-) delete mode 100644 pandas/tests/io/test_excel.py diff --git a/pandas/tests/io/excel/base.py b/pandas/tests/io/excel/base.py index 636855711100b..9144cb5279826 100644 --- a/pandas/tests/io/excel/base.py +++ b/pandas/tests/io/excel/base.py @@ -82,7 +82,6 @@ def set_engine(self, request): [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0], ]) - def test_usecols_diff_positional_int_columns_order(self, ext, usecols): expected = self.get_csv_refdf("test1")[["A", "C"]] result = self.get_exceldf("test1", ext, "Sheet1", @@ -92,7 +91,6 @@ def test_usecols_diff_positional_int_columns_order(self, ext, usecols): @pytest.mark.parametrize("usecols", [ ["B", "D"], ["D", "B"] ]) - def test_usecols_diff_positional_str_columns_order(self, ext, usecols): expected = self.get_csv_refdf("test1")[["B", "D"]] expected.index = range(len(expected)) @@ -931,7 +929,7 @@ def test_read_excel_squeeze(self, ext): actual = pd.read_excel(f, 'one_column', squeeze=True) expected = pd.Series([1, 2, 3], name='a') tm.assert_series_equal(actual, expected) - + class _WriterBase(SharedItems): @@ -973,8 +971,6 @@ class and any subclasses, on account of the `autouse=True` pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) ]) - - class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. @@ -1958,7 +1954,7 @@ def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): for index, cell_value in enumerate(expected): assert wb2.worksheets[index]['A1'].value == cell_value - + @td.skip_if_no('xlwt') @pytest.mark.parametrize("merge_cells,ext,engine", [ @@ -2019,7 +2015,7 @@ def test_write_append_mode_raises(self, merge_cells, ext, engine): with ensure_clean(ext) as f: with pytest.raises(ValueError, match=msg): ExcelWriter(f, engine=engine, mode='a') - + @td.skip_if_no('xlsxwriter') @pytest.mark.parametrize("merge_cells,ext,engine", [ @@ -2142,7 +2138,7 @@ def check_called(func): check_called( lambda: df.to_excel( 'something.xls', engine='dummy')) - + @pytest.mark.parametrize('engine', [ pytest.param('xlwt', @@ -2294,7 +2290,7 @@ def custom_converter(css): n_cells += 1 assert n_cells == (11 + 1) * (3 + 1) - + @td.skip_if_no('openpyxl') @pytest.mark.skipif(not PY36, reason='requires fspath') diff --git a/pandas/tests/io/excel/xlrd.py b/pandas/tests/io/excel/xlrd.py index b4b09cbe5e716..3e328923a457e 100644 --- a/pandas/tests/io/excel/xlrd.py +++ b/pandas/tests/io/excel/xlrd.py @@ -83,11 +83,11 @@ def get_exceldf(self, basename, ext, *args, **kwds): """ pth = os.path.join(self.dirpath, basename + ext) return read_excel(pth, *args, **kwds) - - + + class XlrdReadingTestsBase(XlrdSharedItems): # This is based on ExcelWriterBase - + @pytest.fixture(autouse=True, params=['xlrd']) def set_engine(self, request): func_name = "get_exceldf" @@ -102,7 +102,6 @@ def set_engine(self, request): [1, 0, 3], [1, 3, 0], [3, 0, 1], [3, 1, 0], ]) - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): @@ -192,7 +191,7 @@ def test_usecols_str(self, ext): index_col=0, usecols='A,C:D') tm.assert_frame_equal(df2, df1, check_names=False) tm.assert_frame_equal(df3, df1, check_names=False) - + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 def test_deprecated_sheetname(self, ext): # gh-17964 @@ -233,7 +232,7 @@ def test_excel_table_sheet_by_index(self, ext): import xlrd with pytest.raises(xlrd.XLRDError): read_excel(excel, 'asdf') - + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_sheet_name_and_sheetname(self, ext): # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" @@ -262,7 +261,7 @@ def test_sheet_name_and_sheetname(self, ext): tm.assert_frame_equal(df2, df_ref, check_names=False) tm.assert_frame_equal(df1_parse, df_ref, check_names=False) tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - + @pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) class TestXlrdReader(XlrdReadingTestsBase): @@ -290,8 +289,8 @@ def test_read_xlrd_book(self, ext): result = read_excel(book, sheet_name=sheet_name, engine=engine, index_col=0) tm.assert_frame_equal(df, result) - - + + class _XlrdWriterBase(XlrdSharedItems): @pytest.fixture(autouse=True) @@ -332,8 +331,6 @@ class and any subclasses, on account of the `autouse=True` pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) ]) - - class XlrdTestExcelWriter(_XlrdWriterBase): # Base class for test cases to run with different Excel writers. @@ -349,4 +346,4 @@ def test_excel_sheet_by_name_raise(self, *_): tm.assert_frame_equal(gt, df) with pytest.raises(xlrd.XLRDError): - read_excel(xl, "0") \ No newline at end of file + read_excel(xl, "0") diff --git a/pandas/tests/io/formats/test_to_excel.py b/pandas/tests/io/formats/test_to_excel.py index 13eb517fcab6a..2bbbc002db729 100644 --- a/pandas/tests/io/formats/test_to_excel.py +++ b/pandas/tests/io/formats/test_to_excel.py @@ -1,6 +1,6 @@ """Tests formatting as writer-agnostic ExcelCells -ExcelFormatter is tested implicitly in pandas/tests/io/test_excel.py +ExcelFormatter is tested implicitly in pandas/tests/io/mainF.py """ import pytest diff --git a/pandas/tests/io/test_excel.py b/pandas/tests/io/test_excel.py deleted file mode 100644 index 84383afed1d03..0000000000000 --- a/pandas/tests/io/test_excel.py +++ /dev/null @@ -1,2563 +0,0 @@ -from collections import OrderedDict -import contextlib -from datetime import date, datetime, time, timedelta -from distutils.version import LooseVersion -from functools import partial -import os -import warnings -from warnings import catch_warnings - -import numpy as np -from numpy import nan -import pytest - -from pandas.compat import PY36, BytesIO, iteritems, map, range, u -import pandas.util._test_decorators as td - -import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series -from pandas.core.config import get_option, set_option -import pandas.util.testing as tm -from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf - -from pandas.io.common import URLError -from pandas.io.excel import ( - ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, - read_excel, register_writer) -from pandas.io.formats.excel import ExcelFormatter -from pandas.io.parsers import read_csv - -_seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() -_frame = DataFrame(_seriesd)[:10] -_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] -_tsframe = tm.makeTimeDataFrame()[:5] -_mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' - - -@contextlib.contextmanager -def ignore_xlrd_time_clock_warning(): - """ - Context manager to ignore warnings raised by the xlrd library, - regarding the deprecation of `time.clock` in Python 3.7. - """ - with warnings.catch_warnings(): - warnings.filterwarnings( - action='ignore', - message='time.clock has been deprecated', - category=DeprecationWarning) - yield - - -@td.skip_if_no('xlrd', '1.0.0') -class SharedItems(object): - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath("io", "data") - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() - - def get_csv_refdf(self, basename): - """ - Obtain the reference data from read_csv with the Python engine. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - dfref : DataFrame - """ - pref = os.path.join(self.dirpath, basename + '.csv') - dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') - return dfref - - def get_excelfile(self, basename, ext): - """ - Return test data ExcelFile instance. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - excel : io.excel.ExcelFile - """ - return ExcelFile(os.path.join(self.dirpath, basename + ext)) - - def get_exceldf(self, basename, ext, *args, **kwds): - """ - Return test data DataFrame. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - df : DataFrame - """ - pth = os.path.join(self.dirpath, basename + ext) - return read_excel(pth, *args, **kwds) - - -class ReadingTestsBase(SharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=['xlrd', None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_usecols_int(self, ext): - - df_ref = self.get_csv_refdf("test1") - df_ref = df_ref.reindex(columns=["A", "B", "C"]) - - # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df1 = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=3) - - # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, usecols=3) - - # parse_cols instead of usecols, usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, parse_cols=3) - - # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - tm.assert_frame_equal(df3, df_ref, check_names=False) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_list(self, ext): - - dfref = self.get_csv_refdf('test1') - dfref = dfref.reindex(columns=['B', 'C']) - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) - - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=[0, 2, 3]) - - # TODO add index to xls file) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df3, dfref, check_names=False) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_str(self, ext): - - dfref = self.get_csv_refdf('test1') - - df1 = dfref.reindex(columns=['A', 'B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') - - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols='A:D') - - # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - tm.assert_frame_equal(df4, df1, check_names=False) - - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') - # TODO add index to xls file - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - - @pytest.mark.parametrize("usecols", [ - [0, 1, 3], [0, 3, 1], - [1, 0, 3], [1, 3, 0], - [3, 0, 1], [3, 1, 0], - ]) - def test_usecols_diff_positional_int_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["A", "C"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) - - @pytest.mark.parametrize("usecols", [ - ["B", "D"], ["D", "B"] - ]) - def test_usecols_diff_positional_str_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["B", "D"]] - expected.index = range(len(expected)) - - result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) - - def test_read_excel_without_slicing(self, ext): - expected = self.get_csv_refdf("test1") - result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) - tm.assert_frame_equal(result, expected, check_names=False) - - def test_usecols_excel_range_str(self, ext): - expected = self.get_csv_refdf("test1")[["C", "D"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols="A,D:E") - tm.assert_frame_equal(result, expected, check_names=False) - - def test_usecols_excel_range_str_invalid(self, ext): - msg = "Invalid column name: E1" - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") - - def test_index_col_label_error(self, ext): - msg = "list indices must be integers.*, not str" - - with pytest.raises(TypeError, match=msg): - self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], - usecols=["A", "C"]) - - def test_index_col_empty(self, ext): - # see gh-9208 - result = self.get_exceldf("test1", ext, "Sheet3", - index_col=["A", "B", "C"]) - expected = DataFrame(columns=["D", "E", "F"], - index=MultiIndex(levels=[[]] * 3, - codes=[[]] * 3, - names=["A", "B", "C"])) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("index_col", [None, 2]) - def test_index_col_with_unnamed(self, ext, index_col): - # see gh-18792 - result = self.get_exceldf("test1", ext, "Sheet4", - index_col=index_col) - expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], - columns=["Unnamed: 0", "col1", "col2"]) - if index_col: - expected = expected.set_index(expected.columns[index_col]) - - tm.assert_frame_equal(result, expected) - - def test_usecols_pass_non_existent_column(self, ext): - msg = ("Usecols do not match columns, " - "columns expected but not found: " + r"\['E'\]") - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E"]) - - def test_usecols_wrong_type(self, ext): - msg = ("'usecols' must either be list-like of " - "all strings, all unicode, all integers or a callable.") - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E1", 0]) - - def test_excel_stop_iterator(self, ext): - - parsed = self.get_exceldf('test2', ext, 'Sheet1') - expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) - tm.assert_frame_equal(parsed, expected) - - def test_excel_cell_error_na(self, ext): - - parsed = self.get_exceldf('test3', ext, 'Sheet1') - expected = DataFrame([[np.nan]], columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - def test_excel_passes_na(self, ext): - - excel = self.get_excelfile('test4', ext) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - # 13967 - excel = self.get_excelfile('test5', ext) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_deprecated_sheetname(self, ext): - # gh-17964 - excel = self.get_excelfile('test1', ext) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - read_excel(excel, sheetname='Sheet1') - - with pytest.raises(TypeError): - read_excel(excel, sheet='Sheet1') - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_excel_table_sheet_by_index(self, ext): - - excel = self.get_excelfile('test1', ext) - dfref = self.get_csv_refdf('test1') - - df1 = read_excel(excel, 0, index_col=0) - df2 = read_excel(excel, 1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df1 = excel.parse(0, index_col=0) - df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df3 = read_excel(excel, 0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df4 = read_excel(excel, 0, index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df4) - - df3 = excel.parse(0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - import xlrd - with pytest.raises(xlrd.XLRDError): - read_excel(excel, 'asdf') - - def test_excel_table(self, ext): - - dfref = self.get_csv_refdf('test1') - - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0) - # TODO add index to file - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - def test_reader_special_dtypes(self, ext): - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, 4, 0]), - ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), - ("BoolCol", [True, False, True, True, False]), - ("StrCol", [1, 2, 3, 4, 5]), - # GH5394 - this is why convert_float isn't vectorized - ("Str2Col", ["a", 3, "c", "d", "e"]), - ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), - datetime(1905, 1, 1), datetime(2013, 12, 14), - datetime(2015, 3, 14)]) - ])) - basename = 'test_types' - - # should read in correctly and infer types - actual = self.get_exceldf(basename, ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - # if not coercing number, then int comes in as float - float_expected = expected.copy() - float_expected["IntCol"] = float_expected["IntCol"].astype(float) - float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) - tm.assert_frame_equal(actual, float_expected) - - # check setting Index (assuming xls and xlsx are the same here) - for icol, name in enumerate(expected.columns): - actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) - exp = expected.set_index(name) - tm.assert_frame_equal(actual, exp) - - # convert_float and converters should be different but both accepted - expected["StrCol"] = expected["StrCol"].apply(str) - actual = self.get_exceldf( - basename, ext, 'Sheet1', converters={"StrCol": str}) - tm.assert_frame_equal(actual, expected) - - no_convert_float = float_expected.copy() - no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, - converters={"StrCol": str}) - tm.assert_frame_equal(actual, no_convert_float) - - # GH8212 - support for converters and missing values - def test_reader_converters(self, ext): - - basename = 'test_converters' - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, -1000, 0]), - ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), - ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), - ("StrCol", ['1', np.nan, '3', '4', '5']), - ])) - - converters = {'IntCol': lambda x: int(x) if x != '' else -1000, - 'FloatCol': lambda x: 10 * x if x else np.nan, - 2: lambda x: 'Found' if x != '' else 'Not found', - 3: lambda x: str(x) if x else '', - } - - # should read in correctly and set types of single cells (not array - # dtypes) - actual = self.get_exceldf(basename, ext, 'Sheet1', - converters=converters) - tm.assert_frame_equal(actual, expected) - - def test_reader_dtype(self, ext): - # GH 8212 - basename = 'testdtype' - actual = self.get_exceldf(basename, ext) - - expected = DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [2.5, 3.5, 4.5, 5.5], - 'c': [1, 2, 3, 4], - 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( - columns=['a', 'b', 'c', 'd']) - - tm.assert_frame_equal(actual, expected) - - actual = self.get_exceldf(basename, ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str}) - - expected['a'] = expected['a'].astype('float64') - expected['b'] = expected['b'].astype('float32') - expected['c'] = ['001', '002', '003', '004'] - tm.assert_frame_equal(actual, expected) - - with pytest.raises(ValueError): - self.get_exceldf(basename, ext, dtype={'d': 'int64'}) - - @pytest.mark.parametrize("dtype,expected", [ - (None, - DataFrame({ - "a": [1, 2, 3, 4], - "b": [2.5, 3.5, 4.5, 5.5], - "c": [1, 2, 3, 4], - "d": [1.0, 2.0, np.nan, 4.0] - })), - ({"a": "float64", - "b": "float32", - "c": str, - "d": str - }, - DataFrame({ - "a": Series([1, 2, 3, 4], dtype="float64"), - "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": ["001", "002", "003", "004"], - "d": ["1", "2", np.nan, "4"] - })), - ]) - def test_reader_dtype_str(self, ext, dtype, expected): - # see gh-20377 - basename = "testdtype" - - actual = self.get_exceldf(basename, ext, dtype=dtype) - tm.assert_frame_equal(actual, expected) - - def test_reading_all_sheets(self, ext): - # Test reading all sheetnames by setting sheetname to None, - # Ensure a dict is returned. - # See PR #9450 - basename = 'test_multisheet' - dfs = self.get_exceldf(basename, ext, sheet_name=None) - # ensure this is not alphabetical to test order preservation - expected_keys = ['Charlie', 'Alpha', 'Beta'] - tm.assert_contains_all(expected_keys, dfs.keys()) - # Issue 9930 - # Ensure sheet order is preserved - assert expected_keys == list(dfs.keys()) - - def test_reading_multiple_specific_sheets(self, ext): - # Test reading specific sheetnames by specifying a mixed list - # of integers and strings, and confirm that duplicated sheet - # references (positions/names) are removed properly. - # Ensure a dict is returned - # See PR #9450 - basename = 'test_multisheet' - # Explicitly request duplicates. Only the set should be returned. - expected_keys = [2, 'Charlie', 'Charlie'] - dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) - expected_keys = list(set(expected_keys)) - tm.assert_contains_all(expected_keys, dfs.keys()) - assert len(expected_keys) == len(dfs.keys()) - - def test_reading_all_sheets_with_blank(self, ext): - # Test reading all sheetnames by setting sheetname to None, - # In the case where some sheets are blank. - # Issue #11711 - basename = 'blank_with_header' - dfs = self.get_exceldf(basename, ext, sheet_name=None) - expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] - tm.assert_contains_all(expected_keys, dfs.keys()) - - # GH6403 - def test_read_excel_blank(self, ext): - actual = self.get_exceldf('blank', ext, 'Sheet1') - tm.assert_frame_equal(actual, DataFrame()) - - def test_read_excel_blank_with_header(self, ext): - expected = DataFrame(columns=['col_1', 'col_2']) - actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) - def test_read_one_empty_col_no_header(self, ext, header, expected): - # xref gh-12292 - filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) - def test_read_one_empty_col_with_header(self, ext, header, expected): - filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_set_column_names_in_parameter(self, ext): - # GH 12870 : pass down column names associated with - # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) - - refdf.columns = ['A', 'B'] - - with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) - - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) - - def test_date_conversion_overflow(self, ext): - # GH 10001 : pandas.ExcelFile ignore parse_dates=False - expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], - [pd.Timestamp('2016-03-16'), 'Jack Black'], - [1e+20, 'Timothy Brown']], - columns=['DateColWithBigInt', 'StringCol']) - - result = self.get_exceldf('testdateoverflow', ext) - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_sheet_name_and_sheetname(self, ext): - # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" - # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) - # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name - # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument - filename = "test1" - sheet_name = "Sheet1" - - df_ref = self.get_csv_refdf(filename) - df1 = self.get_exceldf(filename, ext, - sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf(filename, ext, index_col=0, - sheetname=sheet_name) # backward compat - - excel = self.get_excelfile(filename, ext) - df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(index_col=0, - sheetname=sheet_name) # backward compat - - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - - def test_sheet_name_both_raises(self, ext): - with pytest.raises(TypeError, match="Cannot specify both"): - self.get_exceldf('test1', ext, sheetname='Sheet1', - sheet_name='Sheet1') - - excel = self.get_excelfile('test1', ext) - with pytest.raises(TypeError, match="Cannot specify both"): - excel.parse(sheetname='Sheet1', - sheet_name='Sheet1') - - def test_excel_read_buffer(self, ext): - - pth = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(pth, 'Sheet1', index_col=0) - with open(pth, 'rb') as f: - actual = read_excel(f, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) - - with open(pth, 'rb') as f: - xls = ExcelFile(f) - actual = read_excel(xls, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) - - def test_bad_engine_raises(self, ext): - bad_engine = 'foo' - with pytest.raises(ValueError, message="Unknown engine: foo"): - read_excel('', engine=bad_engine) - - @tm.network - def test_read_from_http_url(self, ext): - url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1', ext) - tm.assert_frame_equal(url_table, local_table) - - @td.skip_if_not_us_locale - def test_read_from_s3_url(self, ext, s3_resource): - # Bucket "pandas-test" created in tests/io/conftest.py - file_name = os.path.join(self.dirpath, 'test1' + ext) - - with open(file_name, "rb") as f: - s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, - Body=f) - - url = ('s3://pandas-test/test1' + ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1', ext) - tm.assert_frame_equal(url_table, local_table) - - @pytest.mark.slow - # ignore warning from old xlrd - @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") - def test_read_from_file_url(self, ext): - - # FILE - localtable = os.path.join(self.dirpath, 'test1' + ext) - local_table = read_excel(localtable) - - try: - url_table = read_excel('file://localhost/' + localtable) - except URLError: - # fails on some systems - import platform - pytest.skip("failing on %s" % - ' '.join(platform.uname()).strip()) - - tm.assert_frame_equal(url_table, local_table) - - @td.skip_if_no('pathlib') - def test_read_from_pathlib_path(self, ext): - - # GH12655 - from pathlib import Path - - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) - - path_obj = Path(self.dirpath, 'test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) - - tm.assert_frame_equal(expected, actual) - - @td.skip_if_no('py.path') - def test_read_from_py_localpath(self, ext): - - # GH12655 - from py.path import local as LocalPath - - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) - - abs_dir = os.path.abspath(self.dirpath) - path_obj = LocalPath(abs_dir).join('test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) - - tm.assert_frame_equal(expected, actual) - - def test_reader_closes_file(self, ext): - - pth = os.path.join(self.dirpath, 'test1' + ext) - f = open(pth, 'rb') - with ExcelFile(f) as xlsx: - # parses okay - read_excel(xlsx, 'Sheet1', index_col=0) - - assert f.closed - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_creating_and_reading_multiple_sheets(self, ext): - # see gh-9450 - # - # Test reading multiple sheets, from a runtime - # created Excel file with multiple sheets. - def tdf(col_sheet_name): - d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[col_sheet_name]) - - sheets = ["AAA", "BBB", "CCC"] - - dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets, dfs)) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in iteritems(dfs): - df.to_excel(ew, sheetname) - - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) - - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) - - def test_reader_seconds(self, ext): - - # Test reading times with and without milliseconds. GH5945. - expected = DataFrame.from_dict({"Time": [time(1, 2, 3), - time(2, 45, 56, 100000), - time(4, 29, 49, 200000), - time(6, 13, 42, 300000), - time(7, 57, 35, 400000), - time(9, 41, 28, 500000), - time(11, 25, 21, 600000), - time(13, 9, 14, 700000), - time(14, 53, 7, 800000), - time(16, 37, 0, 900000), - time(18, 20, 54)]}) - - actual = self.get_exceldf('times_1900', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - actual = self.get_exceldf('times_1904', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - def test_read_excel_multiindex(self, ext): - # see gh-4679 - mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) - mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) - - # "mi_column" sheet - expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], - [2, 3.5, pd.Timestamp("2015-01-02"), False], - [3, 4.5, pd.Timestamp("2015-01-03"), False], - [4, 5.5, pd.Timestamp("2015-01-04"), True]], - columns=mi) - - actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - # "mi_index" sheet - expected.index = mi - expected.columns = ["a", "b", "c", "d"] - - actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - # "both" sheet - expected.columns = mi - - actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - # "mi_index_name" sheet - expected.columns = ["a", "b", "c", "d"] - expected.index = mi.set_names(["ilvl1", "ilvl2"]) - - actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "mi_column_name" sheet - expected.index = list(range(4)) - expected.columns = mi.set_names(["c1", "c2"]) - actual = read_excel(mi_file, "mi_column_name", - header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - # see gh-11317 - # "name_with_int" sheet - expected.columns = mi.set_levels( - [1, 2], level=1).set_names(["c1", "c2"]) - - actual = read_excel(mi_file, "name_with_int", - index_col=0, header=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "both_name" sheet - expected.columns = mi.set_names(["c1", "c2"]) - expected.index = mi.set_names(["ilvl1", "ilvl2"]) - - actual = read_excel(mi_file, "both_name", - index_col=[0, 1], header=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "both_skiprows" sheet - actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], - header=[0, 1], skiprows=2) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_multiindex_header_only(self, ext): - # see gh-11733. - # - # Don't try to parse a header name if there isn't one. - mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) - result = read_excel(mi_file, "index_col_none", header=[0, 1]) - - exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) - expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlsxwriter") - def test_read_excel_multiindex_empty_level(self, ext): - # see gh-12453 - with ensure_clean(ext) as path: - df = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlsxwriter") - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) - @pytest.mark.parametrize("c_idx_levels", [1, 3]) - @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels): - # see gh-4679 - with ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") - - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = r_idx_names or r_idx_levels <= 1 - - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[0, :] = np.nan - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - def test_excel_old_index_format(self, ext): - # see gh-4679 - filename = "test_index_name_pre17" + ext - in_file = os.path.join(self.dirpath, filename) - - # We detect headers to determine if index names exist, so - # that "index" name in the "names" version of the data will - # now be interpreted as rows that include null data. - data = np.array([[None, None, None, None, None], - ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) - columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", - "R_l0_g2", "R_l0_g3", "R_l0_g4"], - ["R1", "R_l1_g0", "R_l1_g1", - "R_l1_g2", "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], - names=[None, None]) - si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) - - expected = pd.DataFrame(data, index=si, columns=columns) - - actual = pd.read_excel(in_file, "single_names", index_col=0) - tm.assert_frame_equal(actual, expected) - - expected.index = mi - - actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # The analogous versions of the "names" version data - # where there are explicitly no names for the indices. - data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) - columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], - ["R_l1_g0", "R_l1_g1", "R_l1_g2", - "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=[None, None]) - si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) - - expected = pd.DataFrame(data, index=si, columns=columns) - - actual = pd.read_excel(in_file, "single_no_names", index_col=0) - tm.assert_frame_equal(actual, expected) - - expected.index = mi - - actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - def test_read_excel_bool_header_arg(self, ext): - # GH 6114 - for arg in [True, False]: - with pytest.raises(TypeError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - header=arg) - - def test_read_excel_chunksize(self, ext): - # GH 8011 - with pytest.raises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - chunksize=100) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_read_excel_parse_dates(self, ext): - # see gh-11544, gh-12051 - df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) - df2 = df.copy() - df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - - with ensure_clean(ext) as pth: - df2.to_excel(pth) - - res = read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) - - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) - tm.assert_frame_equal(df, res) - - def test_read_excel_skiprows_list(self, ext): - # GH 4903 - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), - 'skiprows_list', skiprows=[0, 2]) - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns=['a', 'b', 'c', 'd']) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), - 'skiprows_list', skiprows=np.array([0, 2])) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows(self, ext): - # GH 16645 - num_rows_to_pull = 5 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) - expected = expected[:num_rows_to_pull] - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): - # GH 16645 - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) - num_records_in_file = len(expected) - num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_non_integer_parameter(self, ext): - # GH 16645 - msg = "'nrows' must be an integer >=0" - with pytest.raises(ValueError, match=msg): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows='5') - - def test_read_excel_squeeze(self, ext): - # GH 12157 - f = os.path.join(self.dirpath, 'test_squeeze' + ext) - - actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) - expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') - expected.index.name = 'a' - tm.assert_series_equal(actual, expected) - - actual = pd.read_excel(f, 'two_columns', squeeze=True) - expected = pd.DataFrame({'a': [4, 5, 6], - 'b': [2, 3, 4]}) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(f, 'one_column', squeeze=True) - expected = pd.Series([1, 2, 3], name='a') - tm.assert_series_equal(actual, expected) - - -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) -class TestXlrdReader(ReadingTestsBase): - """ - This is the base class for the xlrd tests, and 3 different file formats - are supported: xls, xlsx, xlsm - """ - - @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext): - import xlrd - df = self.frame - - engine = "xlrd" - sheet_name = "SheetA" - - with ensure_clean(ext) as pth: - df.to_excel(pth, sheet_name) - book = xlrd.open_workbook(pth) - - with ExcelFile(book, engine=engine) as xl: - result = read_excel(xl, sheet_name, index_col=0) - tm.assert_frame_equal(df, result) - - result = read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) - tm.assert_frame_equal(df, result) - - -class _WriterBase(SharedItems): - - @pytest.fixture(autouse=True) - def set_engine_and_path(self, request, merge_cells, engine, ext): - """Fixture to set engine and open file for use in each test case - - Rather than requiring `engine=...` to be provided explicitly as an - argument in each test, this fixture sets a global option to dictate - which engine should be used to write Excel files. After executing - the test it rolls back said change to the global option. - - It also uses a context manager to open a temporary excel file for - the function to write to, accessible via `self.path` - - Notes - ----- - This fixture will run as part of each test method defined in the - class and any subclasses, on account of the `autouse=True` - argument - """ - option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) - prev_engine = get_option(option_name) - set_option(option_name, engine) - with ensure_clean(ext) as path: - self.path = path - yield - set_option(option_name, prev_engine) # Roll back option change - - -@pytest.mark.parametrize("merge_cells", [True, False]) -@pytest.mark.parametrize("engine,ext", [ - pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( - not td.safe_import('xlwt'), reason='No xlwt')), - pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) -]) -class TestExcelWriter(_WriterBase): - # Base class for test cases to run with different Excel writers. - - def test_excel_sheet_by_name_raise(self, *_): - import xlrd - - gt = DataFrame(np.random.randn(10, 2)) - gt.to_excel(self.path) - - xl = ExcelFile(self.path) - df = read_excel(xl, 0, index_col=0) - - tm.assert_frame_equal(gt, df) - - with pytest.raises(xlrd.XLRDError): - read_excel(xl, "0") - - def test_excel_writer_context_manager(self, *_): - with ExcelWriter(self.path) as writer: - self.frame.to_excel(writer, "Data1") - self.frame2.to_excel(writer, "Data2") - - with ExcelFile(self.path) as reader: - found_df = read_excel(reader, "Data1", index_col=0) - found_df2 = read_excel(reader, "Data2", index_col=0) - - tm.assert_frame_equal(found_df, self.frame) - tm.assert_frame_equal(found_df2, self.frame2) - - def test_roundtrip(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) - - # test roundtrip - self.frame.to_excel(self.path, 'test1') - recons = read_excel(self.path, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(self.path, 'test1', index=False) - recons = read_excel(self.path, 'test1', index_col=None) - recons.index = self.frame.index - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(self.path, 'test1', na_rep='NA') - recons = read_excel(self.path, 'test1', index_col=0, na_values=['NA']) - tm.assert_frame_equal(self.frame, recons) - - # GH 3611 - self.frame.to_excel(self.path, 'test1', na_rep='88') - recons = read_excel(self.path, 'test1', index_col=0, na_values=['88']) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(self.path, 'test1', na_rep='88') - recons = read_excel(self.path, 'test1', index_col=0, - na_values=[88, 88.0]) - tm.assert_frame_equal(self.frame, recons) - - # GH 6573 - self.frame.to_excel(self.path, 'Sheet1') - recons = read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) - - self.frame.to_excel(self.path, '0') - recons = read_excel(self.path, index_col=0) - tm.assert_frame_equal(self.frame, recons) - - # GH 8825 Pandas Series should provide to_excel method - s = self.frame["A"] - s.to_excel(self.path) - recons = read_excel(self.path, index_col=0) - tm.assert_frame_equal(s.to_frame(), recons) - - def test_mixed(self, merge_cells, engine, ext): - self.mixed_frame.to_excel(self.path, 'test1') - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.mixed_frame, recons) - - def test_ts_frame(self, *_): - df = tm.makeTimeDataFrame()[:5] - - df.to_excel(self.path, "test1") - reader = ExcelFile(self.path) - - recons = read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(df, recons) - - def test_basics_with_nan(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) - - @pytest.mark.parametrize("np_type", [ - np.int8, np.int16, np.int32, np.int64]) - def test_int_types(self, merge_cells, engine, ext, np_type): - # Test np.int values read come back as int - # (rather than float which is Excel's format). - frame = DataFrame(np.random.randint(-10, 10, size=(10, 2)), - dtype=np_type) - frame.to_excel(self.path, "test1") - - reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) - - int_frame = frame.astype(np.int64) - tm.assert_frame_equal(int_frame, recons) - - recons2 = read_excel(self.path, "test1", index_col=0) - tm.assert_frame_equal(int_frame, recons2) - - # Test with convert_float=False comes back as float. - float_frame = frame.astype(float) - recons = read_excel(self.path, "test1", - convert_float=False, index_col=0) - tm.assert_frame_equal(recons, float_frame, - check_index_type=False, - check_column_type=False) - - @pytest.mark.parametrize("np_type", [ - np.float16, np.float32, np.float64]) - def test_float_types(self, merge_cells, engine, ext, np_type): - # Test np.float values read come back as float. - frame = DataFrame(np.random.random_sample(10), dtype=np_type) - frame.to_excel(self.path, "test1") - - reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0).astype(np_type) - - tm.assert_frame_equal(frame, recons, check_dtype=False) - - @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) - def test_bool_types(self, merge_cells, engine, ext, np_type): - # Test np.bool values read come back as float. - frame = (DataFrame([1, 0, True, False], dtype=np_type)) - frame.to_excel(self.path, "test1") - - reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0).astype(np_type) - - tm.assert_frame_equal(frame, recons) - - def test_inf_roundtrip(self, *_): - frame = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) - frame.to_excel(self.path, "test1") - - reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) - - tm.assert_frame_equal(frame, recons) - - def test_sheets(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) - - # Test writing to separate sheets - writer = ExcelWriter(self.path) - self.frame.to_excel(writer, 'test1') - self.tsframe.to_excel(writer, 'test2') - writer.save() - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=0) - tm.assert_frame_equal(self.frame, recons) - recons = read_excel(reader, 'test2', index_col=0) - tm.assert_frame_equal(self.tsframe, recons) - assert 2 == len(reader.sheet_names) - assert 'test1' == reader.sheet_names[0] - assert 'test2' == reader.sheet_names[1] - - def test_colaliases(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) - - # column aliases - col_aliases = Index(['AA', 'X', 'Y', 'Z']) - self.frame2.to_excel(self.path, 'test1', header=col_aliases) - reader = ExcelFile(self.path) - rs = read_excel(reader, 'test1', index_col=0) - xp = self.frame2.copy() - xp.columns = col_aliases - tm.assert_frame_equal(xp, rs) - - def test_roundtrip_indexlabels(self, merge_cells, engine, ext): - self.frame['A'][:5] = nan - - self.frame.to_excel(self.path, 'test1') - self.frame.to_excel(self.path, 'test1', columns=['A', 'B']) - self.frame.to_excel(self.path, 'test1', header=False) - self.frame.to_excel(self.path, 'test1', index=False) - - # test index_label - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, 'test1', - index_label=['test'], - merge_cells=merge_cells) - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label=['test', 'dummy', 'dummy2'], - merge_cells=merge_cells) - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - assert frame.index.names == recons.index.names - - frame = (DataFrame(np.random.randn(10, 2)) >= 0) - frame.to_excel(self.path, - 'test1', - index_label='test', - merge_cells=merge_cells) - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) - frame.index.names = ['test'] - tm.assert_frame_equal(frame, recons.astype(bool)) - - self.frame.to_excel(self.path, - 'test1', - columns=['A', 'B', 'C', 'D'], - index=False, merge_cells=merge_cells) - # take 'A' and 'B' as indexes (same row as cols 'C', 'D') - df = self.frame.copy() - df = df.set_index(['A', 'B']) - - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', index_col=[0, 1]) - tm.assert_frame_equal(df, recons, check_less_precise=True) - - def test_excel_roundtrip_indexname(self, merge_cells, engine, ext): - df = DataFrame(np.random.randn(10, 4)) - df.index.name = 'foo' - - df.to_excel(self.path, merge_cells=merge_cells) - - xf = ExcelFile(self.path) - result = read_excel(xf, xf.sheet_names[0], - index_col=0) - - tm.assert_frame_equal(result, df) - assert result.index.name == 'foo' - - def test_excel_roundtrip_datetime(self, merge_cells, *_): - # datetime.date, not sure what to test here exactly - tsf = self.tsframe.copy() - - tsf.index = [x.date() for x in self.tsframe.index] - tsf.to_excel(self.path, "test1", merge_cells=merge_cells) - - reader = ExcelFile(self.path) - recons = read_excel(reader, "test1", index_col=0) - - tm.assert_frame_equal(self.tsframe, recons) - - def test_excel_date_datetime_format(self, merge_cells, engine, ext): - # see gh-4133 - # - # Excel output format strings - df = DataFrame([[date(2014, 1, 31), - date(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) - df_expected = DataFrame([[datetime(2014, 1, 31), - datetime(1999, 9, 24)], - [datetime(1998, 5, 26, 23, 33, 4), - datetime(2014, 2, 28, 13, 5, 13)]], - index=["DATE", "DATETIME"], columns=["X", "Y"]) - - with ensure_clean(ext) as filename2: - writer1 = ExcelWriter(self.path) - writer2 = ExcelWriter(filename2, - date_format="DD.MM.YYYY", - datetime_format="DD.MM.YYYY HH-MM-SS") - - df.to_excel(writer1, "test1") - df.to_excel(writer2, "test1") - - writer1.close() - writer2.close() - - reader1 = ExcelFile(self.path) - reader2 = ExcelFile(filename2) - - rs1 = read_excel(reader1, "test1", index_col=0) - rs2 = read_excel(reader2, "test1", index_col=0) - - tm.assert_frame_equal(rs1, rs2) - - # Since the reader returns a datetime object for dates, - # we need to use df_expected to check the result. - tm.assert_frame_equal(rs2, df_expected) - - def test_to_excel_interval_no_labels(self, *_): - # see gh-19242 - # - # Test writing Interval without labels. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - - frame["new"] = pd.cut(frame[0], 10) - expected["new"] = pd.cut(expected[0], 10).astype(str) - - frame.to_excel(self.path, "test1") - reader = ExcelFile(self.path) - - recons = read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(expected, recons) - - def test_to_excel_interval_labels(self, *_): - # see gh-19242 - # - # Test writing Interval with labels. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - dtype=np.int64) - expected = frame.copy() - intervals = pd.cut(frame[0], 10, labels=["A", "B", "C", "D", "E", - "F", "G", "H", "I", "J"]) - frame["new"] = intervals - expected["new"] = pd.Series(list(intervals)) - - frame.to_excel(self.path, "test1") - reader = ExcelFile(self.path) - - recons = read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(expected, recons) - - def test_to_excel_timedelta(self, *_): - # see gh-19242, gh-9155 - # - # Test writing timedelta to xls. - frame = DataFrame(np.random.randint(-10, 10, size=(20, 1)), - columns=["A"], dtype=np.int64) - expected = frame.copy() - - frame["new"] = frame["A"].apply(lambda x: timedelta(seconds=x)) - expected["new"] = expected["A"].apply( - lambda x: timedelta(seconds=x).total_seconds() / float(86400)) - - frame.to_excel(self.path, "test1") - reader = ExcelFile(self.path) - - recons = read_excel(reader, "test1", index_col=0) - tm.assert_frame_equal(expected, recons) - - def test_to_excel_periodindex(self, merge_cells, engine, ext): - frame = self.tsframe - xp = frame.resample('M', kind='period').mean() - - xp.to_excel(self.path, 'sht1') - - reader = ExcelFile(self.path) - rs = read_excel(reader, 'sht1', index_col=0) - tm.assert_frame_equal(xp, rs.to_period('M')) - - def test_to_excel_multiindex(self, merge_cells, engine, ext): - frame = self.frame - arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) - frame.index = new_index - - frame.to_excel(self.path, 'test1', header=False) - frame.to_excel(self.path, 'test1', columns=['A', 'B']) - - # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) - reader = ExcelFile(self.path) - df = read_excel(reader, 'test1', index_col=[0, 1]) - tm.assert_frame_equal(frame, df) - - # GH13511 - def test_to_excel_multiindex_nan_label(self, merge_cells, engine, ext): - frame = pd.DataFrame({'A': [None, 2, 3], - 'B': [10, 20, 30], - 'C': np.random.sample(3)}) - frame = frame.set_index(['A', 'B']) - - frame.to_excel(self.path, merge_cells=merge_cells) - df = read_excel(self.path, index_col=[0, 1]) - tm.assert_frame_equal(frame, df) - - # Test for Issue 11328. If column indices are integers, make - # sure they are handled correctly for either setting of - # merge_cells - def test_to_excel_multiindex_cols(self, merge_cells, engine, ext): - frame = self.frame - arrays = np.arange(len(frame.index) * 2).reshape(2, -1) - new_index = MultiIndex.from_arrays(arrays, - names=['first', 'second']) - frame.index = new_index - - new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), - (50, 1), (50, 2)]) - frame.columns = new_cols_index - header = [0, 1] - if not merge_cells: - header = 0 - - # round trip - frame.to_excel(self.path, 'test1', merge_cells=merge_cells) - reader = ExcelFile(self.path) - df = read_excel(reader, 'test1', header=header, - index_col=[0, 1]) - if not merge_cells: - fm = frame.columns.format(sparsify=False, - adjoin=False, names=False) - frame.columns = [".".join(map(str, q)) for q in zip(*fm)] - tm.assert_frame_equal(frame, df) - - def test_to_excel_multiindex_dates(self, merge_cells, engine, ext): - # try multiindex with dates - tsframe = self.tsframe.copy() - new_index = [tsframe.index, np.arange(len(tsframe.index))] - tsframe.index = MultiIndex.from_arrays(new_index) - - tsframe.index.names = ['time', 'foo'] - tsframe.to_excel(self.path, 'test1', merge_cells=merge_cells) - reader = ExcelFile(self.path) - recons = read_excel(reader, 'test1', - index_col=[0, 1]) - - tm.assert_frame_equal(tsframe, recons) - assert recons.index.names == ('time', 'foo') - - def test_to_excel_multiindex_no_write_index(self, merge_cells, engine, - ext): - # Test writing and re-reading a MI witout the index. GH 5616. - - # Initial non-MI frame. - frame1 = DataFrame({'a': [10, 20], 'b': [30, 40], 'c': [50, 60]}) - - # Add a MI. - frame2 = frame1.copy() - multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) - frame2.index = multi_index - - # Write out to Excel without the index. - frame2.to_excel(self.path, 'test1', index=False) - - # Read it back in. - reader = ExcelFile(self.path) - frame3 = read_excel(reader, 'test1') - - # Test that it is the same as the initial frame. - tm.assert_frame_equal(frame1, frame3) - - def test_to_excel_float_format(self, *_): - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) - df.to_excel(self.path, "test1", float_format="%.2f") - - reader = ExcelFile(self.path) - result = read_excel(reader, "test1", index_col=0) - - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) - tm.assert_frame_equal(result, expected) - - def test_to_excel_output_encoding(self, merge_cells, engine, ext): - # Avoid mixed inferred_type. - df = DataFrame([[u"\u0192", u"\u0193", u"\u0194"], - [u"\u0195", u"\u0196", u"\u0197"]], - index=[u"A\u0192", u"B"], - columns=[u"X\u0193", u"Y", u"Z"]) - - with ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: - df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") - result = read_excel(filename, "TestSheet", - encoding="utf8", index_col=0) - tm.assert_frame_equal(result, df) - - def test_to_excel_unicode_filename(self, merge_cells, engine, ext): - with ensure_clean(u("\u0192u.") + ext) as filename: - try: - f = open(filename, "wb") - except UnicodeEncodeError: - pytest.skip("No unicode file names on this system") - else: - f.close() - - df = DataFrame([[0.123456, 0.234567, 0.567567], - [12.32112, 123123.2, 321321.2]], - index=["A", "B"], columns=["X", "Y", "Z"]) - df.to_excel(filename, "test1", float_format="%.2f") - - reader = ExcelFile(filename) - result = read_excel(reader, "test1", index_col=0) - - expected = DataFrame([[0.12, 0.23, 0.57], - [12.32, 123123.20, 321321.20]], - index=["A", "B"], columns=["X", "Y", "Z"]) - tm.assert_frame_equal(result, expected) - - # def test_to_excel_header_styling_xls(self, merge_cells, engine, ext): - - # import StringIO - # s = StringIO( - # """Date,ticker,type,value - # 2001-01-01,x,close,12.2 - # 2001-01-01,x,open ,12.1 - # 2001-01-01,y,close,12.2 - # 2001-01-01,y,open ,12.1 - # 2001-02-01,x,close,12.2 - # 2001-02-01,x,open ,12.1 - # 2001-02-01,y,close,12.2 - # 2001-02-01,y,open ,12.1 - # 2001-03-01,x,close,12.2 - # 2001-03-01,x,open ,12.1 - # 2001-03-01,y,close,12.2 - # 2001-03-01,y,open ,12.1""") - # df = read_csv(s, parse_dates=["Date"]) - # pdf = df.pivot_table(values="value", rows=["ticker"], - # cols=["Date", "type"]) - - # try: - # import xlwt - # import xlrd - # except ImportError: - # pytest.skip - - # filename = '__tmp_to_excel_header_styling_xls__.xls' - # pdf.to_excel(filename, 'test1') - - # wbk = xlrd.open_workbook(filename, - # formatting_info=True) - # assert ["test1"] == wbk.sheet_names() - # ws = wbk.sheet_by_name('test1') - # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells - # for i in range(0, 2): - # for j in range(0, 7): - # xfx = ws.cell_xf_index(0, 0) - # cell_xf = wbk.xf_list[xfx] - # font = wbk.font_list - # assert 1 == font[cell_xf.font_index].bold - # assert 1 == cell_xf.border.top_line_style - # assert 1 == cell_xf.border.right_line_style - # assert 1 == cell_xf.border.bottom_line_style - # assert 1 == cell_xf.border.left_line_style - # assert 2 == cell_xf.alignment.hor_align - # os.remove(filename) - # def test_to_excel_header_styling_xlsx(self, merge_cells, engine, ext): - # import StringIO - # s = StringIO( - # """Date,ticker,type,value - # 2001-01-01,x,close,12.2 - # 2001-01-01,x,open ,12.1 - # 2001-01-01,y,close,12.2 - # 2001-01-01,y,open ,12.1 - # 2001-02-01,x,close,12.2 - # 2001-02-01,x,open ,12.1 - # 2001-02-01,y,close,12.2 - # 2001-02-01,y,open ,12.1 - # 2001-03-01,x,close,12.2 - # 2001-03-01,x,open ,12.1 - # 2001-03-01,y,close,12.2 - # 2001-03-01,y,open ,12.1""") - # df = read_csv(s, parse_dates=["Date"]) - # pdf = df.pivot_table(values="value", rows=["ticker"], - # cols=["Date", "type"]) - # try: - # import openpyxl - # from openpyxl.cell import get_column_letter - # except ImportError: - # pytest.skip - # if openpyxl.__version__ < '1.6.1': - # pytest.skip - # # test xlsx_styling - # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx' - # pdf.to_excel(filename, 'test1') - # wbk = openpyxl.load_workbook(filename) - # assert ["test1"] == wbk.get_sheet_names() - # ws = wbk.get_sheet_by_name('test1') - # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))] - # xlsaddrs += ["A%s" % i for i in range(1, 6)] - # xlsaddrs += ["B1", "D1", "F1"] - # for xlsaddr in xlsaddrs: - # cell = ws.cell(xlsaddr) - # assert cell.style.font.bold - # assert (openpyxl.style.Border.BORDER_THIN == - # cell.style.borders.top.border_style) - # assert (openpyxl.style.Border.BORDER_THIN == - # cell.style.borders.right.border_style) - # assert (openpyxl.style.Border.BORDER_THIN == - # cell.style.borders.bottom.border_style) - # assert (openpyxl.style.Border.BORDER_THIN == - # cell.style.borders.left.border_style) - # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER == - # cell.style.alignment.horizontal) - # mergedcells_addrs = ["C1", "E1", "G1"] - # for maddr in mergedcells_addrs: - # assert ws.cell(maddr).merged - # os.remove(filename) - - @pytest.mark.parametrize("use_headers", [True, False]) - @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) - @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) - def test_excel_010_hemstring(self, merge_cells, engine, ext, - c_idx_nlevels, r_idx_nlevels, use_headers): - - def roundtrip(data, header=True, parser_hdr=0, index=True): - data.to_excel(self.path, header=header, - merge_cells=merge_cells, index=index) - - xf = ExcelFile(self.path) - return read_excel(xf, xf.sheet_names[0], header=parser_hdr) - - # Basic test. - parser_header = 0 if use_headers else None - res = roundtrip(DataFrame([0]), use_headers, parser_header) - - assert res.shape == (1, 2) - assert res.iloc[0, 0] is not np.nan - - # More complex tests with multi-index. - nrows = 5 - ncols = 3 - - from pandas.util.testing import makeCustomDataframe as mkdf - # ensure limited functionality in 0.10 - # override of gh-2370 until sorted out in 0.11 - - df = mkdf(nrows, ncols, r_idx_nlevels=r_idx_nlevels, - c_idx_nlevels=c_idx_nlevels) - - # This if will be removed once multi-column Excel writing - # is implemented. For now fixing gh-9794. - if c_idx_nlevels > 1: - with pytest.raises(NotImplementedError): - roundtrip(df, use_headers, index=False) - else: - res = roundtrip(df, use_headers) - - if use_headers: - assert res.shape == (nrows, ncols + r_idx_nlevels) - else: - # First row taken as columns. - assert res.shape == (nrows - 1, ncols + r_idx_nlevels) - - # No NaNs. - for r in range(len(res.index)): - for c in range(len(res.columns)): - assert res.iloc[r, c] is not np.nan - - def test_duplicated_columns(self, *_): - # see gh-5235 - df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B"]) - df.to_excel(self.path, "test1") - expected = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], - columns=["A", "B", "B.1"]) - - # By default, we mangle. - result = read_excel(self.path, "test1", index_col=0) - tm.assert_frame_equal(result, expected) - - # Explicitly, we pass in the parameter. - result = read_excel(self.path, "test1", index_col=0, - mangle_dupe_cols=True) - tm.assert_frame_equal(result, expected) - - # see gh-11007, gh-10970 - df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A", "B"]) - df.to_excel(self.path, "test1") - - result = read_excel(self.path, "test1", index_col=0) - expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], - columns=["A", "B", "A.1", "B.1"]) - tm.assert_frame_equal(result, expected) - - # see gh-10982 - df.to_excel(self.path, "test1", index=False, header=False) - result = read_excel(self.path, "test1", header=None) - - expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) - tm.assert_frame_equal(result, expected) - - msg = "Setting mangle_dupe_cols=False is not supported yet" - with pytest.raises(ValueError, match=msg): - read_excel(self.path, "test1", header=None, mangle_dupe_cols=False) - - def test_swapped_columns(self, merge_cells, engine, ext): - # Test for issue #5427. - write_frame = DataFrame({'A': [1, 1, 1], - 'B': [2, 2, 2]}) - write_frame.to_excel(self.path, 'test1', columns=['B', 'A']) - - read_frame = read_excel(self.path, 'test1', header=0) - - tm.assert_series_equal(write_frame['A'], read_frame['A']) - tm.assert_series_equal(write_frame['B'], read_frame['B']) - - def test_invalid_columns(self, *_): - # see gh-10982 - write_frame = DataFrame({"A": [1, 1, 1], - "B": [2, 2, 2]}) - - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - write_frame.to_excel(self.path, "test1", columns=["B", "C"]) - - expected = write_frame.reindex(columns=["B", "C"]) - read_frame = read_excel(self.path, "test1", index_col=0) - tm.assert_frame_equal(expected, read_frame) - - with pytest.raises(KeyError): - write_frame.to_excel(self.path, "test1", columns=["C", "D"]) - - def test_comment_arg(self, *_): - # see gh-18735 - # - # Test the comment argument functionality to read_excel. - - # Create file to read in. - df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) - df.to_excel(self.path, "test_c") - - # Read file without comment arg. - result1 = read_excel(self.path, "test_c", index_col=0) - - result1.iloc[1, 0] = None - result1.iloc[1, 1] = None - result1.iloc[2, 1] = None - - result2 = read_excel(self.path, "test_c", comment="#", index_col=0) - tm.assert_frame_equal(result1, result2) - - def test_comment_default(self, merge_cells, engine, ext): - # Re issue #18735 - # Test the comment argument default to read_excel - - # Create file to read in - df = DataFrame({'A': ['one', '#one', 'one'], - 'B': ['two', 'two', '#two']}) - df.to_excel(self.path, 'test_c') - - # Read file with default and explicit comment=None - result1 = read_excel(self.path, 'test_c') - result2 = read_excel(self.path, 'test_c', comment=None) - tm.assert_frame_equal(result1, result2) - - def test_comment_used(self, *_): - # see gh-18735 - # - # Test the comment argument is working as expected when used. - - # Create file to read in. - df = DataFrame({"A": ["one", "#one", "one"], - "B": ["two", "two", "#two"]}) - df.to_excel(self.path, "test_c") - - # Test read_frame_comment against manually produced expected output. - expected = DataFrame({"A": ["one", None, "one"], - "B": ["two", None, None]}) - result = read_excel(self.path, "test_c", comment="#", index_col=0) - tm.assert_frame_equal(result, expected) - - def test_comment_empty_line(self, merge_cells, engine, ext): - # Re issue #18735 - # Test that read_excel ignores commented lines at the end of file - - df = DataFrame({'a': ['1', '#2'], 'b': ['2', '3']}) - df.to_excel(self.path, index=False) - - # Test that all-comment lines at EoF are ignored - expected = DataFrame({'a': [1], 'b': [2]}) - result = read_excel(self.path, comment='#') - tm.assert_frame_equal(result, expected) - - def test_datetimes(self, merge_cells, engine, ext): - - # Test writing and reading datetimes. For issue #9139. (xref #9185) - datetimes = [datetime(2013, 1, 13, 1, 2, 3), - datetime(2013, 1, 13, 2, 45, 56), - datetime(2013, 1, 13, 4, 29, 49), - datetime(2013, 1, 13, 6, 13, 42), - datetime(2013, 1, 13, 7, 57, 35), - datetime(2013, 1, 13, 9, 41, 28), - datetime(2013, 1, 13, 11, 25, 21), - datetime(2013, 1, 13, 13, 9, 14), - datetime(2013, 1, 13, 14, 53, 7), - datetime(2013, 1, 13, 16, 37, 0), - datetime(2013, 1, 13, 18, 20, 52)] - - write_frame = DataFrame({'A': datetimes}) - write_frame.to_excel(self.path, 'Sheet1') - read_frame = read_excel(self.path, 'Sheet1', header=0) - - tm.assert_series_equal(write_frame['A'], read_frame['A']) - - def test_bytes_io(self, merge_cells, engine, ext): - # see gh-7074 - bio = BytesIO() - df = DataFrame(np.random.randn(10, 2)) - - # Pass engine explicitly, as there is no file path to infer from. - writer = ExcelWriter(bio, engine=engine) - df.to_excel(writer) - writer.save() - - bio.seek(0) - reread_df = read_excel(bio, index_col=0) - tm.assert_frame_equal(df, reread_df) - - def test_write_lists_dict(self, *_): - # see gh-8188. - df = DataFrame({"mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], - "numeric": [1, 2, 3.0], - "str": ["apple", "banana", "cherry"]}) - df.to_excel(self.path, "Sheet1") - read = read_excel(self.path, "Sheet1", header=0, index_col=0) - - expected = df.copy() - expected.mixed = expected.mixed.apply(str) - expected.numeric = expected.numeric.astype("int64") - - tm.assert_frame_equal(read, expected) - - def test_true_and_false_value_options(self, *_): - # see gh-13347 - df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) - expected = df.replace({"foo": True, "bar": False}) - - df.to_excel(self.path) - read_frame = read_excel(self.path, true_values=["foo"], - false_values=["bar"], index_col=0) - tm.assert_frame_equal(read_frame, expected) - - def test_freeze_panes(self, *_): - # see gh-15160 - expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) - expected.to_excel(self.path, "Sheet1", freeze_panes=(1, 1)) - - result = read_excel(self.path, index_col=0) - tm.assert_frame_equal(result, expected) - - def test_path_path_lib(self, merge_cells, engine, ext): - df = tm.makeDataFrame() - writer = partial(df.to_excel, engine=engine) - - reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) - tm.assert_frame_equal(result, df) - - def test_path_local_path(self, merge_cells, engine, ext): - df = tm.makeDataFrame() - writer = partial(df.to_excel, engine=engine) - - reader = partial(pd.read_excel, index_col=0) - result = tm.round_trip_pathlib(writer, reader, - path="foo.{ext}".format(ext=ext)) - tm.assert_frame_equal(result, df) - - -@td.skip_if_no('openpyxl') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xlsx', 'openpyxl')]) -class TestOpenpyxlTests(_WriterBase): - - def test_to_excel_styleconverter(self, merge_cells, ext, engine): - from openpyxl import styles - - hstyle = { - "font": { - "color": '00FF0000', - "bold": True, - }, - "borders": { - "top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin", - }, - "alignment": { - "horizontal": "center", - "vertical": "top", - }, - "fill": { - "patternType": 'solid', - 'fgColor': { - 'rgb': '006666FF', - 'tint': 0.3, - }, - }, - "number_format": { - "format_code": "0.00" - }, - "protection": { - "locked": True, - "hidden": False, - }, - } - - font_color = styles.Color('00FF0000') - font = styles.Font(bold=True, color=font_color) - side = styles.Side(style=styles.borders.BORDER_THIN) - border = styles.Border(top=side, right=side, bottom=side, left=side) - alignment = styles.Alignment(horizontal='center', vertical='top') - fill_color = styles.Color(rgb='006666FF', tint=0.3) - fill = styles.PatternFill(patternType='solid', fgColor=fill_color) - - number_format = '0.00' - - protection = styles.Protection(locked=True, hidden=False) - - kw = _OpenpyxlWriter._convert_to_style_kwargs(hstyle) - assert kw['font'] == font - assert kw['border'] == border - assert kw['alignment'] == alignment - assert kw['fill'] == fill - assert kw['number_format'] == number_format - assert kw['protection'] == protection - - def test_write_cells_merge_styled(self, merge_cells, ext, engine): - from pandas.io.formats.excel import ExcelCell - - sheet_name = 'merge_styled' - - sty_b1 = {'font': {'color': '00FF0000'}} - sty_a2 = {'font': {'color': '0000FF00'}} - - initial_cells = [ - ExcelCell(col=1, row=0, val=42, style=sty_b1), - ExcelCell(col=0, row=1, val=99, style=sty_a2), - ] - - sty_merged = {'font': {'color': '000000FF', 'bold': True}} - sty_kwargs = _OpenpyxlWriter._convert_to_style_kwargs(sty_merged) - openpyxl_sty_merged = sty_kwargs['font'] - merge_cells = [ - ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), - ] - - with ensure_clean(ext) as path: - writer = _OpenpyxlWriter(path) - writer.write_cells(initial_cells, sheet_name=sheet_name) - writer.write_cells(merge_cells, sheet_name=sheet_name) - - wks = writer.sheets[sheet_name] - xcell_b1 = wks['B1'] - xcell_a2 = wks['A2'] - assert xcell_b1.font == openpyxl_sty_merged - assert xcell_a2.font == openpyxl_sty_merged - - @pytest.mark.parametrize("mode,expected", [ - ('w', ['baz']), ('a', ['foo', 'bar', 'baz'])]) - def test_write_append_mode(self, merge_cells, ext, engine, mode, expected): - import openpyxl - df = DataFrame([1], columns=['baz']) - - with ensure_clean(ext) as f: - wb = openpyxl.Workbook() - wb.worksheets[0].title = 'foo' - wb.worksheets[0]['A1'].value = 'foo' - wb.create_sheet('bar') - wb.worksheets[1]['A1'].value = 'bar' - wb.save(f) - - writer = ExcelWriter(f, engine=engine, mode=mode) - df.to_excel(writer, sheet_name='baz', index=False) - writer.save() - - wb2 = openpyxl.load_workbook(f) - result = [sheet.title for sheet in wb2.worksheets] - assert result == expected - - for index, cell_value in enumerate(expected): - assert wb2.worksheets[index]['A1'].value == cell_value - - -@td.skip_if_no('xlwt') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xls', 'xlwt')]) -class TestXlwtTests(_WriterBase): - - def test_excel_raise_error_on_multiindex_columns_and_no_index( - self, merge_cells, ext, engine): - # MultiIndex as columns is not yet implemented 9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) - df = DataFrame(np.random.randn(10, 3), columns=cols) - with pytest.raises(NotImplementedError): - with ensure_clean(ext) as path: - df.to_excel(path, index=False) - - def test_excel_multiindex_columns_and_index_true(self, merge_cells, ext, - engine): - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) - df = pd.DataFrame(np.random.randn(10, 3), columns=cols) - with ensure_clean(ext) as path: - df.to_excel(path, index=True) - - def test_excel_multiindex_index(self, merge_cells, ext, engine): - # MultiIndex as index works so assert no error #9794 - cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) - df = DataFrame(np.random.randn(3, 10), index=cols) - with ensure_clean(ext) as path: - df.to_excel(path, index=False) - - def test_to_excel_styleconverter(self, merge_cells, ext, engine): - import xlwt - - hstyle = {"font": {"bold": True}, - "borders": {"top": "thin", - "right": "thin", - "bottom": "thin", - "left": "thin"}, - "alignment": {"horizontal": "center", "vertical": "top"}} - - xls_style = _XlwtWriter._convert_to_style(hstyle) - assert xls_style.font.bold - assert xlwt.Borders.THIN == xls_style.borders.top - assert xlwt.Borders.THIN == xls_style.borders.right - assert xlwt.Borders.THIN == xls_style.borders.bottom - assert xlwt.Borders.THIN == xls_style.borders.left - assert xlwt.Alignment.HORZ_CENTER == xls_style.alignment.horz - assert xlwt.Alignment.VERT_TOP == xls_style.alignment.vert - - def test_write_append_mode_raises(self, merge_cells, ext, engine): - msg = "Append mode is not supported with xlwt!" - - with ensure_clean(ext) as f: - with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine=engine, mode='a') - - -@td.skip_if_no('xlsxwriter') -@pytest.mark.parametrize("merge_cells,ext,engine", [ - (None, '.xlsx', 'xlsxwriter')]) -class TestXlsxWriterTests(_WriterBase): - - @td.skip_if_no('openpyxl') - def test_column_format(self, merge_cells, ext, engine): - # Test that column formats are applied to cells. Test for issue #9167. - # Applicable to xlsxwriter only. - with warnings.catch_warnings(): - # Ignore the openpyxl lxml warning. - warnings.simplefilter("ignore") - import openpyxl - - with ensure_clean(ext) as path: - frame = DataFrame({'A': [123456, 123456], - 'B': [123456, 123456]}) - - writer = ExcelWriter(path) - frame.to_excel(writer) - - # Add a number format to col B and ensure it is applied to cells. - num_format = '#,##0' - write_workbook = writer.book - write_worksheet = write_workbook.worksheets()[0] - col_format = write_workbook.add_format({'num_format': num_format}) - write_worksheet.set_column('B:B', None, col_format) - writer.save() - - read_workbook = openpyxl.load_workbook(path) - try: - read_worksheet = read_workbook['Sheet1'] - except TypeError: - # compat - read_worksheet = read_workbook.get_sheet_by_name(name='Sheet1') - - # Get the number format from the cell. - try: - cell = read_worksheet['B2'] - except TypeError: - # compat - cell = read_worksheet.cell('B2') - - try: - read_num_format = cell.number_format - except Exception: - read_num_format = cell.style.number_format._format_code - - assert read_num_format == num_format - - def test_write_append_mode_raises(self, merge_cells, ext, engine): - msg = "Append mode is not supported with xlsxwriter!" - - with ensure_clean(ext) as f: - with pytest.raises(ValueError, match=msg): - ExcelWriter(f, engine=engine, mode='a') - - -class TestExcelWriterEngineTests(object): - - @pytest.mark.parametrize('klass,ext', [ - pytest.param(_XlsxWriter, '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('xlsxwriter'), reason='No xlsxwriter')), - pytest.param(_OpenpyxlWriter, '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param(_XlwtWriter, '.xls', marks=pytest.mark.skipif( - not td.safe_import('xlwt'), reason='No xlwt')) - ]) - def test_ExcelWriter_dispatch(self, klass, ext): - with ensure_clean(ext) as path: - writer = ExcelWriter(path) - if ext == '.xlsx' and td.safe_import('xlsxwriter'): - # xlsxwriter has preference over openpyxl if both installed - assert isinstance(writer, _XlsxWriter) - else: - assert isinstance(writer, klass) - - def test_ExcelWriter_dispatch_raises(self): - with pytest.raises(ValueError, match='No engine'): - ExcelWriter('nothing') - - @pytest.mark.filterwarnings("ignore:\\nPanel:FutureWarning") - def test_register_writer(self): - # some awkward mocking to test out dispatch and such actually works - called_save = [] - called_write_cells = [] - - class DummyClass(ExcelWriter): - called_save = False - called_write_cells = False - supported_extensions = ['test', 'xlsx', 'xls'] - engine = 'dummy' - - def save(self): - called_save.append(True) - - def write_cells(self, *args, **kwargs): - called_write_cells.append(True) - - def check_called(func): - func() - assert len(called_save) >= 1 - assert len(called_write_cells) >= 1 - del called_save[:] - del called_write_cells[:] - - with pd.option_context('io.excel.xlsx.writer', 'dummy'): - register_writer(DummyClass) - writer = ExcelWriter('something.test') - assert isinstance(writer, DummyClass) - df = tm.makeCustomDataframe(1, 1) - - with catch_warnings(record=True): - panel = tm.makePanel() - func = lambda: df.to_excel('something.test') - check_called(func) - check_called(lambda: panel.to_excel('something.test')) - check_called(lambda: df.to_excel('something.xlsx')) - check_called( - lambda: df.to_excel( - 'something.xls', engine='dummy')) - - -@pytest.mark.parametrize('engine', [ - pytest.param('xlwt', - marks=pytest.mark.xfail(reason='xlwt does not support ' - 'openpyxl-compatible ' - 'style dicts')), - 'xlsxwriter', - 'openpyxl', -]) -def test_styler_to_excel(engine): - def style(df): - # XXX: RGB colors not supported in xlwt - return DataFrame([['font-weight: bold', '', ''], - ['', 'color: blue', ''], - ['', '', 'text-decoration: underline'], - ['border-style: solid', '', ''], - ['', 'font-style: italic', ''], - ['', '', 'text-align: right'], - ['background-color: red', '', ''], - ['number-format: 0%', '', ''], - ['', '', ''], - ['', '', ''], - ['', '', '']], - index=df.index, columns=df.columns) - - def assert_equal_style(cell1, cell2): - # XXX: should find a better way to check equality - assert cell1.alignment.__dict__ == cell2.alignment.__dict__ - assert cell1.border.__dict__ == cell2.border.__dict__ - assert cell1.fill.__dict__ == cell2.fill.__dict__ - assert cell1.font.__dict__ == cell2.font.__dict__ - assert cell1.number_format == cell2.number_format - assert cell1.protection.__dict__ == cell2.protection.__dict__ - - def custom_converter(css): - # use bold iff there is custom style attached to the cell - if css.strip(' \n;'): - return {'font': {'bold': True}} - return {} - - pytest.importorskip('jinja2') - pytest.importorskip(engine) - - # Prepare spreadsheets - - df = DataFrame(np.random.randn(11, 3)) - with ensure_clean('.xlsx' if engine != 'xlwt' else '.xls') as path: - writer = ExcelWriter(path, engine=engine) - df.to_excel(writer, sheet_name='frame') - df.style.to_excel(writer, sheet_name='unstyled') - styled = df.style.apply(style, axis=None) - styled.to_excel(writer, sheet_name='styled') - ExcelFormatter(styled, style_converter=custom_converter).write( - writer, sheet_name='custom') - writer.save() - - if engine not in ('openpyxl', 'xlsxwriter'): - # For other engines, we only smoke test - return - openpyxl = pytest.importorskip('openpyxl') - wb = openpyxl.load_workbook(path) - - # (1) compare DataFrame.to_excel and Styler.to_excel when unstyled - n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['unstyled'].columns): - assert len(col1) == len(col2) - for cell1, cell2 in zip(col1, col2): - assert cell1.value == cell2.value - assert_equal_style(cell1, cell2) - n_cells += 1 - - # ensure iteration actually happened: - assert n_cells == (11 + 1) * (3 + 1) - - # (2) check styling with default converter - - # XXX: openpyxl (as at 2.4) prefixes colors with 00, xlsxwriter with FF - alpha = '00' if engine == 'openpyxl' else 'FF' - - n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['styled'].columns): - assert len(col1) == len(col2) - for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) - # XXX: this isn't as strong a test as ideal; we should - # confirm that differences are exclusive - if ref == 'B2': - assert not cell1.font.bold - assert cell2.font.bold - elif ref == 'C3': - assert cell1.font.color.rgb != cell2.font.color.rgb - assert cell2.font.color.rgb == alpha + '0000FF' - elif ref == 'D4': - # This fails with engine=xlsxwriter due to - # https://bitbucket.org/openpyxl/openpyxl/issues/800 - if engine == 'xlsxwriter' \ - and (LooseVersion(openpyxl.__version__) < - LooseVersion('2.4.6')): - pass - else: - assert cell1.font.underline != cell2.font.underline - assert cell2.font.underline == 'single' - elif ref == 'B5': - assert not cell1.border.left.style - assert (cell2.border.top.style == - cell2.border.right.style == - cell2.border.bottom.style == - cell2.border.left.style == - 'medium') - elif ref == 'C6': - assert not cell1.font.italic - assert cell2.font.italic - elif ref == 'D7': - assert (cell1.alignment.horizontal != - cell2.alignment.horizontal) - assert cell2.alignment.horizontal == 'right' - elif ref == 'B8': - assert cell1.fill.fgColor.rgb != cell2.fill.fgColor.rgb - assert cell1.fill.patternType != cell2.fill.patternType - assert cell2.fill.fgColor.rgb == alpha + 'FF0000' - assert cell2.fill.patternType == 'solid' - elif ref == 'B9': - assert cell1.number_format == 'General' - assert cell2.number_format == '0%' - else: - assert_equal_style(cell1, cell2) - - assert cell1.value == cell2.value - n_cells += 1 - - assert n_cells == (11 + 1) * (3 + 1) - - # (3) check styling with custom converter - n_cells = 0 - for col1, col2 in zip(wb['frame'].columns, - wb['custom'].columns): - assert len(col1) == len(col2) - for cell1, cell2 in zip(col1, col2): - ref = '%s%d' % (cell2.column, cell2.row) - if ref in ('B2', 'C3', 'D4', 'B5', 'C6', 'D7', 'B8', 'B9'): - assert not cell1.font.bold - assert cell2.font.bold - else: - assert_equal_style(cell1, cell2) - - assert cell1.value == cell2.value - n_cells += 1 - - assert n_cells == (11 + 1) * (3 + 1) - - -@td.skip_if_no('openpyxl') -@pytest.mark.skipif(not PY36, reason='requires fspath') -class TestFSPath(object): - - def test_excelfile_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: - df = DataFrame({"A": [1, 2]}) - df.to_excel(path) - xl = ExcelFile(path) - result = os.fspath(xl) - assert result == path - - def test_excelwriter_fspath(self): - with tm.ensure_clean('foo.xlsx') as path: - writer = ExcelWriter(path) - assert os.fspath(writer) == str(path) From b21b2aa94d539f8efde191d51e869d3222dd5ca4 Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 13 Jan 2019 15:20:33 +0000 Subject: [PATCH 3/6] Split test_excel.py (#24472) --- pandas/tests/io/excel/{base.py => test_base.py} | 0 pandas/tests/io/excel/{xlrd.py => test_xlrd.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename pandas/tests/io/excel/{base.py => test_base.py} (100%) rename pandas/tests/io/excel/{xlrd.py => test_xlrd.py} (100%) diff --git a/pandas/tests/io/excel/base.py b/pandas/tests/io/excel/test_base.py similarity index 100% rename from pandas/tests/io/excel/base.py rename to pandas/tests/io/excel/test_base.py diff --git a/pandas/tests/io/excel/xlrd.py b/pandas/tests/io/excel/test_xlrd.py similarity index 100% rename from pandas/tests/io/excel/xlrd.py rename to pandas/tests/io/excel/test_xlrd.py From 11ee17e3b59331bdeca4da9df8c7376514438500 Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 13 Jan 2019 16:29:45 +0000 Subject: [PATCH 4/6] Split test_excel.py (#24472) --- pandas/tests/io/excel/test_xlrd.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 3e328923a457e..86283c78b02fb 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -97,11 +97,6 @@ def set_engine(self, request): yield setattr(self, func_name, old_func) - @pytest.mark.parametrize("usecols", [ - [0, 1, 3], [0, 3, 1], - [1, 0, 3], [1, 3, 0], - [3, 0, 1], [3, 1, 0], - ]) @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 def test_usecols_int(self, ext): From 1c2cd2c5f230f8070bc584eae6c6bcf1c5bae45e Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 13 Jan 2019 17:27:38 +0000 Subject: [PATCH 5/6] Split test_excel.py (#24472) --- pandas/tests/io/excel/test_xlrd.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py index 86283c78b02fb..bd0f9d7966de3 100644 --- a/pandas/tests/io/excel/test_xlrd.py +++ b/pandas/tests/io/excel/test_xlrd.py @@ -14,6 +14,7 @@ from pandas.util.testing import ensure_clean from pandas.io.excel import ExcelFile, read_excel +from pandas.io.parsers import read_csv _seriesd = tm.getSeriesData() _tsd = tm.getTimeSeriesData() @@ -49,6 +50,25 @@ def setup_method(self, datapath): self.tsframe = _tsframe.copy() self.mixed_frame = _mixed_frame.copy() + def get_csv_refdf(self, basename): + """ + Obtain the reference data from read_csv with the Python engine. + + Parameters + ---------- + + basename : str + File base name, excluding file extension. + + Returns + ------- + + dfref : DataFrame + """ + pref = os.path.join(self.dirpath, basename + '.csv') + dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') + return dfref + def get_excelfile(self, basename, ext): """ Return test data ExcelFile instance. From 05467eabd5d746e239c82b81ae46a0f6138efa46 Mon Sep 17 00:00:00 2001 From: Steven Beardwell <46458390+stevenbw@users.noreply.github.com> Date: Sun, 20 Jan 2019 11:34:16 +0000 Subject: [PATCH 6/6] TST: Split test_excel.py (#24472) --- pandas/tests/io/excel/test_reader.py | 1151 +++++++++++++++++ .../io/excel/{test_base.py => test_writer.py} | 920 +------------ pandas/tests/io/excel/test_xlrd.py | 364 ------ 3 files changed, 1194 insertions(+), 1241 deletions(-) create mode 100644 pandas/tests/io/excel/test_reader.py rename pandas/tests/io/excel/{test_base.py => test_writer.py} (61%) delete mode 100644 pandas/tests/io/excel/test_xlrd.py diff --git a/pandas/tests/io/excel/test_reader.py b/pandas/tests/io/excel/test_reader.py new file mode 100644 index 0000000000000..376aef6eb4833 --- /dev/null +++ b/pandas/tests/io/excel/test_reader.py @@ -0,0 +1,1151 @@ +from collections import OrderedDict +import contextlib +from datetime import datetime, time +from functools import partial +import os +import warnings + +import numpy as np +import pytest + +from pandas.compat import iteritems, range +import pandas.util._test_decorators as td + +import pandas as pd +from pandas import DataFrame, Index, MultiIndex, Series +import pandas.util.testing as tm +from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf + +from pandas.io.common import URLError +from pandas.io.excel import ExcelFile, ExcelWriter, read_excel +from pandas.io.parsers import read_csv + +_seriesd = tm.getSeriesData() +_tsd = tm.getTimeSeriesData() +_frame = DataFrame(_seriesd)[:10] +_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] +_tsframe = tm.makeTimeDataFrame()[:5] +_mixed_frame = _frame.copy() +_mixed_frame['foo'] = 'bar' + + +@contextlib.contextmanager +def ignore_xlrd_time_clock_warning(): + """ + Context manager to ignore warnings raised by the xlrd library, + regarding the deprecation of `time.clock` in Python 3.7. + """ + with warnings.catch_warnings(): + warnings.filterwarnings( + action='ignore', + message='time.clock has been deprecated', + category=DeprecationWarning) + yield + + +@td.skip_if_no('xlrd', '1.0.0') +class SharedItems(object): + + @pytest.fixture(autouse=True) + def setup_method(self, datapath): + self.dirpath = datapath("io", "data") + self.frame = _frame.copy() + self.frame2 = _frame2.copy() + self.tsframe = _tsframe.copy() + self.mixed_frame = _mixed_frame.copy() + + def get_csv_refdf(self, basename): + """ + Obtain the reference data from read_csv with the Python engine. + Parameters + ---------- + basename : str + File base name, excluding file extension. + Returns + ------- + dfref : DataFrame + """ + pref = os.path.join(self.dirpath, basename + '.csv') + dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') + return dfref + + def get_excelfile(self, basename, ext): + """ + Return test data ExcelFile instance. + Parameters + ---------- + basename : str + File base name, excluding file extension. + Returns + ------- + excel : io.excel.ExcelFile + """ + return ExcelFile(os.path.join(self.dirpath, basename + ext)) + + def get_exceldf(self, basename, ext, *args, **kwds): + """ + Return test data DataFrame. + Parameters + ---------- + basename : str + File base name, excluding file extension. + Returns + ------- + df : DataFrame + """ + pth = os.path.join(self.dirpath, basename + ext) + return read_excel(pth, *args, **kwds) + + +class ReadingTestsBase(SharedItems): + # This is based on ExcelWriterBase + + @pytest.fixture(autouse=True, params=['xlrd', None]) + def set_engine(self, request): + func_name = "get_exceldf" + old_func = getattr(self, func_name) + new_func = partial(old_func, engine=request.param) + setattr(self, func_name, new_func) + yield + setattr(self, func_name, old_func) + + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 + def test_usecols_int(self, ext): + + df_ref = self.get_csv_refdf("test1") + df_ref = df_ref.reindex(columns=["A", "B", "C"]) + + # usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df1 = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=3) + + # usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], + index_col=0, usecols=3) + + # parse_cols instead of usecols, usecols as int + with tm.assert_produces_warning(FutureWarning, + check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], + index_col=0, parse_cols=3) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df3, df_ref, check_names=False) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_usecols_list(self, ext): + + dfref = self.get_csv_refdf('test1') + dfref = dfref.reindex(columns=['B', 'C']) + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols=[0, 2, 3]) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols=[0, 2, 3]) + + with tm.assert_produces_warning(FutureWarning): + with ignore_xlrd_time_clock_warning(): + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, parse_cols=[0, 2, 3]) + + # TODO add index to xls file) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + tm.assert_frame_equal(df3, dfref, check_names=False) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_usecols_str(self, ext): + + dfref = self.get_csv_refdf('test1') + + df1 = dfref.reindex(columns=['A', 'B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A:D') + + with tm.assert_produces_warning(FutureWarning): + with ignore_xlrd_time_clock_warning(): + df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, parse_cols='A:D') + + # TODO add index to xls, read xls ignores index name ? + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + tm.assert_frame_equal(df4, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A,C,D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C,D') + # TODO add index to xls file + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + df1 = dfref.reindex(columns=['B', 'C']) + df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + usecols='A,C:D') + df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0, usecols='A,C:D') + tm.assert_frame_equal(df2, df1, check_names=False) + tm.assert_frame_equal(df3, df1, check_names=False) + + @pytest.mark.parametrize("usecols", [ + [0, 1, 3], [0, 3, 1], + [1, 0, 3], [1, 3, 0], + [3, 0, 1], [3, 1, 0], + ]) + def test_usecols_diff_positional_int_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["A", "C"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + @pytest.mark.parametrize("usecols", [ + ["B", "D"], ["D", "B"] + ]) + def test_usecols_diff_positional_str_columns_order(self, ext, usecols): + expected = self.get_csv_refdf("test1")[["B", "D"]] + expected.index = range(len(expected)) + + result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_read_excel_without_slicing(self, ext): + expected = self.get_csv_refdf("test1") + result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str(self, ext): + expected = self.get_csv_refdf("test1")[["C", "D"]] + result = self.get_exceldf("test1", ext, "Sheet1", + index_col=0, usecols="A,D:E") + tm.assert_frame_equal(result, expected, check_names=False) + + def test_usecols_excel_range_str_invalid(self, ext): + msg = "Invalid column name: E1" + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") + + def test_index_col_label_error(self, ext): + msg = "list indices must be integers.*, not str" + + with pytest.raises(TypeError, match=msg): + self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], + usecols=["A", "C"]) + + def test_index_col_empty(self, ext): + # see gh-9208 + result = self.get_exceldf("test1", ext, "Sheet3", + index_col=["A", "B", "C"]) + expected = DataFrame(columns=["D", "E", "F"], + index=MultiIndex(levels=[[]] * 3, + codes=[[]] * 3, + names=["A", "B", "C"])) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_col", [None, 2]) + def test_index_col_with_unnamed(self, ext, index_col): + # see gh-18792 + result = self.get_exceldf("test1", ext, "Sheet4", + index_col=index_col) + expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], + columns=["Unnamed: 0", "col1", "col2"]) + if index_col: + expected = expected.set_index(expected.columns[index_col]) + + tm.assert_frame_equal(result, expected) + + def test_usecols_pass_non_existent_column(self, ext): + msg = ("Usecols do not match columns, " + "columns expected but not found: " + r"\['E'\]") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E"]) + + def test_usecols_wrong_type(self, ext): + msg = ("'usecols' must either be list-like of " + "all strings, all unicode, all integers or a callable.") + + with pytest.raises(ValueError, match=msg): + self.get_exceldf("test1", ext, usecols=["E1", 0]) + + def test_excel_stop_iterator(self, ext): + + parsed = self.get_exceldf('test2', ext, 'Sheet1') + expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_cell_error_na(self, ext): + + parsed = self.get_exceldf('test3', ext, 'Sheet1') + expected = DataFrame([[np.nan]], columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + def test_excel_passes_na(self, ext): + + excel = self.get_excelfile('test4', ext) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + # 13967 + excel = self.get_excelfile('test5', ext) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=False, + na_values=['apple']) + expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + parsed = read_excel(excel, 'Sheet1', keep_default_na=True, + na_values=['apple']) + expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], + columns=['Test']) + tm.assert_frame_equal(parsed, expected) + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_deprecated_sheetname(self, ext): + # gh-17964 + excel = self.get_excelfile('test1', ext) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + read_excel(excel, sheetname='Sheet1') + + with pytest.raises(TypeError): + read_excel(excel, sheet='Sheet1') + + @td.skip_if_no('xlrd', '1.0.1') # GH-22682 + def test_excel_table_sheet_by_index(self, ext): + + excel = self.get_excelfile('test1', ext) + dfref = self.get_csv_refdf('test1') + + df1 = read_excel(excel, 0, index_col=0) + df2 = read_excel(excel, 1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df1 = excel.parse(0, index_col=0) + df2 = excel.parse(1, skiprows=[1], index_col=0) + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df3 = read_excel(excel, 0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df4 = read_excel(excel, 0, index_col=0, skip_footer=1) + tm.assert_frame_equal(df3, df4) + + df3 = excel.parse(0, index_col=0, skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + import xlrd + with pytest.raises(xlrd.XLRDError): + read_excel(excel, 'asdf') + + def test_excel_table(self, ext): + + dfref = self.get_csv_refdf('test1') + + df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) + df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], + index_col=0) + # TODO add index to file + tm.assert_frame_equal(df1, dfref, check_names=False) + tm.assert_frame_equal(df2, dfref, check_names=False) + + df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, + skipfooter=1) + tm.assert_frame_equal(df3, df1.iloc[:-1]) + + def test_reader_special_dtypes(self, ext): + + expected = DataFrame.from_dict(OrderedDict([ + ("IntCol", [1, 2, -3, 4, 0]), + ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), + ("BoolCol", [True, False, True, True, False]), + ("StrCol", [1, 2, 3, 4, 5]), + # GH5394 - this is why convert_float isn't vectorized + ("Str2Col", ["a", 3, "c", "d", "e"]), + ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), + datetime(1905, 1, 1), datetime(2013, 12, 14), + datetime(2015, 3, 14)]) + ])) + basename = 'test_types' + + # should read in correctly and infer types + actual = self.get_exceldf(basename, ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + # if not coercing number, then int comes in as float + float_expected = expected.copy() + float_expected["IntCol"] = float_expected["IntCol"].astype(float) + float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) + tm.assert_frame_equal(actual, float_expected) + + # check setting Index (assuming xls and xlsx are the same here) + for icol, name in enumerate(expected.columns): + actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) + exp = expected.set_index(name) + tm.assert_frame_equal(actual, exp) + + # convert_float and converters should be different but both accepted + expected["StrCol"] = expected["StrCol"].apply(str) + actual = self.get_exceldf( + basename, ext, 'Sheet1', converters={"StrCol": str}) + tm.assert_frame_equal(actual, expected) + + no_convert_float = float_expected.copy() + no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) + actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, + converters={"StrCol": str}) + tm.assert_frame_equal(actual, no_convert_float) + + # GH8212 - support for converters and missing values + def test_reader_converters(self, ext): + + basename = 'test_converters' + + expected = DataFrame.from_dict(OrderedDict([ + ("IntCol", [1, 2, -3, -1000, 0]), + ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), + ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), + ("StrCol", ['1', np.nan, '3', '4', '5']), + ])) + + converters = {'IntCol': lambda x: int(x) if x != '' else -1000, + 'FloatCol': lambda x: 10 * x if x else np.nan, + 2: lambda x: 'Found' if x != '' else 'Not found', + 3: lambda x: str(x) if x else '', + } + + # should read in correctly and set types of single cells (not array + # dtypes) + actual = self.get_exceldf(basename, ext, 'Sheet1', + converters=converters) + tm.assert_frame_equal(actual, expected) + + def test_reader_dtype(self, ext): + # GH 8212 + basename = 'testdtype' + actual = self.get_exceldf(basename, ext) + + expected = DataFrame({ + 'a': [1, 2, 3, 4], + 'b': [2.5, 3.5, 4.5, 5.5], + 'c': [1, 2, 3, 4], + 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( + columns=['a', 'b', 'c', 'd']) + + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf(basename, ext, + dtype={'a': 'float64', + 'b': 'float32', + 'c': str}) + + expected['a'] = expected['a'].astype('float64') + expected['b'] = expected['b'].astype('float32') + expected['c'] = ['001', '002', '003', '004'] + tm.assert_frame_equal(actual, expected) + + with pytest.raises(ValueError): + self.get_exceldf(basename, ext, dtype={'d': 'int64'}) + + @pytest.mark.parametrize("dtype,expected", [ + (None, + DataFrame({ + "a": [1, 2, 3, 4], + "b": [2.5, 3.5, 4.5, 5.5], + "c": [1, 2, 3, 4], + "d": [1.0, 2.0, np.nan, 4.0] + })), + ({"a": "float64", + "b": "float32", + "c": str, + "d": str + }, + DataFrame({ + "a": Series([1, 2, 3, 4], dtype="float64"), + "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), + "c": ["001", "002", "003", "004"], + "d": ["1", "2", np.nan, "4"] + })), + ]) + def test_reader_dtype_str(self, ext, dtype, expected): + # see gh-20377 + basename = "testdtype" + + actual = self.get_exceldf(basename, ext, dtype=dtype) + tm.assert_frame_equal(actual, expected) + + def test_reading_all_sheets(self, ext): + # Test reading all sheetnames by setting sheetname to None, + # Ensure a dict is returned. + # See PR #9450 + basename = 'test_multisheet' + dfs = self.get_exceldf(basename, ext, sheet_name=None) + # ensure this is not alphabetical to test order preservation + expected_keys = ['Charlie', 'Alpha', 'Beta'] + tm.assert_contains_all(expected_keys, dfs.keys()) + # Issue 9930 + # Ensure sheet order is preserved + assert expected_keys == list(dfs.keys()) + + def test_reading_multiple_specific_sheets(self, ext): + # Test reading specific sheetnames by specifying a mixed list + # of integers and strings, and confirm that duplicated sheet + # references (positions/names) are removed properly. + # Ensure a dict is returned + # See PR #9450 + basename = 'test_multisheet' + # Explicitly request duplicates. Only the set should be returned. + expected_keys = [2, 'Charlie', 'Charlie'] + dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) + expected_keys = list(set(expected_keys)) + tm.assert_contains_all(expected_keys, dfs.keys()) + assert len(expected_keys) == len(dfs.keys()) + + def test_reading_all_sheets_with_blank(self, ext): + # Test reading all sheetnames by setting sheetname to None, + # In the case where some sheets are blank. + # Issue #11711 + basename = 'blank_with_header' + dfs = self.get_exceldf(basename, ext, sheet_name=None) + expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] + tm.assert_contains_all(expected_keys, dfs.keys()) + + # GH6403 + def test_read_excel_blank(self, ext): + actual = self.get_exceldf('blank', ext, 'Sheet1') + tm.assert_frame_equal(actual, DataFrame()) + + def test_read_excel_blank_with_header(self, ext): + expected = DataFrame(columns=['col_1', 'col_2']) + actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([np.nan] * 4)), + (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) + ]) + def test_read_one_empty_col_no_header(self, ext, header, expected): + # xref gh-12292 + filename = "no_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, filename, index=False, header=False) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + @pytest.mark.parametrize("header,expected", [ + (None, DataFrame([0] + [np.nan] * 4)), + (0, DataFrame([np.nan] * 4)) + ]) + def test_read_one_empty_col_with_header(self, ext, header, expected): + filename = "with_header" + df = pd.DataFrame( + [["", 1, 100], + ["", 2, 200], + ["", 3, 300], + ["", 4, 400]] + ) + + with ensure_clean(ext) as path: + df.to_excel(path, 'with_header', index=False, header=True) + result = read_excel(path, filename, usecols=[0], header=header) + + tm.assert_frame_equal(result, expected) + + @td.skip_if_no('openpyxl') + @td.skip_if_no('xlwt') + def test_set_column_names_in_parameter(self, ext): + # GH 12870 : pass down column names associated with + # keyword argument names + refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], + [3, 'baz']], columns=['a', 'b']) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as writer: + refdf.to_excel(writer, 'Data_no_head', + header=False, index=False) + refdf.to_excel(writer, 'Data_with_head', index=False) + + refdf.columns = ['A', 'B'] + + with ExcelFile(pth) as reader: + xlsdf_no_head = read_excel(reader, 'Data_no_head', + header=None, names=['A', 'B']) + xlsdf_with_head = read_excel(reader, 'Data_with_head', + index_col=None, names=['A', 'B']) + + tm.assert_frame_equal(xlsdf_no_head, refdf) + tm.assert_frame_equal(xlsdf_with_head, refdf) + + def test_date_conversion_overflow(self, ext): + # GH 10001 : pandas.ExcelFile ignore parse_dates=False + expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], + [pd.Timestamp('2016-03-16'), 'Jack Black'], + [1e+20, 'Timothy Brown']], + columns=['DateColWithBigInt', 'StringCol']) + + result = self.get_exceldf('testdateoverflow', ext) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 + def test_sheet_name_and_sheetname(self, ext): + # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" + # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) + # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name + # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different + # behavior for "sheetname" argument + filename = "test1" + sheet_name = "Sheet1" + + df_ref = self.get_csv_refdf(filename) + df1 = self.get_exceldf(filename, ext, + sheet_name=sheet_name, index_col=0) # doc + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + with ignore_xlrd_time_clock_warning(): + df2 = self.get_exceldf(filename, ext, index_col=0, + sheetname=sheet_name) # backward compat + + excel = self.get_excelfile(filename, ext) + df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc + with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): + df2_parse = excel.parse(index_col=0, + sheetname=sheet_name) # backward compat + + tm.assert_frame_equal(df1, df_ref, check_names=False) + tm.assert_frame_equal(df2, df_ref, check_names=False) + tm.assert_frame_equal(df1_parse, df_ref, check_names=False) + tm.assert_frame_equal(df2_parse, df_ref, check_names=False) + + def test_sheet_name_both_raises(self, ext): + with pytest.raises(TypeError, match="Cannot specify both"): + self.get_exceldf('test1', ext, sheetname='Sheet1', + sheet_name='Sheet1') + + excel = self.get_excelfile('test1', ext) + with pytest.raises(TypeError, match="Cannot specify both"): + excel.parse(sheetname='Sheet1', + sheet_name='Sheet1') + + def test_excel_read_buffer(self, ext): + + pth = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(pth, 'Sheet1', index_col=0) + with open(pth, 'rb') as f: + actual = read_excel(f, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) + + with open(pth, 'rb') as f: + xls = ExcelFile(f) + actual = read_excel(xls, 'Sheet1', index_col=0) + tm.assert_frame_equal(expected, actual) + + def test_bad_engine_raises(self, ext): + bad_engine = 'foo' + with pytest.raises(ValueError, match="Unknown engine: foo"): + read_excel('', engine=bad_engine) + + @tm.network + def test_read_from_http_url(self, ext): + url = ('https://raw.github.com/pandas-dev/pandas/master/' + 'pandas/tests/io/data/test1' + ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1', ext) + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_not_us_locale + def test_read_from_s3_url(self, ext, s3_resource): + # Bucket "pandas-test" created in tests/io/conftest.py + file_name = os.path.join(self.dirpath, 'test1' + ext) + + with open(file_name, "rb") as f: + s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, + Body=f) + + url = ('s3://pandas-test/test1' + ext) + url_table = read_excel(url) + local_table = self.get_exceldf('test1', ext) + tm.assert_frame_equal(url_table, local_table) + + @pytest.mark.slow + # ignore warning from old xlrd + @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") + def test_read_from_file_url(self, ext): + + # FILE + localtable = os.path.join(self.dirpath, 'test1' + ext) + local_table = read_excel(localtable) + + try: + url_table = read_excel('file://localhost/' + localtable) + except URLError: + # fails on some systems + import platform + pytest.skip("failing on %s" % + ' '.join(platform.uname()).strip()) + + tm.assert_frame_equal(url_table, local_table) + + @td.skip_if_no('pathlib') + def test_read_from_pathlib_path(self, ext): + + # GH12655 + from pathlib import Path + + str_path = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(str_path, 'Sheet1', index_col=0) + + path_obj = Path(self.dirpath, 'test1' + ext) + actual = read_excel(path_obj, 'Sheet1', index_col=0) + + tm.assert_frame_equal(expected, actual) + + @td.skip_if_no('py.path') + def test_read_from_py_localpath(self, ext): + + # GH12655 + from py.path import local as LocalPath + + str_path = os.path.join(self.dirpath, 'test1' + ext) + expected = read_excel(str_path, 'Sheet1', index_col=0) + + abs_dir = os.path.abspath(self.dirpath) + path_obj = LocalPath(abs_dir).join('test1' + ext) + actual = read_excel(path_obj, 'Sheet1', index_col=0) + + tm.assert_frame_equal(expected, actual) + + def test_reader_closes_file(self, ext): + + pth = os.path.join(self.dirpath, 'test1' + ext) + f = open(pth, 'rb') + with ExcelFile(f) as xlsx: + # parses okay + read_excel(xlsx, 'Sheet1', index_col=0) + + assert f.closed + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_creating_and_reading_multiple_sheets(self, ext): + # see gh-9450 + # + # Test reading multiple sheets, from a runtime + # created Excel file with multiple sheets. + def tdf(col_sheet_name): + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[col_sheet_name]) + + sheets = ["AAA", "BBB", "CCC"] + + dfs = [tdf(s) for s in sheets] + dfs = dict(zip(sheets, dfs)) + + with ensure_clean(ext) as pth: + with ExcelWriter(pth) as ew: + for sheetname, df in iteritems(dfs): + df.to_excel(ew, sheetname) + + dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) + + for s in sheets: + tm.assert_frame_equal(dfs[s], dfs_returned[s]) + + def test_reader_seconds(self, ext): + + # Test reading times with and without milliseconds. GH5945. + expected = DataFrame.from_dict({"Time": [time(1, 2, 3), + time(2, 45, 56, 100000), + time(4, 29, 49, 200000), + time(6, 13, 42, 300000), + time(7, 57, 35, 400000), + time(9, 41, 28, 500000), + time(11, 25, 21, 600000), + time(13, 9, 14, 700000), + time(14, 53, 7, 800000), + time(16, 37, 0, 900000), + time(18, 20, 54)]}) + + actual = self.get_exceldf('times_1900', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + actual = self.get_exceldf('times_1904', ext, 'Sheet1') + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex(self, ext): + # see gh-4679 + mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + + # "mi_column" sheet + expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], + [2, 3.5, pd.Timestamp("2015-01-02"), False], + [3, 4.5, pd.Timestamp("2015-01-03"), False], + [4, 5.5, pd.Timestamp("2015-01-04"), True]], + columns=mi) + + actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # "mi_index" sheet + expected.index = mi + expected.columns = ["a", "b", "c", "d"] + + actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "both" sheet + expected.columns = mi + + actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + # "mi_index_name" sheet + expected.columns = ["a", "b", "c", "d"] + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "mi_column_name" sheet + expected.index = list(range(4)) + expected.columns = mi.set_names(["c1", "c2"]) + actual = read_excel(mi_file, "mi_column_name", + header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + # see gh-11317 + # "name_with_int" sheet + expected.columns = mi.set_levels( + [1, 2], level=1).set_names(["c1", "c2"]) + + actual = read_excel(mi_file, "name_with_int", + index_col=0, header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_name" sheet + expected.columns = mi.set_names(["c1", "c2"]) + expected.index = mi.set_names(["ilvl1", "ilvl2"]) + + actual = read_excel(mi_file, "both_name", + index_col=[0, 1], header=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # "both_skiprows" sheet + actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], + header=[0, 1], skiprows=2) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_multiindex_header_only(self, ext): + # see gh-11733. + # + # Don't try to parse a header name if there isn't one. + mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) + result = read_excel(mi_file, "index_col_none", header=[0, 1]) + + exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) + expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) + tm.assert_frame_equal(result, expected) + + @td.skip_if_no("xlsxwriter") + def test_read_excel_multiindex_empty_level(self, ext): + # see gh-12453 + with ensure_clean(ext) as path: + df = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", ""): {0: 0} + }) + + expected = DataFrame({ + ("One", "x"): {0: 1}, + ("Two", "X"): {0: 3}, + ("Two", "Y"): {0: 7}, + ("Zero", "Unnamed: 4_level_1"): {0: 0} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + df = pd.DataFrame({ + ("Beg", ""): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + expected = pd.DataFrame({ + ("Beg", "Unnamed: 1_level_1"): {0: 0}, + ("Middle", "x"): {0: 1}, + ("Tail", "X"): {0: 3}, + ("Tail", "Y"): {0: 7} + }) + + df.to_excel(path) + actual = pd.read_excel(path, header=[0, 1], index_col=0) + tm.assert_frame_equal(actual, expected) + + @td.skip_if_no("xlsxwriter") + @pytest.mark.parametrize("c_idx_names", [True, False]) + @pytest.mark.parametrize("r_idx_names", [True, False]) + @pytest.mark.parametrize("c_idx_levels", [1, 3]) + @pytest.mark.parametrize("r_idx_levels", [1, 3]) + def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels): + # see gh-4679 + with ensure_clean(ext) as pth: + if c_idx_levels == 1 and c_idx_names: + pytest.skip("Column index name cannot be " + "serialized unless it's a MultiIndex") + + # Empty name case current read in as + # unnamed levels, not Nones. + check_names = r_idx_names or r_idx_levels <= 1 + + df = mkdf(5, 5, c_idx_names, r_idx_names, + c_idx_levels, r_idx_levels) + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[0, :] = np.nan + df.to_excel(pth) + + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + df.iloc[-1, :] = np.nan + df.to_excel(pth) + act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal(df, act, check_names=check_names) + + def test_excel_old_index_format(self, ext): + # see gh-4679 + filename = "test_index_name_pre17" + ext + in_file = os.path.join(self.dirpath, filename) + + # We detect headers to determine if index names exist, so + # that "index" name in the "names" version of the data will + # now be interpreted as rows that include null data. + data = np.array([[None, None, None, None, None], + ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", + "R_l0_g2", "R_l0_g3", "R_l0_g4"], + ["R1", "R_l1_g0", "R_l1_g1", + "R_l1_g2", "R_l1_g3", "R_l1_g4"]], + codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], + names=[None, None]) + si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, "single_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected) + + # The analogous versions of the "names" version data + # where there are explicitly no names for the indices. + data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], + ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], + ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], + ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], + ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) + columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] + mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], + ["R_l1_g0", "R_l1_g1", "R_l1_g2", + "R_l1_g3", "R_l1_g4"]], + codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], + names=[None, None]) + si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", + "R_l0_g3", "R_l0_g4"], name=None) + + expected = pd.DataFrame(data, index=si, columns=columns) + + actual = pd.read_excel(in_file, "single_no_names", index_col=0) + tm.assert_frame_equal(actual, expected) + + expected.index = mi + + actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) + tm.assert_frame_equal(actual, expected, check_names=False) + + def test_read_excel_bool_header_arg(self, ext): + # GH 6114 + for arg in [True, False]: + with pytest.raises(TypeError): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + header=arg) + + def test_read_excel_chunksize(self, ext): + # GH 8011 + with pytest.raises(NotImplementedError): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + chunksize=100) + + @td.skip_if_no("xlwt") + @td.skip_if_no("openpyxl") + def test_read_excel_parse_dates(self, ext): + # see gh-11544, gh-12051 + df = DataFrame( + {"col": [1, 2, 3], + "date_strings": pd.date_range("2012-01-01", periods=3)}) + df2 = df.copy() + df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") + + with ensure_clean(ext) as pth: + df2.to_excel(pth) + + res = read_excel(pth, index_col=0) + tm.assert_frame_equal(df2, res) + + res = read_excel(pth, parse_dates=["date_strings"], index_col=0) + tm.assert_frame_equal(df, res) + + date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") + res = read_excel(pth, parse_dates=["date_strings"], + date_parser=date_parser, index_col=0) + tm.assert_frame_equal(df, res) + + def test_read_excel_skiprows_list(self, ext): + # GH 4903 + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + ext), + 'skiprows_list', skiprows=[0, 2]) + expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], + [2, 3.5, pd.Timestamp('2015-01-02'), False], + [3, 4.5, pd.Timestamp('2015-01-03'), False], + [4, 5.5, pd.Timestamp('2015-01-04'), True]], + columns=['a', 'b', 'c', 'd']) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + ext), + 'skiprows_list', skiprows=np.array([0, 2])) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows(self, ext): + # GH 16645 + num_rows_to_pull = 5 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows=num_rows_to_pull) + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + ext)) + expected = expected[:num_rows_to_pull] + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): + # GH 16645 + expected = pd.read_excel(os.path.join(self.dirpath, + 'test1' + ext)) + num_records_in_file = len(expected) + num_rows_to_pull = num_records_in_file + 10 + actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows=num_rows_to_pull) + tm.assert_frame_equal(actual, expected) + + def test_read_excel_nrows_non_integer_parameter(self, ext): + # GH 16645 + msg = "'nrows' must be an integer >=0" + with pytest.raises(ValueError, match=msg): + pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), + nrows='5') + + def test_read_excel_squeeze(self, ext): + # GH 12157 + f = os.path.join(self.dirpath, 'test_squeeze' + ext) + + actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) + expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') + expected.index.name = 'a' + tm.assert_series_equal(actual, expected) + + actual = pd.read_excel(f, 'two_columns', squeeze=True) + expected = pd.DataFrame({'a': [4, 5, 6], + 'b': [2, 3, 4]}) + tm.assert_frame_equal(actual, expected) + + actual = pd.read_excel(f, 'one_column', squeeze=True) + expected = pd.Series([1, 2, 3], name='a') + tm.assert_series_equal(actual, expected) + + +@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) +class TestXlrdReader(ReadingTestsBase): + """ + This is the base class for the xlrd tests, and 3 different file formats + are supported: xls, xlsx, xlsm + """ + + @td.skip_if_no("xlwt") + def test_read_xlrd_book(self, ext): + import xlrd + df = self.frame + + engine = "xlrd" + sheet_name = "SheetA" + + with ensure_clean(ext) as pth: + df.to_excel(pth, sheet_name) + book = xlrd.open_workbook(pth) + + with ExcelFile(book, engine=engine) as xl: + result = read_excel(xl, sheet_name, index_col=0) + tm.assert_frame_equal(df, result) + + result = read_excel(book, sheet_name=sheet_name, + engine=engine, index_col=0) + tm.assert_frame_equal(df, result) diff --git a/pandas/tests/io/excel/test_base.py b/pandas/tests/io/excel/test_writer.py similarity index 61% rename from pandas/tests/io/excel/test_base.py rename to pandas/tests/io/excel/test_writer.py index 9144cb5279826..89d624344bce5 100644 --- a/pandas/tests/io/excel/test_base.py +++ b/pandas/tests/io/excel/test_writer.py @@ -1,5 +1,4 @@ -from collections import OrderedDict -from datetime import date, datetime, time, timedelta +from datetime import date, datetime, timedelta from distutils.version import LooseVersion from functools import partial import os @@ -10,16 +9,15 @@ from numpy import nan import pytest -from pandas.compat import PY36, BytesIO, iteritems, map, range, u +from pandas.compat import PY36, BytesIO, map, range, u import pandas.util._test_decorators as td import pandas as pd -from pandas import DataFrame, Index, MultiIndex, Series +from pandas import DataFrame, Index, MultiIndex from pandas.core.config import get_option, set_option import pandas.util.testing as tm from pandas.util.testing import ensure_clean, makeCustomDataframe as mkdf -from pandas.io.common import URLError from pandas.io.excel import ( ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, read_excel, register_writer) @@ -35,6 +33,7 @@ _mixed_frame['foo'] = 'bar' +@td.skip_if_no('xlrd', '1.0.0') class SharedItems(object): @pytest.fixture(autouse=True) @@ -48,887 +47,44 @@ def setup_method(self, datapath): def get_csv_refdf(self, basename): """ Obtain the reference data from read_csv with the Python engine. - Parameters ---------- - basename : str File base name, excluding file extension. - Returns ------- - dfref : DataFrame """ pref = os.path.join(self.dirpath, basename + '.csv') dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') return dfref + def get_excelfile(self, basename, ext): + """ + Return test data ExcelFile instance. + Parameters + ---------- + basename : str + File base name, excluding file extension. + Returns + ------- + excel : io.excel.ExcelFile + """ + return ExcelFile(os.path.join(self.dirpath, basename + ext)) -class ReadingTestsBase(SharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=[None]) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - - @pytest.mark.parametrize("usecols", [ - [0, 1, 3], [0, 3, 1], - [1, 0, 3], [1, 3, 0], - [3, 0, 1], [3, 1, 0], - ]) - def test_usecols_diff_positional_int_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["A", "C"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) - - @pytest.mark.parametrize("usecols", [ - ["B", "D"], ["D", "B"] - ]) - def test_usecols_diff_positional_str_columns_order(self, ext, usecols): - expected = self.get_csv_refdf("test1")[["B", "D"]] - expected.index = range(len(expected)) - - result = self.get_exceldf("test1", ext, "Sheet1", usecols=usecols) - tm.assert_frame_equal(result, expected, check_names=False) - - def test_read_excel_without_slicing(self, ext): - expected = self.get_csv_refdf("test1") - result = self.get_exceldf("test1", ext, "Sheet1", index_col=0) - tm.assert_frame_equal(result, expected, check_names=False) - - def test_usecols_excel_range_str(self, ext): - expected = self.get_csv_refdf("test1")[["C", "D"]] - result = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols="A,D:E") - tm.assert_frame_equal(result, expected, check_names=False) - - def test_usecols_excel_range_str_invalid(self, ext): - msg = "Invalid column name: E1" - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, "Sheet1", usecols="D:E1") - - def test_index_col_label_error(self, ext): - msg = "list indices must be integers.*, not str" - - with pytest.raises(TypeError, match=msg): - self.get_exceldf("test1", ext, "Sheet1", index_col=["A"], - usecols=["A", "C"]) - - def test_index_col_empty(self, ext): - # see gh-9208 - result = self.get_exceldf("test1", ext, "Sheet3", - index_col=["A", "B", "C"]) - expected = DataFrame(columns=["D", "E", "F"], - index=MultiIndex(levels=[[]] * 3, - codes=[[]] * 3, - names=["A", "B", "C"])) - tm.assert_frame_equal(result, expected) - - @pytest.mark.parametrize("index_col", [None, 2]) - def test_index_col_with_unnamed(self, ext, index_col): - # see gh-18792 - result = self.get_exceldf("test1", ext, "Sheet4", - index_col=index_col) - expected = DataFrame([["i1", "a", "x"], ["i2", "b", "y"]], - columns=["Unnamed: 0", "col1", "col2"]) - if index_col: - expected = expected.set_index(expected.columns[index_col]) - - tm.assert_frame_equal(result, expected) - - def test_usecols_pass_non_existent_column(self, ext): - msg = ("Usecols do not match columns, " - "columns expected but not found: " + r"\['E'\]") - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E"]) - - def test_usecols_wrong_type(self, ext): - msg = ("'usecols' must either be list-like of " - "all strings, all unicode, all integers or a callable.") - - with pytest.raises(ValueError, match=msg): - self.get_exceldf("test1", ext, usecols=["E1", 0]) - - def test_excel_stop_iterator(self, ext): - - parsed = self.get_exceldf('test2', ext, 'Sheet1') - expected = DataFrame([['aaaa', 'bbbbb']], columns=['Test', 'Test1']) - tm.assert_frame_equal(parsed, expected) - - def test_excel_cell_error_na(self, ext): - - parsed = self.get_exceldf('test3', ext, 'Sheet1') - expected = DataFrame([[np.nan]], columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - def test_excel_passes_na(self, ext): - - excel = self.get_excelfile('test4', ext) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - # 13967 - excel = self.get_excelfile('test5', ext) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) - expected = DataFrame([['1.#QNAN'], [1], ['nan'], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) - expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], - columns=['Test']) - tm.assert_frame_equal(parsed, expected) - - def test_excel_table(self, ext): - - dfref = self.get_csv_refdf('test1') - - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0) - # TODO add index to file - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df3 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - def test_reader_special_dtypes(self, ext): - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, 4, 0]), - ("FloatCol", [1.25, 2.25, 1.83, 1.92, 0.0000000005]), - ("BoolCol", [True, False, True, True, False]), - ("StrCol", [1, 2, 3, 4, 5]), - # GH5394 - this is why convert_float isn't vectorized - ("Str2Col", ["a", 3, "c", "d", "e"]), - ("DateCol", [datetime(2013, 10, 30), datetime(2013, 10, 31), - datetime(1905, 1, 1), datetime(2013, 12, 14), - datetime(2015, 3, 14)]) - ])) - basename = 'test_types' - - # should read in correctly and infer types - actual = self.get_exceldf(basename, ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - # if not coercing number, then int comes in as float - float_expected = expected.copy() - float_expected["IntCol"] = float_expected["IntCol"].astype(float) - float_expected.loc[float_expected.index[1], "Str2Col"] = 3.0 - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False) - tm.assert_frame_equal(actual, float_expected) - - # check setting Index (assuming xls and xlsx are the same here) - for icol, name in enumerate(expected.columns): - actual = self.get_exceldf(basename, ext, 'Sheet1', index_col=icol) - exp = expected.set_index(name) - tm.assert_frame_equal(actual, exp) - - # convert_float and converters should be different but both accepted - expected["StrCol"] = expected["StrCol"].apply(str) - actual = self.get_exceldf( - basename, ext, 'Sheet1', converters={"StrCol": str}) - tm.assert_frame_equal(actual, expected) - - no_convert_float = float_expected.copy() - no_convert_float["StrCol"] = no_convert_float["StrCol"].apply(str) - actual = self.get_exceldf(basename, ext, 'Sheet1', convert_float=False, - converters={"StrCol": str}) - tm.assert_frame_equal(actual, no_convert_float) - - # GH8212 - support for converters and missing values - def test_reader_converters(self, ext): - - basename = 'test_converters' - - expected = DataFrame.from_dict(OrderedDict([ - ("IntCol", [1, 2, -3, -1000, 0]), - ("FloatCol", [12.5, np.nan, 18.3, 19.2, 0.000000005]), - ("BoolCol", ['Found', 'Found', 'Found', 'Not found', 'Found']), - ("StrCol", ['1', np.nan, '3', '4', '5']), - ])) - - converters = {'IntCol': lambda x: int(x) if x != '' else -1000, - 'FloatCol': lambda x: 10 * x if x else np.nan, - 2: lambda x: 'Found' if x != '' else 'Not found', - 3: lambda x: str(x) if x else '', - } - - # should read in correctly and set types of single cells (not array - # dtypes) - actual = self.get_exceldf(basename, ext, 'Sheet1', - converters=converters) - tm.assert_frame_equal(actual, expected) - - def test_reader_dtype(self, ext): - # GH 8212 - basename = 'testdtype' - actual = self.get_exceldf(basename, ext) - - expected = DataFrame({ - 'a': [1, 2, 3, 4], - 'b': [2.5, 3.5, 4.5, 5.5], - 'c': [1, 2, 3, 4], - 'd': [1.0, 2.0, np.nan, 4.0]}).reindex( - columns=['a', 'b', 'c', 'd']) - - tm.assert_frame_equal(actual, expected) - - actual = self.get_exceldf(basename, ext, - dtype={'a': 'float64', - 'b': 'float32', - 'c': str}) - - expected['a'] = expected['a'].astype('float64') - expected['b'] = expected['b'].astype('float32') - expected['c'] = ['001', '002', '003', '004'] - tm.assert_frame_equal(actual, expected) - - with pytest.raises(ValueError): - self.get_exceldf(basename, ext, dtype={'d': 'int64'}) - - @pytest.mark.parametrize("dtype,expected", [ - (None, - DataFrame({ - "a": [1, 2, 3, 4], - "b": [2.5, 3.5, 4.5, 5.5], - "c": [1, 2, 3, 4], - "d": [1.0, 2.0, np.nan, 4.0] - })), - ({"a": "float64", - "b": "float32", - "c": str, - "d": str - }, - DataFrame({ - "a": Series([1, 2, 3, 4], dtype="float64"), - "b": Series([2.5, 3.5, 4.5, 5.5], dtype="float32"), - "c": ["001", "002", "003", "004"], - "d": ["1", "2", np.nan, "4"] - })), - ]) - def test_reader_dtype_str(self, ext, dtype, expected): - # see gh-20377 - basename = "testdtype" - - actual = self.get_exceldf(basename, ext, dtype=dtype) - tm.assert_frame_equal(actual, expected) - - def test_reading_all_sheets(self, ext): - # Test reading all sheetnames by setting sheetname to None, - # Ensure a dict is returned. - # See PR #9450 - basename = 'test_multisheet' - dfs = self.get_exceldf(basename, ext, sheet_name=None) - # ensure this is not alphabetical to test order preservation - expected_keys = ['Charlie', 'Alpha', 'Beta'] - tm.assert_contains_all(expected_keys, dfs.keys()) - # Issue 9930 - # Ensure sheet order is preserved - assert expected_keys == list(dfs.keys()) - - def test_reading_multiple_specific_sheets(self, ext): - # Test reading specific sheetnames by specifying a mixed list - # of integers and strings, and confirm that duplicated sheet - # references (positions/names) are removed properly. - # Ensure a dict is returned - # See PR #9450 - basename = 'test_multisheet' - # Explicitly request duplicates. Only the set should be returned. - expected_keys = [2, 'Charlie', 'Charlie'] - dfs = self.get_exceldf(basename, ext, sheet_name=expected_keys) - expected_keys = list(set(expected_keys)) - tm.assert_contains_all(expected_keys, dfs.keys()) - assert len(expected_keys) == len(dfs.keys()) - - def test_reading_all_sheets_with_blank(self, ext): - # Test reading all sheetnames by setting sheetname to None, - # In the case where some sheets are blank. - # Issue #11711 - basename = 'blank_with_header' - dfs = self.get_exceldf(basename, ext, sheet_name=None) - expected_keys = ['Sheet1', 'Sheet2', 'Sheet3'] - tm.assert_contains_all(expected_keys, dfs.keys()) - - # GH6403 - def test_read_excel_blank(self, ext): - actual = self.get_exceldf('blank', ext, 'Sheet1') - tm.assert_frame_equal(actual, DataFrame()) - - def test_read_excel_blank_with_header(self, ext): - expected = DataFrame(columns=['col_1', 'col_2']) - actual = self.get_exceldf('blank_with_header', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([np.nan] * 4)), - (0, DataFrame({"Unnamed: 0": [np.nan] * 3})) - ]) - def test_read_one_empty_col_no_header(self, ext, header, expected): - # xref gh-12292 - filename = "no_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, filename, index=False, header=False) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - @pytest.mark.parametrize("header,expected", [ - (None, DataFrame([0] + [np.nan] * 4)), - (0, DataFrame([np.nan] * 4)) - ]) - def test_read_one_empty_col_with_header(self, ext, header, expected): - filename = "with_header" - df = pd.DataFrame( - [["", 1, 100], - ["", 2, 200], - ["", 3, 300], - ["", 4, 400]] - ) - - with ensure_clean(ext) as path: - df.to_excel(path, 'with_header', index=False, header=True) - result = read_excel(path, filename, usecols=[0], header=header) - - tm.assert_frame_equal(result, expected) - - @td.skip_if_no('openpyxl') - @td.skip_if_no('xlwt') - def test_set_column_names_in_parameter(self, ext): - # GH 12870 : pass down column names associated with - # keyword argument names - refdf = pd.DataFrame([[1, 'foo'], [2, 'bar'], - [3, 'baz']], columns=['a', 'b']) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as writer: - refdf.to_excel(writer, 'Data_no_head', - header=False, index=False) - refdf.to_excel(writer, 'Data_with_head', index=False) - - refdf.columns = ['A', 'B'] - - with ExcelFile(pth) as reader: - xlsdf_no_head = read_excel(reader, 'Data_no_head', - header=None, names=['A', 'B']) - xlsdf_with_head = read_excel(reader, 'Data_with_head', - index_col=None, names=['A', 'B']) - - tm.assert_frame_equal(xlsdf_no_head, refdf) - tm.assert_frame_equal(xlsdf_with_head, refdf) - - def test_date_conversion_overflow(self, ext): - # GH 10001 : pandas.ExcelFile ignore parse_dates=False - expected = pd.DataFrame([[pd.Timestamp('2016-03-12'), 'Marc Johnson'], - [pd.Timestamp('2016-03-16'), 'Jack Black'], - [1e+20, 'Timothy Brown']], - columns=['DateColWithBigInt', 'StringCol']) - - result = self.get_exceldf('testdateoverflow', ext) - tm.assert_frame_equal(result, expected) - - def test_sheet_name_both_raises(self, ext): - with pytest.raises(TypeError, match="Cannot specify both"): - self.get_exceldf('test1', ext, sheetname='Sheet1', - sheet_name='Sheet1') - - excel = self.get_excelfile('test1', ext) - with pytest.raises(TypeError, match="Cannot specify both"): - excel.parse(sheetname='Sheet1', - sheet_name='Sheet1') - - def test_excel_read_buffer(self, ext): - - pth = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(pth, 'Sheet1', index_col=0) - with open(pth, 'rb') as f: - actual = read_excel(f, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) - - with open(pth, 'rb') as f: - xls = ExcelFile(f) - actual = read_excel(xls, 'Sheet1', index_col=0) - tm.assert_frame_equal(expected, actual) - - def test_bad_engine_raises(self, ext): - bad_engine = 'foo' - with pytest.raises(ValueError, message="Unknown engine: foo"): - read_excel('', engine=bad_engine) - - @tm.network - def test_read_from_http_url(self, ext): - url = ('https://raw.github.com/pandas-dev/pandas/master/' - 'pandas/tests/io/data/test1' + ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1', ext) - tm.assert_frame_equal(url_table, local_table) - - @td.skip_if_not_us_locale - def test_read_from_s3_url(self, ext, s3_resource): - # Bucket "pandas-test" created in tests/io/conftest.py - file_name = os.path.join(self.dirpath, 'test1' + ext) - - with open(file_name, "rb") as f: - s3_resource.Bucket("pandas-test").put_object(Key="test1" + ext, - Body=f) - - url = ('s3://pandas-test/test1' + ext) - url_table = read_excel(url) - local_table = self.get_exceldf('test1', ext) - tm.assert_frame_equal(url_table, local_table) - - @pytest.mark.slow - # ignore warning from old xlrd - @pytest.mark.filterwarnings("ignore:This metho:PendingDeprecationWarning") - def test_read_from_file_url(self, ext): - - # FILE - localtable = os.path.join(self.dirpath, 'test1' + ext) - local_table = read_excel(localtable) - - try: - url_table = read_excel('file://localhost/' + localtable) - except URLError: - # fails on some systems - import platform - pytest.skip("failing on %s" % - ' '.join(platform.uname()).strip()) - - tm.assert_frame_equal(url_table, local_table) - - @td.skip_if_no('pathlib') - def test_read_from_pathlib_path(self, ext): - - # GH12655 - from pathlib import Path - - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) - - path_obj = Path(self.dirpath, 'test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) - - tm.assert_frame_equal(expected, actual) - - @td.skip_if_no('py.path') - def test_read_from_py_localpath(self, ext): - - # GH12655 - from py.path import local as LocalPath - - str_path = os.path.join(self.dirpath, 'test1' + ext) - expected = read_excel(str_path, 'Sheet1', index_col=0) - - abs_dir = os.path.abspath(self.dirpath) - path_obj = LocalPath(abs_dir).join('test1' + ext) - actual = read_excel(path_obj, 'Sheet1', index_col=0) - - tm.assert_frame_equal(expected, actual) - - def test_reader_closes_file(self, ext): - - pth = os.path.join(self.dirpath, 'test1' + ext) - f = open(pth, 'rb') - with ExcelFile(f) as xlsx: - # parses okay - read_excel(xlsx, 'Sheet1', index_col=0) - - assert f.closed - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_creating_and_reading_multiple_sheets(self, ext): - # see gh-9450 - # - # Test reading multiple sheets, from a runtime - # created Excel file with multiple sheets. - def tdf(col_sheet_name): - d, i = [11, 22, 33], [1, 2, 3] - return DataFrame(d, i, columns=[col_sheet_name]) - - sheets = ["AAA", "BBB", "CCC"] - - dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets, dfs)) - - with ensure_clean(ext) as pth: - with ExcelWriter(pth) as ew: - for sheetname, df in iteritems(dfs): - df.to_excel(ew, sheetname) - - dfs_returned = read_excel(pth, sheet_name=sheets, index_col=0) - - for s in sheets: - tm.assert_frame_equal(dfs[s], dfs_returned[s]) - - def test_reader_seconds(self, ext): - - # Test reading times with and without milliseconds. GH5945. - expected = DataFrame.from_dict({"Time": [time(1, 2, 3), - time(2, 45, 56, 100000), - time(4, 29, 49, 200000), - time(6, 13, 42, 300000), - time(7, 57, 35, 400000), - time(9, 41, 28, 500000), - time(11, 25, 21, 600000), - time(13, 9, 14, 700000), - time(14, 53, 7, 800000), - time(16, 37, 0, 900000), - time(18, 20, 54)]}) - - actual = self.get_exceldf('times_1900', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - actual = self.get_exceldf('times_1904', ext, 'Sheet1') - tm.assert_frame_equal(actual, expected) - - def test_read_excel_multiindex(self, ext): - # see gh-4679 - mi = MultiIndex.from_product([["foo", "bar"], ["a", "b"]]) - mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) - - # "mi_column" sheet - expected = DataFrame([[1, 2.5, pd.Timestamp("2015-01-01"), True], - [2, 3.5, pd.Timestamp("2015-01-02"), False], - [3, 4.5, pd.Timestamp("2015-01-03"), False], - [4, 5.5, pd.Timestamp("2015-01-04"), True]], - columns=mi) - - actual = read_excel(mi_file, "mi_column", header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - # "mi_index" sheet - expected.index = mi - expected.columns = ["a", "b", "c", "d"] - - actual = read_excel(mi_file, "mi_index", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - # "both" sheet - expected.columns = mi - - actual = read_excel(mi_file, "both", index_col=[0, 1], header=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - # "mi_index_name" sheet - expected.columns = ["a", "b", "c", "d"] - expected.index = mi.set_names(["ilvl1", "ilvl2"]) - - actual = read_excel(mi_file, "mi_index_name", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "mi_column_name" sheet - expected.index = list(range(4)) - expected.columns = mi.set_names(["c1", "c2"]) - actual = read_excel(mi_file, "mi_column_name", - header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - # see gh-11317 - # "name_with_int" sheet - expected.columns = mi.set_levels( - [1, 2], level=1).set_names(["c1", "c2"]) - - actual = read_excel(mi_file, "name_with_int", - index_col=0, header=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "both_name" sheet - expected.columns = mi.set_names(["c1", "c2"]) - expected.index = mi.set_names(["ilvl1", "ilvl2"]) - - actual = read_excel(mi_file, "both_name", - index_col=[0, 1], header=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # "both_skiprows" sheet - actual = read_excel(mi_file, "both_name_skiprows", index_col=[0, 1], - header=[0, 1], skiprows=2) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_multiindex_header_only(self, ext): - # see gh-11733. - # - # Don't try to parse a header name if there isn't one. - mi_file = os.path.join(self.dirpath, "testmultiindex" + ext) - result = read_excel(mi_file, "index_col_none", header=[0, 1]) - - exp_columns = MultiIndex.from_product([("A", "B"), ("key", "val")]) - expected = DataFrame([[1, 2, 3, 4]] * 2, columns=exp_columns) - tm.assert_frame_equal(result, expected) - - @td.skip_if_no("xlsxwriter") - def test_read_excel_multiindex_empty_level(self, ext): - # see gh-12453 - with ensure_clean(ext) as path: - df = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", ""): {0: 0} - }) - - expected = DataFrame({ - ("One", "x"): {0: 1}, - ("Two", "X"): {0: 3}, - ("Two", "Y"): {0: 7}, - ("Zero", "Unnamed: 4_level_1"): {0: 0} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - df = pd.DataFrame({ - ("Beg", ""): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - expected = pd.DataFrame({ - ("Beg", "Unnamed: 1_level_1"): {0: 0}, - ("Middle", "x"): {0: 1}, - ("Tail", "X"): {0: 3}, - ("Tail", "Y"): {0: 7} - }) - - df.to_excel(path) - actual = pd.read_excel(path, header=[0, 1], index_col=0) - tm.assert_frame_equal(actual, expected) - - @td.skip_if_no("xlsxwriter") - @pytest.mark.parametrize("c_idx_names", [True, False]) - @pytest.mark.parametrize("r_idx_names", [True, False]) - @pytest.mark.parametrize("c_idx_levels", [1, 3]) - @pytest.mark.parametrize("r_idx_levels", [1, 3]) - def test_excel_multindex_roundtrip(self, ext, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels): - # see gh-4679 - with ensure_clean(ext) as pth: - if c_idx_levels == 1 and c_idx_names: - pytest.skip("Column index name cannot be " - "serialized unless it's a MultiIndex") - - # Empty name case current read in as - # unnamed levels, not Nones. - check_names = r_idx_names or r_idx_levels <= 1 - - df = mkdf(5, 5, c_idx_names, r_idx_names, - c_idx_levels, r_idx_levels) - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[0, :] = np.nan - df.to_excel(pth) - - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - df.iloc[-1, :] = np.nan - df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) - - def test_excel_old_index_format(self, ext): - # see gh-4679 - filename = "test_index_name_pre17" + ext - in_file = os.path.join(self.dirpath, filename) - - # We detect headers to determine if index names exist, so - # that "index" name in the "names" version of the data will - # now be interpreted as rows that include null data. - data = np.array([[None, None, None, None, None], - ["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) - columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R0", "R_l0_g0", "R_l0_g1", - "R_l0_g2", "R_l0_g3", "R_l0_g4"], - ["R1", "R_l1_g0", "R_l1_g1", - "R_l1_g2", "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4, 5], [0, 1, 2, 3, 4, 5]], - names=[None, None]) - si = Index(["R0", "R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) - - expected = pd.DataFrame(data, index=si, columns=columns) - - actual = pd.read_excel(in_file, "single_names", index_col=0) - tm.assert_frame_equal(actual, expected) - - expected.index = mi - - actual = pd.read_excel(in_file, "multi_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected) - - # The analogous versions of the "names" version data - # where there are explicitly no names for the indices. - data = np.array([["R0C0", "R0C1", "R0C2", "R0C3", "R0C4"], - ["R1C0", "R1C1", "R1C2", "R1C3", "R1C4"], - ["R2C0", "R2C1", "R2C2", "R2C3", "R2C4"], - ["R3C0", "R3C1", "R3C2", "R3C3", "R3C4"], - ["R4C0", "R4C1", "R4C2", "R4C3", "R4C4"]]) - columns = ["C_l0_g0", "C_l0_g1", "C_l0_g2", "C_l0_g3", "C_l0_g4"] - mi = MultiIndex(levels=[["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], - ["R_l1_g0", "R_l1_g1", "R_l1_g2", - "R_l1_g3", "R_l1_g4"]], - codes=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], - names=[None, None]) - si = Index(["R_l0_g0", "R_l0_g1", "R_l0_g2", - "R_l0_g3", "R_l0_g4"], name=None) - - expected = pd.DataFrame(data, index=si, columns=columns) - - actual = pd.read_excel(in_file, "single_no_names", index_col=0) - tm.assert_frame_equal(actual, expected) - - expected.index = mi - - actual = pd.read_excel(in_file, "multi_no_names", index_col=[0, 1]) - tm.assert_frame_equal(actual, expected, check_names=False) - - def test_read_excel_bool_header_arg(self, ext): - # GH 6114 - for arg in [True, False]: - with pytest.raises(TypeError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - header=arg) - - def test_read_excel_chunksize(self, ext): - # GH 8011 - with pytest.raises(NotImplementedError): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - chunksize=100) - - @td.skip_if_no("xlwt") - @td.skip_if_no("openpyxl") - def test_read_excel_parse_dates(self, ext): - # see gh-11544, gh-12051 - df = DataFrame( - {"col": [1, 2, 3], - "date_strings": pd.date_range("2012-01-01", periods=3)}) - df2 = df.copy() - df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") - - with ensure_clean(ext) as pth: - df2.to_excel(pth) - - res = read_excel(pth, index_col=0) - tm.assert_frame_equal(df2, res) - - res = read_excel(pth, parse_dates=["date_strings"], index_col=0) - tm.assert_frame_equal(df, res) - - date_parser = lambda x: pd.datetime.strptime(x, "%m/%d/%Y") - res = read_excel(pth, parse_dates=["date_strings"], - date_parser=date_parser, index_col=0) - tm.assert_frame_equal(df, res) - - def test_read_excel_skiprows_list(self, ext): - # GH 4903 - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), - 'skiprows_list', skiprows=[0, 2]) - expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns=['a', 'b', 'c', 'd']) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(os.path.join(self.dirpath, - 'testskiprows' + ext), - 'skiprows_list', skiprows=np.array([0, 2])) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows(self, ext): - # GH 16645 - num_rows_to_pull = 5 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) - expected = expected[:num_rows_to_pull] - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_greater_than_nrows_in_file(self, ext): - # GH 16645 - expected = pd.read_excel(os.path.join(self.dirpath, - 'test1' + ext)) - num_records_in_file = len(expected) - num_rows_to_pull = num_records_in_file + 10 - actual = pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows=num_rows_to_pull) - tm.assert_frame_equal(actual, expected) - - def test_read_excel_nrows_non_integer_parameter(self, ext): - # GH 16645 - msg = "'nrows' must be an integer >=0" - with pytest.raises(ValueError, match=msg): - pd.read_excel(os.path.join(self.dirpath, 'test1' + ext), - nrows='5') - - def test_read_excel_squeeze(self, ext): - # GH 12157 - f = os.path.join(self.dirpath, 'test_squeeze' + ext) - - actual = pd.read_excel(f, 'two_columns', index_col=0, squeeze=True) - expected = pd.Series([2, 3, 4], [4, 5, 6], name='b') - expected.index.name = 'a' - tm.assert_series_equal(actual, expected) - - actual = pd.read_excel(f, 'two_columns', squeeze=True) - expected = pd.DataFrame({'a': [4, 5, 6], - 'b': [2, 3, 4]}) - tm.assert_frame_equal(actual, expected) - - actual = pd.read_excel(f, 'one_column', squeeze=True) - expected = pd.Series([1, 2, 3], name='a') - tm.assert_series_equal(actual, expected) + def get_exceldf(self, basename, ext, *args, **kwds): + """ + Return test data DataFrame. + Parameters + ---------- + basename : str + File base name, excluding file extension. + Returns + ------- + df : DataFrame + """ + pth = os.path.join(self.dirpath, basename + ext) + return read_excel(pth, *args, **kwds) class _WriterBase(SharedItems): @@ -936,15 +92,12 @@ class _WriterBase(SharedItems): @pytest.fixture(autouse=True) def set_engine_and_path(self, request, merge_cells, engine, ext): """Fixture to set engine and open file for use in each test case - Rather than requiring `engine=...` to be provided explicitly as an argument in each test, this fixture sets a global option to dictate which engine should be used to write Excel files. After executing the test it rolls back said change to the global option. - It also uses a context manager to open a temporary excel file for the function to write to, accessible via `self.path` - Notes ----- This fixture will run as part of each test method defined in the @@ -974,6 +127,20 @@ class and any subclasses, on account of the `autouse=True` class TestExcelWriter(_WriterBase): # Base class for test cases to run with different Excel writers. + def test_excel_sheet_by_name_raise(self, *_): + import xlrd + + gt = DataFrame(np.random.randn(10, 2)) + gt.to_excel(self.path) + + xl = ExcelFile(self.path) + df = read_excel(xl, 0, index_col=0) + + tm.assert_frame_equal(gt, df) + + with pytest.raises(xlrd.XLRDError): + read_excel(xl, "0") + def test_excel_writer_context_manager(self, *_): with ExcelWriter(self.path) as writer: self.frame.to_excel(writer, "Data1") @@ -1600,7 +767,6 @@ def roundtrip(data, header=True, parser_hdr=0, index=True): nrows = 5 ncols = 3 - from pandas.util.testing import makeCustomDataframe as mkdf # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 diff --git a/pandas/tests/io/excel/test_xlrd.py b/pandas/tests/io/excel/test_xlrd.py deleted file mode 100644 index bd0f9d7966de3..0000000000000 --- a/pandas/tests/io/excel/test_xlrd.py +++ /dev/null @@ -1,364 +0,0 @@ -import contextlib -from functools import partial -import os -import warnings - -import numpy as np -import pytest - -import pandas.util._test_decorators as td - -from pandas import DataFrame -from pandas.core.config import get_option, set_option -import pandas.util.testing as tm -from pandas.util.testing import ensure_clean - -from pandas.io.excel import ExcelFile, read_excel -from pandas.io.parsers import read_csv - -_seriesd = tm.getSeriesData() -_tsd = tm.getTimeSeriesData() -_frame = DataFrame(_seriesd)[:10] -_frame2 = DataFrame(_seriesd, columns=['D', 'C', 'B', 'A'])[:10] -_tsframe = tm.makeTimeDataFrame()[:5] -_mixed_frame = _frame.copy() -_mixed_frame['foo'] = 'bar' - - -@contextlib.contextmanager -def ignore_xlrd_time_clock_warning(): - """ - Context manager to ignore warnings raised by the xlrd library, - regarding the deprecation of `time.clock` in Python 3.7. - """ - with warnings.catch_warnings(): - warnings.filterwarnings( - action='ignore', - message='time.clock has been deprecated', - category=DeprecationWarning) - yield - - -@td.skip_if_no('xlrd', '1.0.0') -class XlrdSharedItems(object): - - @pytest.fixture(autouse=True) - def setup_method(self, datapath): - self.dirpath = datapath("io", "data") - self.frame = _frame.copy() - self.frame2 = _frame2.copy() - self.tsframe = _tsframe.copy() - self.mixed_frame = _mixed_frame.copy() - - def get_csv_refdf(self, basename): - """ - Obtain the reference data from read_csv with the Python engine. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - dfref : DataFrame - """ - pref = os.path.join(self.dirpath, basename + '.csv') - dfref = read_csv(pref, index_col=0, parse_dates=True, engine='python') - return dfref - - def get_excelfile(self, basename, ext): - """ - Return test data ExcelFile instance. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - excel : io.excel.ExcelFile - """ - return ExcelFile(os.path.join(self.dirpath, basename + ext)) - - def get_exceldf(self, basename, ext, *args, **kwds): - """ - Return test data DataFrame. - - Parameters - ---------- - - basename : str - File base name, excluding file extension. - - Returns - ------- - - df : DataFrame - """ - pth = os.path.join(self.dirpath, basename + ext) - return read_excel(pth, *args, **kwds) - - -class XlrdReadingTestsBase(XlrdSharedItems): - # This is based on ExcelWriterBase - - @pytest.fixture(autouse=True, params=['xlrd']) - def set_engine(self, request): - func_name = "get_exceldf" - old_func = getattr(self, func_name) - new_func = partial(old_func, engine=request.param) - setattr(self, func_name, new_func) - yield - setattr(self, func_name, old_func) - - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_usecols_int(self, ext): - - df_ref = self.get_csv_refdf("test1") - df_ref = df_ref.reindex(columns=["A", "B", "C"]) - - # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df1 = self.get_exceldf("test1", ext, "Sheet1", - index_col=0, usecols=3) - - # usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, usecols=3) - - # parse_cols instead of usecols, usecols as int - with tm.assert_produces_warning(FutureWarning, - check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf("test1", ext, "Sheet2", skiprows=[1], - index_col=0, parse_cols=3) - - # TODO add index to xls file) - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - tm.assert_frame_equal(df3, df_ref, check_names=False) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_list(self, ext): - - dfref = self.get_csv_refdf('test1') - dfref = dfref.reindex(columns=['B', 'C']) - df1 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols=[0, 2, 3]) - df2 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols=[0, 2, 3]) - - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols=[0, 2, 3]) - - # TODO add index to xls file) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - tm.assert_frame_equal(df3, dfref, check_names=False) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_usecols_str(self, ext): - - dfref = self.get_csv_refdf('test1') - - df1 = dfref.reindex(columns=['A', 'B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A:D') - - with tm.assert_produces_warning(FutureWarning): - with ignore_xlrd_time_clock_warning(): - df4 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, parse_cols='A:D') - - # TODO add index to xls, read xls ignores index name ? - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - tm.assert_frame_equal(df4, df1, check_names=False) - - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C,D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C,D') - # TODO add index to xls file - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - - df1 = dfref.reindex(columns=['B', 'C']) - df2 = self.get_exceldf('test1', ext, 'Sheet1', index_col=0, - usecols='A,C:D') - df3 = self.get_exceldf('test1', ext, 'Sheet2', skiprows=[1], - index_col=0, usecols='A,C:D') - tm.assert_frame_equal(df2, df1, check_names=False) - tm.assert_frame_equal(df3, df1, check_names=False) - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_deprecated_sheetname(self, ext): - # gh-17964 - excel = self.get_excelfile('test1', ext) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - read_excel(excel, sheetname='Sheet1') - - with pytest.raises(TypeError): - read_excel(excel, sheet='Sheet1') - - @td.skip_if_no('xlrd', '1.0.1') # GH-22682 - def test_excel_table_sheet_by_index(self, ext): - - excel = self.get_excelfile('test1', ext) - dfref = self.get_csv_refdf('test1') - - df1 = read_excel(excel, 0, index_col=0) - df2 = read_excel(excel, 1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df1 = excel.parse(0, index_col=0) - df2 = excel.parse(1, skiprows=[1], index_col=0) - tm.assert_frame_equal(df1, dfref, check_names=False) - tm.assert_frame_equal(df2, dfref, check_names=False) - - df3 = read_excel(excel, 0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df4 = read_excel(excel, 0, index_col=0, skip_footer=1) - tm.assert_frame_equal(df3, df4) - - df3 = excel.parse(0, index_col=0, skipfooter=1) - tm.assert_frame_equal(df3, df1.iloc[:-1]) - - import xlrd - with pytest.raises(xlrd.XLRDError): - read_excel(excel, 'asdf') - - @td.skip_if_no("xlrd", "1.0.1") # see gh-22682 - def test_sheet_name_and_sheetname(self, ext): - # gh-10559: Minor improvement: Change "sheet_name" to "sheetname" - # gh-10969: DOC: Consistent var names (sheetname vs sheet_name) - # gh-12604: CLN GH10559 Rename sheetname variable to sheet_name - # gh-20920: ExcelFile.parse() and pd.read_xlsx() have different - # behavior for "sheetname" argument - filename = "test1" - sheet_name = "Sheet1" - - df_ref = self.get_csv_refdf(filename) - df1 = self.get_exceldf(filename, ext, - sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - with ignore_xlrd_time_clock_warning(): - df2 = self.get_exceldf(filename, ext, index_col=0, - sheetname=sheet_name) # backward compat - - excel = self.get_excelfile(filename, ext) - df1_parse = excel.parse(sheet_name=sheet_name, index_col=0) # doc - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): - df2_parse = excel.parse(index_col=0, - sheetname=sheet_name) # backward compat - - tm.assert_frame_equal(df1, df_ref, check_names=False) - tm.assert_frame_equal(df2, df_ref, check_names=False) - tm.assert_frame_equal(df1_parse, df_ref, check_names=False) - tm.assert_frame_equal(df2_parse, df_ref, check_names=False) - - -@pytest.mark.parametrize("ext", ['.xls', '.xlsx', '.xlsm']) -class TestXlrdReader(XlrdReadingTestsBase): - """ - This is the base class for the xlrd tests, and 3 different file formats - are supported: xls, xlsx, xlsm - """ - - @td.skip_if_no("xlwt") - def test_read_xlrd_book(self, ext): - import xlrd - df = self.frame - - engine = "xlrd" - sheet_name = "SheetA" - - with ensure_clean(ext) as pth: - df.to_excel(pth, sheet_name) - book = xlrd.open_workbook(pth) - - with ExcelFile(book, engine=engine) as xl: - result = read_excel(xl, sheet_name, index_col=0) - tm.assert_frame_equal(df, result) - - result = read_excel(book, sheet_name=sheet_name, - engine=engine, index_col=0) - tm.assert_frame_equal(df, result) - - -class _XlrdWriterBase(XlrdSharedItems): - - @pytest.fixture(autouse=True) - def set_engine_and_path(self, request, merge_cells, engine, ext): - """Fixture to set engine and open file for use in each test case - - Rather than requiring `engine=...` to be provided explicitly as an - argument in each test, this fixture sets a global option to dictate - which engine should be used to write Excel files. After executing - the test it rolls back said change to the global option. - - It also uses a context manager to open a temporary excel file for - the function to write to, accessible via `self.path` - - Notes - ----- - This fixture will run as part of each test method defined in the - class and any subclasses, on account of the `autouse=True` - argument - """ - option_name = 'io.excel.{ext}.writer'.format(ext=ext.strip('.')) - prev_engine = get_option(option_name) - set_option(option_name, engine) - with ensure_clean(ext) as path: - self.path = path - yield - set_option(option_name, prev_engine) # Roll back option change - - -@pytest.mark.parametrize("merge_cells", [True, False]) -@pytest.mark.parametrize("engine,ext", [ - pytest.param('openpyxl', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('openpyxl', '.xlsm', marks=pytest.mark.skipif( - not td.safe_import('openpyxl'), reason='No openpyxl')), - pytest.param('xlwt', '.xls', marks=pytest.mark.skipif( - not td.safe_import('xlwt'), reason='No xlwt')), - pytest.param('xlsxwriter', '.xlsx', marks=pytest.mark.skipif( - not td.safe_import('xlsxwriter'), reason='No xlsxwriter')) -]) -class XlrdTestExcelWriter(_XlrdWriterBase): - # Base class for test cases to run with different Excel writers. - - def test_excel_sheet_by_name_raise(self, *_): - import xlrd - - gt = DataFrame(np.random.randn(10, 2)) - gt.to_excel(self.path) - - xl = ExcelFile(self.path) - df = read_excel(xl, 0, index_col=0) - - tm.assert_frame_equal(gt, df) - - with pytest.raises(xlrd.XLRDError): - read_excel(xl, "0")