from datetime import date, datetime, timedelta from functools import partial from io import BytesIO import os import numpy as np import pytest import pandas.util._test_decorators as td import pandas as pd from pandas import DataFrame, Index, MultiIndex, get_option, set_option import pandas._testing as tm from pandas.io.excel import ( ExcelFile, ExcelWriter, _OpenpyxlWriter, _XlsxWriter, _XlwtWriter, register_writer, ) @pytest.fixture def path(ext): """ Fixture to open file for use in each test case. """ with tm.ensure_clean(ext) as file_path: yield file_path @pytest.fixture def set_engine(engine, ext): """ Fixture to set engine for use in each test case. Rather than requiring `engine=...` to be provided explicitly as an argument in each test, this fixture sets a global option to dictate which engine should be used to write Excel files. After executing the test it rolls back said change to the global option. """ option_name = "io.excel.{ext}.writer".format(ext=ext.strip(".")) prev_engine = get_option(option_name) set_option(option_name, engine) yield set_option(option_name, prev_engine) # Roll back option change @td.skip_if_no("xlrd") @pytest.mark.parametrize("ext", [".xls", ".xlsx", ".xlsm"]) class TestRoundTrip: @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") @pytest.mark.parametrize( "header,expected", [(None, DataFrame([np.nan] * 4)), (0, DataFrame({"Unnamed: 0": [np.nan] * 3}))], ) def test_read_one_empty_col_no_header(self, ext, header, expected): # xref gh-12292 filename = "no_header" df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: df.to_excel(path, filename, index=False, header=False) result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") @pytest.mark.parametrize( "header,expected", [(None, DataFrame([0] + [np.nan] * 4)), (0, DataFrame([np.nan] * 4))], ) def test_read_one_empty_col_with_header(self, ext, header, expected): filename = "with_header" df = pd.DataFrame([["", 1, 100], ["", 2, 200], ["", 3, 300], ["", 4, 400]]) with tm.ensure_clean(ext) as path: df.to_excel(path, "with_header", index=False, header=True) result = pd.read_excel(path, filename, usecols=[0], header=header) tm.assert_frame_equal(result, expected) @td.skip_if_no("openpyxl") @td.skip_if_no("xlwt") def test_set_column_names_in_parameter(self, ext): # GH 12870 : pass down column names associated with # keyword argument names refdf = pd.DataFrame([[1, "foo"], [2, "bar"], [3, "baz"]], columns=["a", "b"]) with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as writer: refdf.to_excel(writer, "Data_no_head", header=False, index=False) refdf.to_excel(writer, "Data_with_head", index=False) refdf.columns = ["A", "B"] with ExcelFile(pth) as reader: xlsdf_no_head = pd.read_excel( reader, "Data_no_head", header=None, names=["A", "B"] ) xlsdf_with_head = pd.read_excel( reader, "Data_with_head", index_col=None, names=["A", "B"] ) tm.assert_frame_equal(xlsdf_no_head, refdf) tm.assert_frame_equal(xlsdf_with_head, refdf) @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") def test_creating_and_reading_multiple_sheets(self, ext): # see gh-9450 # # Test reading multiple sheets, from a runtime # created Excel file with multiple sheets. def tdf(col_sheet_name): d, i = [11, 22, 33], [1, 2, 3] return DataFrame(d, i, columns=[col_sheet_name]) sheets = ["AAA", "BBB", "CCC"] dfs = [tdf(s) for s in sheets] dfs = dict(zip(sheets, dfs)) with tm.ensure_clean(ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in dfs.items(): df.to_excel(ew, sheetname) dfs_returned = pd.read_excel(pth, sheet_name=sheets, index_col=0) for s in sheets: tm.assert_frame_equal(dfs[s], dfs_returned[s]) @td.skip_if_no("xlsxwriter") def test_read_excel_multiindex_empty_level(self, ext): # see gh-12453 with tm.ensure_clean(ext) as path: df = DataFrame( { ("One", "x"): {0: 1}, ("Two", "X"): {0: 3}, ("Two", "Y"): {0: 7}, ("Zero", ""): {0: 0}, } ) expected = DataFrame( { ("One", "x"): {0: 1}, ("Two", "X"): {0: 3}, ("Two", "Y"): {0: 7}, ("Zero", "Unnamed: 4_level_1"): {0: 0}, } ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) df = pd.DataFrame( { ("Beg", ""): {0: 0}, ("Middle", "x"): {0: 1}, ("Tail", "X"): {0: 3}, ("Tail", "Y"): {0: 7}, } ) expected = pd.DataFrame( { ("Beg", "Unnamed: 1_level_1"): {0: 0}, ("Middle", "x"): {0: 1}, ("Tail", "X"): {0: 3}, ("Tail", "Y"): {0: 7}, } ) df.to_excel(path) actual = pd.read_excel(path, header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) @td.skip_if_no("xlsxwriter") @pytest.mark.parametrize("c_idx_names", [True, False]) @pytest.mark.parametrize("r_idx_names", [True, False]) @pytest.mark.parametrize("c_idx_levels", [1, 3]) @pytest.mark.parametrize("r_idx_levels", [1, 3]) def test_excel_multindex_roundtrip( self, ext, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels ): # see gh-4679 with tm.ensure_clean(ext) as pth: if c_idx_levels == 1 and c_idx_names: pytest.skip( "Column index name cannot be serialized unless it's a MultiIndex" ) # Empty name case current read in as # unnamed levels, not Nones. check_names = r_idx_names or r_idx_levels <= 1 df = tm.makeCustomDataframe( 5, 5, c_idx_names, r_idx_names, c_idx_levels, r_idx_levels ) df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) act = pd.read_excel( pth, index_col=list(range(r_idx_levels)), header=list(range(c_idx_levels)), ) tm.assert_frame_equal(df, act, check_names=check_names) @td.skip_if_no("xlwt") @td.skip_if_no("openpyxl") def test_read_excel_parse_dates(self, ext): # see gh-11544, gh-12051 df = DataFrame( {"col": [1, 2, 3], "date_strings": pd.date_range("2012-01-01", periods=3)} ) df2 = df.copy() df2["date_strings"] = df2["date_strings"].dt.strftime("%m/%d/%Y") with tm.ensure_clean(ext) as pth: df2.to_excel(pth) res = pd.read_excel(pth, index_col=0) tm.assert_frame_equal(df2, res) res = pd.read_excel(pth, parse_dates=["date_strings"], index_col=0) tm.assert_frame_equal(df, res) date_parser = lambda x: datetime.strptime(x, "%m/%d/%Y") res = pd.read_excel( pth, parse_dates=["date_strings"], date_parser=date_parser, index_col=0 ) tm.assert_frame_equal(df, res) @td.skip_if_no("xlrd") @pytest.mark.parametrize( "engine,ext", [ pytest.param("openpyxl", ".xlsx", marks=td.skip_if_no("openpyxl")), pytest.param("openpyxl", ".xlsm", marks=td.skip_if_no("openpyxl")), pytest.param("xlwt", ".xls", marks=td.skip_if_no("xlwt")), pytest.param("xlsxwriter", ".xlsx", marks=td.skip_if_no("xlsxwriter")), ], ) @pytest.mark.usefixtures("set_engine") class TestExcelWriter: def test_excel_sheet_size(self, path): # GH 26080 breaking_row_count = 2 ** 20 + 1 breaking_col_count = 2 ** 14 + 1 # purposely using two arrays to prevent memory issues while testing row_arr = np.zeros(shape=(breaking_row_count, 1)) col_arr = np.zeros(shape=(1, breaking_col_count)) row_df = pd.DataFrame(row_arr) col_df = pd.DataFrame(col_arr) msg = "sheet is too large" with pytest.raises(ValueError, match=msg): row_df.to_excel(path) with pytest.raises(ValueError, match=msg): col_df.to_excel(path) def test_excel_sheet_by_name_raise(self, path): import xlrd gt = DataFrame(np.random.randn(10, 2)) gt.to_excel(path) xl = ExcelFile(path) df = pd.read_excel(xl, 0, index_col=0) tm.assert_frame_equal(gt, df) with pytest.raises(xlrd.XLRDError): pd.read_excel(xl, "0") def test_excel_writer_context_manager(self, frame, path): with ExcelWriter(path) as writer: frame.to_excel(writer, "Data1") frame2 = frame.copy() frame2.columns = frame.columns[::-1] frame2.to_excel(writer, "Data2") with ExcelFile(path) as reader: found_df = pd.read_excel(reader, "Data1", index_col=0) found_df2 = pd.read_excel(reader, "Data2", index_col=0) tm.assert_frame_equal(found_df, frame) tm.assert_frame_equal(found_df2, frame2) def test_roundtrip(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # test roundtrip frame.to_excel(path, "test1") recons = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", index=False) recons = pd.read_excel(path, "test1", index_col=None) recons.index = frame.index tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", na_rep="NA") recons = pd.read_excel(path, "test1", index_col=0, na_values=["NA"]) tm.assert_frame_equal(frame, recons) # GH 3611 frame.to_excel(path, "test1", na_rep="88") recons = pd.read_excel(path, "test1", index_col=0, na_values=["88"]) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "test1", na_rep="88") recons = pd.read_excel(path, "test1", index_col=0, na_values=[88, 88.0]) tm.assert_frame_equal(frame, recons) # GH 6573 frame.to_excel(path, "Sheet1") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) frame.to_excel(path, "0") recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(frame, recons) # GH 8825 Pandas Series should provide to_excel method s = frame["A"] s.to_excel(path) recons = pd.read_excel(path, index_col=0) tm.assert_frame_equal(s.to_frame(), recons) def test_mixed(self, frame, path): mixed_frame = frame.copy() mixed_frame["foo"] = "bar" mixed_frame.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(mixed_frame, recons) def test_ts_frame(self, tsframe, path): df = tsframe df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) def test_basics_with_nan(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) @pytest.mark.parametrize("np_type", [np.int8, np.int16, np.int32, np.int64]) def test_int_types(self, np_type, path): # Test np.int values read come back as int # (rather than float which is Excel's format). df = DataFrame(np.random.randint(-10, 10, size=(10, 2)), dtype=np_type) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) int_frame = df.astype(np.int64) tm.assert_frame_equal(int_frame, recons) recons2 = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(int_frame, recons2) # Test with convert_float=False comes back as float. float_frame = df.astype(float) recons = pd.read_excel(path, "test1", convert_float=False, index_col=0) tm.assert_frame_equal( recons, float_frame, check_index_type=False, check_column_type=False ) @pytest.mark.parametrize("np_type", [np.float16, np.float32, np.float64]) def test_float_types(self, np_type, path): # Test np.float values read come back as float. df = DataFrame(np.random.random_sample(10), dtype=np_type) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(df, recons, check_dtype=False) @pytest.mark.parametrize("np_type", [np.bool8, np.bool_]) def test_bool_types(self, np_type, path): # Test np.bool values read come back as float. df = DataFrame([1, 0, True, False], dtype=np_type) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np_type) tm.assert_frame_equal(df, recons) def test_inf_roundtrip(self, path): df = DataFrame([(1, np.inf), (2, 3), (5, -np.inf)]) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(df, recons) def test_sheets(self, frame, tsframe, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # Test writing to separate sheets writer = ExcelWriter(path) frame.to_excel(writer, "test1") tsframe.to_excel(writer, "test2") writer.save() reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(frame, recons) recons = pd.read_excel(reader, "test2", index_col=0) tm.assert_frame_equal(tsframe, recons) assert 2 == len(reader.sheet_names) assert "test1" == reader.sheet_names[0] assert "test2" == reader.sheet_names[1] def test_colaliases(self, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # column aliases col_aliases = Index(["AA", "X", "Y", "Z"]) frame.to_excel(path, "test1", header=col_aliases) reader = ExcelFile(path) rs = pd.read_excel(reader, "test1", index_col=0) xp = frame.copy() xp.columns = col_aliases tm.assert_frame_equal(xp, rs) def test_roundtrip_indexlabels(self, merge_cells, frame, path): frame = frame.copy() frame["A"][:5] = np.nan frame.to_excel(path, "test1") frame.to_excel(path, "test1", columns=["A", "B"]) frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", index=False) # test index_label df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel(path, "test1", index_label=["test"], merge_cells=merge_cells) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel( path, "test1", index_label=["test", "dummy", "dummy2"], merge_cells=merge_cells, ) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] assert df.index.names == recons.index.names df = DataFrame(np.random.randn(10, 2)) >= 0 df.to_excel(path, "test1", index_label="test", merge_cells=merge_cells) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0).astype(np.int64) df.index.names = ["test"] tm.assert_frame_equal(df, recons.astype(bool)) frame.to_excel( path, "test1", columns=["A", "B", "C", "D"], index=False, merge_cells=merge_cells, ) # take 'A' and 'B' as indexes (same row as cols 'C', 'D') df = frame.copy() df = df.set_index(["A", "B"]) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(df, recons, check_less_precise=True) def test_excel_roundtrip_indexname(self, merge_cells, path): df = DataFrame(np.random.randn(10, 4)) df.index.name = "foo" df.to_excel(path, merge_cells=merge_cells) xf = ExcelFile(path) result = pd.read_excel(xf, xf.sheet_names[0], index_col=0) tm.assert_frame_equal(result, df) assert result.index.name == "foo" def test_excel_roundtrip_datetime(self, merge_cells, tsframe, path): # datetime.date, not sure what to test here exactly tsf = tsframe.copy() tsf.index = [x.date() for x in tsframe.index] tsf.to_excel(path, "test1", merge_cells=merge_cells) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(tsframe, recons) def test_excel_date_datetime_format(self, engine, ext, path): # see gh-4133 # # Excel output format strings df = DataFrame( [ [date(2014, 1, 31), date(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], ], index=["DATE", "DATETIME"], columns=["X", "Y"], ) df_expected = DataFrame( [ [datetime(2014, 1, 31), datetime(1999, 9, 24)], [datetime(1998, 5, 26, 23, 33, 4), datetime(2014, 2, 28, 13, 5, 13)], ], index=["DATE", "DATETIME"], columns=["X", "Y"], ) with tm.ensure_clean(ext) as filename2: writer1 = ExcelWriter(path) writer2 = ExcelWriter( filename2, date_format="DD.MM.YYYY", datetime_format="DD.MM.YYYY HH-MM-SS", ) df.to_excel(writer1, "test1") df.to_excel(writer2, "test1") writer1.close() writer2.close() reader1 = ExcelFile(path) reader2 = ExcelFile(filename2) rs1 = pd.read_excel(reader1, "test1", index_col=0) rs2 = pd.read_excel(reader2, "test1", index_col=0) tm.assert_frame_equal(rs1, rs2) # Since the reader returns a datetime object for dates, # we need to use df_expected to check the result. tm.assert_frame_equal(rs2, df_expected) def test_to_excel_interval_no_labels(self, path): # see gh-19242 # # Test writing Interval without labels. df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() df["new"] = pd.cut(df[0], 10) expected["new"] = pd.cut(expected[0], 10).astype(str) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_interval_labels(self, path): # see gh-19242 # # Test writing Interval with labels. df = DataFrame(np.random.randint(-10, 10, size=(20, 1)), dtype=np.int64) expected = df.copy() intervals = pd.cut( df[0], 10, labels=["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"] ) df["new"] = intervals expected["new"] = pd.Series(list(intervals)) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_timedelta(self, path): # see gh-19242, gh-9155 # # Test writing timedelta to xls. df = DataFrame( np.random.randint(-10, 10, size=(20, 1)), columns=["A"], dtype=np.int64 ) expected = df.copy() df["new"] = df["A"].apply(lambda x: timedelta(seconds=x)) expected["new"] = expected["A"].apply( lambda x: timedelta(seconds=x).total_seconds() / float(86400) ) df.to_excel(path, "test1") reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=0) tm.assert_frame_equal(expected, recons) def test_to_excel_periodindex(self, tsframe, path): xp = tsframe.resample("M", kind="period").mean() xp.to_excel(path, "sht1") reader = ExcelFile(path) rs = pd.read_excel(reader, "sht1", index_col=0) tm.assert_frame_equal(xp, rs.to_period("M")) def test_to_excel_multiindex(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index frame.to_excel(path, "test1", header=False) frame.to_excel(path, "test1", columns=["A", "B"]) # round trip frame.to_excel(path, "test1", merge_cells=merge_cells) reader = ExcelFile(path) df = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(frame, df) # GH13511 def test_to_excel_multiindex_nan_label(self, merge_cells, path): df = pd.DataFrame( {"A": [None, 2, 3], "B": [10, 20, 30], "C": np.random.sample(3)} ) df = df.set_index(["A", "B"]) df.to_excel(path, merge_cells=merge_cells) df1 = pd.read_excel(path, index_col=[0, 1]) tm.assert_frame_equal(df, df1) # Test for Issue 11328. If column indices are integers, make # sure they are handled correctly for either setting of # merge_cells def test_to_excel_multiindex_cols(self, merge_cells, frame, path): arrays = np.arange(len(frame.index) * 2).reshape(2, -1) new_index = MultiIndex.from_arrays(arrays, names=["first", "second"]) frame.index = new_index new_cols_index = MultiIndex.from_tuples([(40, 1), (40, 2), (50, 1), (50, 2)]) frame.columns = new_cols_index header = [0, 1] if not merge_cells: header = 0 # round trip frame.to_excel(path, "test1", merge_cells=merge_cells) reader = ExcelFile(path) df = pd.read_excel(reader, "test1", header=header, index_col=[0, 1]) if not merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) def test_to_excel_multiindex_dates(self, merge_cells, tsframe, path): # try multiindex with dates new_index = [tsframe.index, np.arange(len(tsframe.index))] tsframe.index = MultiIndex.from_arrays(new_index) tsframe.index.names = ["time", "foo"] tsframe.to_excel(path, "test1", merge_cells=merge_cells) reader = ExcelFile(path) recons = pd.read_excel(reader, "test1", index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) assert recons.index.names == ("time", "foo") def test_to_excel_multiindex_no_write_index(self, path): # Test writing and re-reading a MI without the index. GH 5616. # Initial non-MI frame. frame1 = DataFrame({"a": [10, 20], "b": [30, 40], "c": [50, 60]}) # Add a MI. frame2 = frame1.copy() multi_index = MultiIndex.from_tuples([(70, 80), (90, 100)]) frame2.index = multi_index # Write out to Excel without the index. frame2.to_excel(path, "test1", index=False) # Read it back in. reader = ExcelFile(path) frame3 = pd.read_excel(reader, "test1") # Test that it is the same as the initial frame. tm.assert_frame_equal(frame1, frame3) def test_to_excel_float_format(self, path): df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) df.to_excel(path, "test1", float_format="%.2f") reader = ExcelFile(path) result = pd.read_excel(reader, "test1", index_col=0) expected = DataFrame( [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], index=["A", "B"], columns=["X", "Y", "Z"], ) tm.assert_frame_equal(result, expected) def test_to_excel_output_encoding(self, ext): # Avoid mixed inferred_type. df = DataFrame( [["\u0192", "\u0193", "\u0194"], ["\u0195", "\u0196", "\u0197"]], index=["A\u0192", "B"], columns=["X\u0193", "Y", "Z"], ) with tm.ensure_clean("__tmp_to_excel_float_format__." + ext) as filename: df.to_excel(filename, sheet_name="TestSheet", encoding="utf8") result = pd.read_excel(filename, "TestSheet", encoding="utf8", index_col=0) tm.assert_frame_equal(result, df) def test_to_excel_unicode_filename(self, ext, path): with tm.ensure_clean("\u0192u." + ext) as filename: try: f = open(filename, "wb") except UnicodeEncodeError: pytest.skip("No unicode file names on this system") else: f.close() df = DataFrame( [[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], index=["A", "B"], columns=["X", "Y", "Z"], ) df.to_excel(filename, "test1", float_format="%.2f") reader = ExcelFile(filename) result = pd.read_excel(reader, "test1", index_col=0) expected = DataFrame( [[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], index=["A", "B"], columns=["X", "Y", "Z"], ) tm.assert_frame_equal(result, expected) # FIXME: dont leave commented-out # def test_to_excel_header_styling_xls(self, engine, ext): # import StringIO # s = StringIO( # """Date,ticker,type,value # 2001-01-01,x,close,12.2 # 2001-01-01,x,open ,12.1 # 2001-01-01,y,close,12.2 # 2001-01-01,y,open ,12.1 # 2001-02-01,x,close,12.2 # 2001-02-01,x,open ,12.1 # 2001-02-01,y,close,12.2 # 2001-02-01,y,open ,12.1 # 2001-03-01,x,close,12.2 # 2001-03-01,x,open ,12.1 # 2001-03-01,y,close,12.2 # 2001-03-01,y,open ,12.1""") # df = read_csv(s, parse_dates=["Date"]) # pdf = df.pivot_table(values="value", rows=["ticker"], # cols=["Date", "type"]) # try: # import xlwt # import xlrd # except ImportError: # pytest.skip # filename = '__tmp_to_excel_header_styling_xls__.xls' # pdf.to_excel(filename, 'test1') # wbk = xlrd.open_workbook(filename, # formatting_info=True) # assert ["test1"] == wbk.sheet_names() # ws = wbk.sheet_by_name('test1') # assert [(0, 1, 5, 7), (0, 1, 3, 5), (0, 1, 1, 3)] == ws.merged_cells # for i in range(0, 2): # for j in range(0, 7): # xfx = ws.cell_xf_index(0, 0) # cell_xf = wbk.xf_list[xfx] # font = wbk.font_list # assert 1 == font[cell_xf.font_index].bold # assert 1 == cell_xf.border.top_line_style # assert 1 == cell_xf.border.right_line_style # assert 1 == cell_xf.border.bottom_line_style # assert 1 == cell_xf.border.left_line_style # assert 2 == cell_xf.alignment.hor_align # os.remove(filename) # def test_to_excel_header_styling_xlsx(self, engine, ext): # import StringIO # s = StringIO( # """Date,ticker,type,value # 2001-01-01,x,close,12.2 # 2001-01-01,x,open ,12.1 # 2001-01-01,y,close,12.2 # 2001-01-01,y,open ,12.1 # 2001-02-01,x,close,12.2 # 2001-02-01,x,open ,12.1 # 2001-02-01,y,close,12.2 # 2001-02-01,y,open ,12.1 # 2001-03-01,x,close,12.2 # 2001-03-01,x,open ,12.1 # 2001-03-01,y,close,12.2 # 2001-03-01,y,open ,12.1""") # df = read_csv(s, parse_dates=["Date"]) # pdf = df.pivot_table(values="value", rows=["ticker"], # cols=["Date", "type"]) # try: # import openpyxl # from openpyxl.cell import get_column_letter # except ImportError: # pytest.skip # if openpyxl.__version__ < '1.6.1': # pytest.skip # # test xlsx_styling # filename = '__tmp_to_excel_header_styling_xlsx__.xlsx' # pdf.to_excel(filename, 'test1') # wbk = openpyxl.load_workbook(filename) # assert ["test1"] == wbk.get_sheet_names() # ws = wbk.get_sheet_by_name('test1') # xlsaddrs = ["%s2" % chr(i) for i in range(ord('A'), ord('H'))] # xlsaddrs += ["A%s" % i for i in range(1, 6)] # xlsaddrs += ["B1", "D1", "F1"] # for xlsaddr in xlsaddrs: # cell = ws.cell(xlsaddr) # assert cell.style.font.bold # assert (openpyxl.style.Border.BORDER_THIN == # cell.style.borders.top.border_style) # assert (openpyxl.style.Border.BORDER_THIN == # cell.style.borders.right.border_style) # assert (openpyxl.style.Border.BORDER_THIN == # cell.style.borders.bottom.border_style) # assert (openpyxl.style.Border.BORDER_THIN == # cell.style.borders.left.border_style) # assert (openpyxl.style.Alignment.HORIZONTAL_CENTER == # cell.style.alignment.horizontal) # mergedcells_addrs = ["C1", "E1", "G1"] # for maddr in mergedcells_addrs: # assert ws.cell(maddr).merged # os.remove(filename) @pytest.mark.parametrize("use_headers", [True, False]) @pytest.mark.parametrize("r_idx_nlevels", [1, 2, 3]) @pytest.mark.parametrize("c_idx_nlevels", [1, 2, 3]) def test_excel_010_hemstring( self, merge_cells, c_idx_nlevels, r_idx_nlevels, use_headers, path ): def roundtrip(data, header=True, parser_hdr=0, index=True): data.to_excel(path, header=header, merge_cells=merge_cells, index=index) xf = ExcelFile(path) return pd.read_excel(xf, xf.sheet_names[0], header=parser_hdr) # Basic test. parser_header = 0 if use_headers else None res = roundtrip(DataFrame([0]), use_headers, parser_header) assert res.shape == (1, 2) assert res.iloc[0, 0] is not np.nan # More complex tests with multi-index. nrows = 5 ncols = 3 # ensure limited functionality in 0.10 # override of gh-2370 until sorted out in 0.11 df = tm.makeCustomDataframe( nrows, ncols, r_idx_nlevels=r_idx_nlevels, c_idx_nlevels=c_idx_nlevels ) # This if will be removed once multi-column Excel writing # is implemented. For now fixing gh-9794. if c_idx_nlevels > 1: with pytest.raises(NotImplementedError): roundtrip(df, use_headers, index=False) else: res = roundtrip(df, use_headers) if use_headers: assert res.shape == (nrows, ncols + r_idx_nlevels) else: # First row taken as columns. assert res.shape == (nrows - 1, ncols + r_idx_nlevels) # No NaNs. for r in range(len(res.index)): for c in range(len(res.columns)): assert res.iloc[r, c] is not np.nan def test_duplicated_columns(self, path): # see gh-5235 df = DataFrame([[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B"]) df.to_excel(path, "test1") expected = DataFrame( [[1, 2, 3], [1, 2, 3], [1, 2, 3]], columns=["A", "B", "B.1"] ) # By default, we mangle. result = pd.read_excel(path, "test1", index_col=0) tm.assert_frame_equal(result, expected) # Explicitly, we pass in the parameter. result = pd.read_excel(path, "test1", index_col=0, mangle_dupe_cols=True) tm.assert_frame_equal(result, expected) # see gh-11007, gh-10970 df = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A", "B"]) df.to_excel(path, "test1") result = pd.read_excel(path, "test1", index_col=0) expected = DataFrame( [[1, 2, 3, 4], [5, 6, 7, 8]], columns=["A", "B", "A.1", "B.1"] ) tm.assert_frame_equal(result, expected) # see gh-10982 df.to_excel(path, "test1", index=False, header=False) result = pd.read_excel(path, "test1", header=None) expected = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]]) tm.assert_frame_equal(result, expected) msg = "Setting mangle_dupe_cols=False is not supported yet" with pytest.raises(ValueError, match=msg): pd.read_excel(path, "test1", header=None, mangle_dupe_cols=False) def test_swapped_columns(self, path): # Test for issue #5427. write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) write_frame.to_excel(path, "test1", columns=["B", "A"]) read_frame = pd.read_excel(path, "test1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) tm.assert_series_equal(write_frame["B"], read_frame["B"]) def test_invalid_columns(self, path): # see gh-10982 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2]}) with pytest.raises(KeyError, match="Not all names specified"): write_frame.to_excel(path, "test1", columns=["B", "C"]) with pytest.raises( KeyError, match="'passes columns are not ALL present dataframe'" ): write_frame.to_excel(path, "test1", columns=["C", "D"]) @pytest.mark.parametrize( "to_excel_index,read_excel_index_col", [ (True, 0), # Include index in write to file (False, None), # Dont include index in write to file ], ) def test_write_subset_columns(self, path, to_excel_index, read_excel_index_col): # GH 31677 write_frame = DataFrame({"A": [1, 1, 1], "B": [2, 2, 2], "C": [3, 3, 3]}) write_frame.to_excel( path, "col_subset_bug", columns=["A", "B"], index=to_excel_index ) expected = write_frame[["A", "B"]] read_frame = pd.read_excel( path, "col_subset_bug", index_col=read_excel_index_col ) tm.assert_frame_equal(expected, read_frame) def test_comment_arg(self, path): # see gh-18735 # # Test the comment argument functionality to pd.read_excel. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Read file without comment arg. result1 = pd.read_excel(path, "test_c", index_col=0) result1.iloc[1, 0] = None result1.iloc[1, 1] = None result1.iloc[2, 1] = None result2 = pd.read_excel(path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result1, result2) def test_comment_default(self, path): # Re issue #18735 # Test the comment argument default to pd.read_excel # Create file to read in df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Read file with default and explicit comment=None result1 = pd.read_excel(path, "test_c") result2 = pd.read_excel(path, "test_c", comment=None) tm.assert_frame_equal(result1, result2) def test_comment_used(self, path): # see gh-18735 # # Test the comment argument is working as expected when used. # Create file to read in. df = DataFrame({"A": ["one", "#one", "one"], "B": ["two", "two", "#two"]}) df.to_excel(path, "test_c") # Test read_frame_comment against manually produced expected output. expected = DataFrame({"A": ["one", None, "one"], "B": ["two", None, None]}) result = pd.read_excel(path, "test_c", comment="#", index_col=0) tm.assert_frame_equal(result, expected) def test_comment_empty_line(self, path): # Re issue #18735 # Test that pd.read_excel ignores commented lines at the end of file df = DataFrame({"a": ["1", "#2"], "b": ["2", "3"]}) df.to_excel(path, index=False) # Test that all-comment lines at EoF are ignored expected = DataFrame({"a": [1], "b": [2]}) result = pd.read_excel(path, comment="#") tm.assert_frame_equal(result, expected) def test_datetimes(self, path): # Test writing and reading datetimes. For issue #9139. (xref #9185) datetimes = [ datetime(2013, 1, 13, 1, 2, 3), datetime(2013, 1, 13, 2, 45, 56), datetime(2013, 1, 13, 4, 29, 49), datetime(2013, 1, 13, 6, 13, 42), datetime(2013, 1, 13, 7, 57, 35), datetime(2013, 1, 13, 9, 41, 28), datetime(2013, 1, 13, 11, 25, 21), datetime(2013, 1, 13, 13, 9, 14), datetime(2013, 1, 13, 14, 53, 7), datetime(2013, 1, 13, 16, 37, 0), datetime(2013, 1, 13, 18, 20, 52), ] write_frame = DataFrame({"A": datetimes}) write_frame.to_excel(path, "Sheet1") read_frame = pd.read_excel(path, "Sheet1", header=0) tm.assert_series_equal(write_frame["A"], read_frame["A"]) def test_bytes_io(self, engine): # see gh-7074 bio = BytesIO() df = DataFrame(np.random.randn(10, 2)) # Pass engine explicitly, as there is no file path to infer from. writer = ExcelWriter(bio, engine=engine) df.to_excel(writer) writer.save() bio.seek(0) reread_df = pd.read_excel(bio, index_col=0) tm.assert_frame_equal(df, reread_df) def test_write_lists_dict(self, path): # see gh-8188. df = DataFrame( { "mixed": ["a", ["b", "c"], {"d": "e", "f": 2}], "numeric": [1, 2, 3.0], "str": ["apple", "banana", "cherry"], } ) df.to_excel(path, "Sheet1") read = pd.read_excel(path, "Sheet1", header=0, index_col=0) expected = df.copy() expected.mixed = expected.mixed.apply(str) expected.numeric = expected.numeric.astype("int64") tm.assert_frame_equal(read, expected) def test_true_and_false_value_options(self, path): # see gh-13347 df = pd.DataFrame([["foo", "bar"]], columns=["col1", "col2"]) expected = df.replace({"foo": True, "bar": False}) df.to_excel(path) read_frame = pd.read_excel( path, true_values=["foo"], false_values=["bar"], index_col=0 ) tm.assert_frame_equal(read_frame, expected) def test_freeze_panes(self, path): # see gh-15160 expected = DataFrame([[1, 2], [3, 4]], columns=["col1", "col2"]) expected.to_excel(path, "Sheet1", freeze_panes=(1, 1)) result = pd.read_excel(path, index_col=0) tm.assert_frame_equal(result, expected) def test_path_path_lib(self, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_path_local_path(self, engine, ext): df = tm.makeDataFrame() writer = partial(df.to_excel, engine=engine) reader = partial(pd.read_excel, index_col=0) result = tm.round_trip_pathlib(writer, reader, path="foo.{ext}".format(ext=ext)) tm.assert_frame_equal(result, df) def test_merged_cell_custom_objects(self, merge_cells, path): # see GH-27006 mi = MultiIndex.from_tuples( [ (pd.Period("2018"), pd.Period("2018Q1")), (pd.Period("2018"), pd.Period("2018Q2")), ] ) expected = DataFrame(np.ones((2, 2)), columns=mi) expected.to_excel(path) result = pd.read_excel(path, header=[0, 1], index_col=0, convert_float=False) # need to convert PeriodIndexes to standard Indexes for assert equal expected.columns.set_levels( [[str(i) for i in mi.levels[0]], [str(i) for i in mi.levels[1]]], level=[0, 1], inplace=True, ) expected.index = expected.index.astype(np.float64) tm.assert_frame_equal(expected, result) @pytest.mark.parametrize("dtype", [None, object]) def test_raise_when_saving_timezones(self, dtype, tz_aware_fixture, path): # GH 27008, GH 7056 tz = tz_aware_fixture data = pd.Timestamp("2019", tz=tz) df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(path) data = data.to_pydatetime() df = DataFrame([data], dtype=dtype) with pytest.raises(ValueError, match="Excel does not support"): df.to_excel(path) class TestExcelWriterEngineTests: @pytest.mark.parametrize( "klass,ext", [ pytest.param(_XlsxWriter, ".xlsx", marks=td.skip_if_no("xlsxwriter")), pytest.param(_OpenpyxlWriter, ".xlsx", marks=td.skip_if_no("openpyxl")), pytest.param(_XlwtWriter, ".xls", marks=td.skip_if_no("xlwt")), ], ) def test_ExcelWriter_dispatch(self, klass, ext): with tm.ensure_clean(ext) as path: writer = ExcelWriter(path) if ext == ".xlsx" and td.safe_import("xlsxwriter"): # xlsxwriter has preference over openpyxl if both installed assert isinstance(writer, _XlsxWriter) else: assert isinstance(writer, klass) def test_ExcelWriter_dispatch_raises(self): with pytest.raises(ValueError, match="No engine"): ExcelWriter("nothing") def test_register_writer(self): # some awkward mocking to test out dispatch and such actually works called_save = [] called_write_cells = [] class DummyClass(ExcelWriter): called_save = False called_write_cells = False supported_extensions = ["xlsx", "xls"] engine = "dummy" def save(self): called_save.append(True) def write_cells(self, *args, **kwargs): called_write_cells.append(True) def check_called(func): func() assert len(called_save) >= 1 assert len(called_write_cells) >= 1 del called_save[:] del called_write_cells[:] with pd.option_context("io.excel.xlsx.writer", "dummy"): register_writer(DummyClass) writer = ExcelWriter("something.xlsx") assert isinstance(writer, DummyClass) df = tm.makeCustomDataframe(1, 1) check_called(lambda: df.to_excel("something.xlsx")) check_called(lambda: df.to_excel("something.xls", engine="dummy")) @td.skip_if_no("xlrd") @td.skip_if_no("openpyxl") class TestFSPath: def test_excelfile_fspath(self): with tm.ensure_clean("foo.xlsx") as path: df = DataFrame({"A": [1, 2]}) df.to_excel(path) xl = ExcelFile(path) result = os.fspath(xl) assert result == path def test_excelwriter_fspath(self): with tm.ensure_clean("foo.xlsx") as path: writer = ExcelWriter(path) assert os.fspath(writer) == str(path)