pandas/io/formats/excel.py

"""Utilities for conversion to writer-agnostic Excel representation
"""

import itertools
import re
import warnings

import numpy as np

from pandas.compat import reduce

from pandas.core.dtypes import missing
from pandas.core.dtypes.common import is_float, is_scalar
from pandas.core.dtypes.generic import ABCMultiIndex, ABCPeriodIndex

from pandas import Index
import pandas.core.common as com

from pandas.io.formats.css import CSSResolver, CSSWarning
from pandas.io.formats.format import get_level_lengths
from pandas.io.formats.printing import pprint_thing


class ExcelCell(object):
    __fields__ = ('row', 'col', 'val', 'style', 'mergestart', 'mergeend')
    __slots__ = __fields__

    def __init__(self, row, col, val, style=None, mergestart=None,
                 mergeend=None):
        self.row = row
        self.col = col
        self.val = val
        self.style = style
        self.mergestart = mergestart
        self.mergeend = mergeend


class CSSToExcelConverter(object):
    """A callable for converting CSS declarations to ExcelWriter styles

    Supports parts of CSS 2.2, with minimal CSS 3.0 support (e.g. text-shadow),
    focusing on font styling, backgrounds, borders and alignment.

    Operates by first computing CSS styles in a fairly generic
    way (see :meth:`compute_css`) then determining Excel style
    properties from CSS properties (see :meth:`build_xlstyle`).

    Parameters
    ----------
    inherited : str, optional
        CSS declarations understood to be the containing scope for the
        CSS processed by :meth:`__call__`.
    """
    # NB: Most of the methods here could be classmethods, as only __init__
    #     and __call__ make use of instance attributes.  We leave them as
    #     instancemethods so that users can easily experiment with extensions
    #     without monkey-patching.

    def __init__(self, inherited=None):
        if inherited is not None:
            inherited = self.compute_css(inherited,
                                         self.compute_css.INITIAL_STYLE)

        self.inherited = inherited

    compute_css = CSSResolver()

    def __call__(self, declarations_str):
        """Convert CSS declarations to ExcelWriter style

        Parameters
        ----------
        declarations_str : str
            List of CSS declarations.
            e.g. "font-weight: bold; background: blue"

        Returns
        -------
        xlstyle : dict
            A style as interpreted by ExcelWriter when found in
            ExcelCell.style.
        """
        # TODO: memoize?
        properties = self.compute_css(declarations_str, self.inherited)
        return self.build_xlstyle(properties)

    def build_xlstyle(self, props):
        out = {
            'alignment': self.build_alignment(props),
            'border': self.build_border(props),
            'fill': self.build_fill(props),
            'font': self.build_font(props),
            'number_format': self.build_number_format(props),
        }
        # TODO: handle cell width and height: needs support in pandas.io.excel

        def remove_none(d):
            """Remove key where value is None, through nested dicts"""
            for k, v in list(d.items()):
                if v is None:
                    del d[k]
                elif isinstance(v, dict):
                    remove_none(v)
                    if not v:
                        del d[k]

        remove_none(out)
        return out

    VERTICAL_MAP = {
        'top': 'top',
        'text-top': 'top',
        'middle': 'center',
        'baseline': 'bottom',
        'bottom': 'bottom',
        'text-bottom': 'bottom',
        # OpenXML also has 'justify', 'distributed'
    }

    def build_alignment(self, props):
        # TODO: text-indent, padding-left -> alignment.indent
        return {'horizontal': props.get('text-align'),
                'vertical': self.VERTICAL_MAP.get(props.get('vertical-align')),
                'wrap_text': (None if props.get('white-space') is None else
                              props['white-space'] not in
                              ('nowrap', 'pre', 'pre-line'))
                }

    def build_border(self, props):
        return {side: {
            'style': self._border_style(props.get('border-{side}-style'
                                        .format(side=side)),
                                        props.get('border-{side}-width'
                                        .format(side=side))),
            'color': self.color_to_excel(
                props.get('border-{side}-color'.format(side=side))),
        } for side in ['top', 'right', 'bottom', 'left']}

    def _border_style(self, style, width):
        # convert styles and widths to openxml, one of:
        #       'dashDot'
        #       'dashDotDot'
        #       'dashed'
        #       'dotted'
        #       'double'
        #       'hair'
        #       'medium'
        #       'mediumDashDot'
        #       'mediumDashDotDot'
        #       'mediumDashed'
        #       'slantDashDot'
        #       'thick'
        #       'thin'
        if width is None and style is None:
            return None
        if style == 'none' or style == 'hidden':
            return None

        if width is None:
            width = '2pt'
        width = float(width[:-2])
        if width < 1e-5:
            return None
        elif width < 1.3:
            width_name = 'thin'
        elif width < 2.8:
            width_name = 'medium'
        else:
            width_name = 'thick'

        if style in (None, 'groove', 'ridge', 'inset', 'outset'):
            # not handled
            style = 'solid'

        if style == 'double':
            return 'double'
        if style == 'solid':
            return width_name
        if style == 'dotted':
            if width_name in ('hair', 'thin'):
                return 'dotted'
            return 'mediumDashDotDot'
        if style == 'dashed':
            if width_name in ('hair', 'thin'):
                return 'dashed'
            return 'mediumDashed'

    def build_fill(self, props):
        # TODO: perhaps allow for special properties
        #       -excel-pattern-bgcolor and -excel-pattern-type
        fill_color = props.get('background-color')
        if fill_color not in (None, 'transparent', 'none'):
            return {
                'fgColor': self.color_to_excel(fill_color),
                'patternType': 'solid',
            }

    BOLD_MAP = {'bold': True, 'bolder': True, '600': True, '700': True,
                '800': True, '900': True,
                'normal': False, 'lighter': False, '100': False, '200': False,
                '300': False, '400': False, '500': False}
    ITALIC_MAP = {'normal': False, 'italic': True, 'oblique': True}

    def build_font(self, props):
        size = props.get('font-size')
        if size is not None:
            assert size.endswith('pt')
            size = float(size[:-2])

        font_names_tmp = re.findall(r'''(?x)
            (
            "(?:[^"]|\\")+"
            |
            '(?:[^']|\\')+'
            |
            [^'",]+
            )(?=,|\s*$)
        ''', props.get('font-family', ''))
        font_names = []
        for name in font_names_tmp:
            if name[:1] == '"':
                name = name[1:-1].replace('\\"', '"')
            elif name[:1] == '\'':
                name = name[1:-1].replace('\\\'', '\'')
            else:
                name = name.strip()
            if name:
                font_names.append(name)

        family = None
        for name in font_names:
            if name == 'serif':
                family = 1  # roman
                break
            elif name == 'sans-serif':
                family = 2  # swiss
                break
            elif name == 'cursive':
                family = 4  # script
                break
            elif name == 'fantasy':
                family = 5  # decorative
                break

        decoration = props.get('text-decoration')
        if decoration is not None:
            decoration = decoration.split()
        else:
            decoration = ()

        return {
            'name': font_names[0] if font_names else None,
            'family': family,
            'size': size,
            'bold': self.BOLD_MAP.get(props.get('font-weight')),
            'italic': self.ITALIC_MAP.get(props.get('font-style')),
            'underline': ('single' if
                          'underline' in decoration
                          else None),
            'strike': ('line-through' in decoration) or None,
            'color': self.color_to_excel(props.get('color')),
            # shadow if nonzero digit before shadow color
            'shadow': (bool(re.search('^[^#(]*[1-9]',
                                      props['text-shadow']))
                       if 'text-shadow' in props else None),
            # 'vertAlign':,
            # 'charset': ,
            # 'scheme': ,
            # 'outline': ,
            # 'condense': ,
        }

    NAMED_COLORS = {
        'maroon': '800000',
        'brown': 'A52A2A',
        'red': 'FF0000',
        'pink': 'FFC0CB',
        'orange': 'FFA500',
        'yellow': 'FFFF00',
        'olive': '808000',
        'green': '008000',
        'purple': '800080',
        'fuchsia': 'FF00FF',
        'lime': '00FF00',
        'teal': '008080',
        'aqua': '00FFFF',
        'blue': '0000FF',
        'navy': '000080',
        'black': '000000',
        'gray': '808080',
        'grey': '808080',
        'silver': 'C0C0C0',
        'white': 'FFFFFF',
    }

    def color_to_excel(self, val):
        if val is None:
            return None
        if val.startswith('#') and len(val) == 7:
            return val[1:].upper()
        if val.startswith('#') and len(val) == 4:
            return (val[1] * 2 + val[2] * 2 + val[3] * 2).upper()
        try:
            return self.NAMED_COLORS[val]
        except KeyError:
            warnings.warn('Unhandled color format: {val!r}'.format(val=val),
                          CSSWarning)

    def build_number_format(self, props):
        return {'format_code': props.get('number-format')}


class ExcelFormatter(object):
    """
    Class for formatting a DataFrame to a list of ExcelCells,

    Parameters
    ----------
    df : DataFrame or Styler
    na_rep: na representation
    float_format : string, default None
            Format string for floating point numbers
    cols : sequence, optional
        Columns to write
    header : boolean or list of string, default True
        Write out column names. If a list of string is given it is
        assumed to be aliases for the column names
    index : boolean, default True
        output row names (index)
    index_label : string or sequence, default None
            Column label for index column(s) if desired. If None is given, and
            `header` and `index` are True, then the index names are used. A
            sequence should be given if the DataFrame uses MultiIndex.
    merge_cells : boolean, default False
            Format MultiIndex and Hierarchical Rows as merged cells.
    inf_rep : string, default `'inf'`
        representation for np.inf values (which aren't representable in Excel)
        A `'-'` sign will be added in front of -inf.
    style_converter : callable, optional
        This translates Styler styles (CSS) into ExcelWriter styles.
        Defaults to ``CSSToExcelConverter()``.
        It should have signature css_declarations string -> excel style.
        This is only called for body cells.
    """

    def __init__(self, df, na_rep='', float_format=None, cols=None,
                 header=True, index=True, index_label=None, merge_cells=False,
                 inf_rep='inf', style_converter=None):
        self.rowcounter = 0
        self.na_rep = na_rep
        if hasattr(df, 'render'):
            self.styler = df
            df = df.data
            if style_converter is None:
                style_converter = CSSToExcelConverter()
            self.style_converter = style_converter
        else:
            self.styler = None
        self.df = df
        if cols is not None:

            # all missing, raise
            if not len(Index(cols) & df.columns):
                raise KeyError(
                    "passes columns are not ALL present dataframe")

            # deprecatedin gh-17295
            # 1 missing is ok (for now)
            if len(Index(cols) & df.columns) != len(cols):
                warnings.warn(
                    "Not all names specified in 'columns' are found; "
                    "this will raise a KeyError in the future",
                    FutureWarning)

            self.df = df.reindex(columns=cols)
        self.columns = self.df.columns
        self.float_format = float_format
        self.index = index
        self.index_label = index_label
        self.header = header
        self.merge_cells = merge_cells
        self.inf_rep = inf_rep

    @property
    def header_style(self):
        return {"font": {"bold": True},
                "borders": {"top": "thin",
                            "right": "thin",
                            "bottom": "thin",
                            "left": "thin"},
                "alignment": {"horizontal": "center",
                              "vertical": "top"}}

    def _format_value(self, val):
        if is_scalar(val) and missing.isna(val):
            val = self.na_rep
        elif is_float(val):
            if missing.isposinf_scalar(val):
                val = self.inf_rep
            elif missing.isneginf_scalar(val):
                val = '-{inf}'.format(inf=self.inf_rep)
            elif self.float_format is not None:
                val = float(self.float_format % val)
        return val

    def _format_header_mi(self):
        if self.columns.nlevels > 1:
            if not self.index:
                raise NotImplementedError("Writing to Excel with MultiIndex"
                                          " columns and no index "
                                          "('index'=False) is not yet "
                                          "implemented.")

        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if not (has_aliases or self.header):
            return

        columns = self.columns
        level_strs = columns.format(sparsify=self.merge_cells, adjoin=False,
                                    names=False)
        level_lengths = get_level_lengths(level_strs)
        coloffset = 0
        lnum = 0

        if self.index and isinstance(self.df.index, ABCMultiIndex):
            coloffset = len(self.df.index[0]) - 1

        if self.merge_cells:
            # Format multi-index as a merged cells.
            for lnum in range(len(level_lengths)):
                name = columns.names[lnum]
                yield ExcelCell(lnum, coloffset, name, self.header_style)

            for lnum, (spans, levels, level_codes) in enumerate(zip(
                    level_lengths, columns.levels, columns.codes)):
                values = levels.take(level_codes)
                for i in spans:
                    if spans[i] > 1:
                        yield ExcelCell(lnum, coloffset + i + 1, values[i],
                                        self.header_style, lnum,
                                        coloffset + i + spans[i])
                    else:
                        yield ExcelCell(lnum, coloffset + i + 1, values[i],
                                        self.header_style)
        else:
            # Format in legacy format with dots to indicate levels.
            for i, values in enumerate(zip(*level_strs)):
                v = ".".join(map(pprint_thing, values))
                yield ExcelCell(lnum, coloffset + i + 1, v, self.header_style)

        self.rowcounter = lnum

    def _format_header_regular(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            coloffset = 0

            if self.index:
                coloffset = 1
                if isinstance(self.df.index, ABCMultiIndex):
                    coloffset = len(self.df.index[0])

            colnames = self.columns
            if has_aliases:
                if len(self.header) != len(self.columns):
                    raise ValueError('Writing {cols} cols but got {alias} '
                                     'aliases'.format(cols=len(self.columns),
                                                      alias=len(self.header)))
                else:
                    colnames = self.header

            for colindex, colname in enumerate(colnames):
                yield ExcelCell(self.rowcounter, colindex + coloffset, colname,
                                self.header_style)

    def _format_header(self):
        if isinstance(self.columns, ABCMultiIndex):
            gen = self._format_header_mi()
        else:
            gen = self._format_header_regular()

        gen2 = ()
        if self.df.index.names:
            row = [x if x is not None else ''
                   for x in self.df.index.names] + [''] * len(self.columns)
            if reduce(lambda x, y: x and y, map(lambda x: x != '', row)):
                gen2 = (ExcelCell(self.rowcounter, colindex, val,
                                  self.header_style)
                        for colindex, val in enumerate(row))
                self.rowcounter += 1
        return itertools.chain(gen, gen2)

    def _format_body(self):

        if isinstance(self.df.index, ABCMultiIndex):
            return self._format_hierarchical_rows()
        else:
            return self._format_regular_rows()

    def _format_regular_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            self.rowcounter += 1

        # output index and index_label?
        if self.index:
            # check aliases
            # if list only take first as this is not a MultiIndex
            if (self.index_label and
                    isinstance(self.index_label, (list, tuple, np.ndarray,
                                                  Index))):
                index_label = self.index_label[0]
            # if string good to go
            elif self.index_label and isinstance(self.index_label, str):
                index_label = self.index_label
            else:
                index_label = self.df.index.names[0]

            if isinstance(self.columns, ABCMultiIndex):
                self.rowcounter += 1

            if index_label and self.header is not False:
                yield ExcelCell(self.rowcounter - 1, 0, index_label,
                                self.header_style)

            # write index_values
            index_values = self.df.index
            if isinstance(self.df.index, ABCPeriodIndex):
                index_values = self.df.index.to_timestamp()

            for idx, idxval in enumerate(index_values):
                yield ExcelCell(self.rowcounter + idx, 0, idxval,
                                self.header_style)

            coloffset = 1
        else:
            coloffset = 0

        for cell in self._generate_body(coloffset):
            yield cell

    def _format_hierarchical_rows(self):
        has_aliases = isinstance(self.header, (tuple, list, np.ndarray, Index))
        if has_aliases or self.header:
            self.rowcounter += 1

        gcolidx = 0

        if self.index:
            index_labels = self.df.index.names
            # check for aliases
            if (self.index_label and
                    isinstance(self.index_label, (list, tuple, np.ndarray,
                                                  Index))):
                index_labels = self.index_label

            # MultiIndex columns require an extra row
            # with index names (blank if None) for
            # unambigous round-trip, unless not merging,
            # in which case the names all go on one row Issue #11328
            if isinstance(self.columns, ABCMultiIndex) and self.merge_cells:
                self.rowcounter += 1

            # if index labels are not empty go ahead and dump
            if com._any_not_none(*index_labels) and self.header is not False:

                for cidx, name in enumerate(index_labels):
                    yield ExcelCell(self.rowcounter - 1, cidx, name,
                                    self.header_style)

            if self.merge_cells:
                # Format hierarchical rows as merged cells.
                level_strs = self.df.index.format(sparsify=True, adjoin=False,
                                                  names=False)
                level_lengths = get_level_lengths(level_strs)

                for spans, levels, level_codes in zip(level_lengths,
                                                      self.df.index.levels,
                                                      self.df.index.codes):

                    values = levels.take(level_codes,
                                         allow_fill=levels._can_hold_na,
                                         fill_value=True)

                    for i in spans:
                        if spans[i] > 1:
                            yield ExcelCell(self.rowcounter + i, gcolidx,
                                            values[i], self.header_style,
                                            self.rowcounter + i + spans[i] - 1,
                                            gcolidx)
                        else:
                            yield ExcelCell(self.rowcounter + i, gcolidx,
                                            values[i], self.header_style)
                    gcolidx += 1

            else:
                # Format hierarchical rows with non-merged values.
                for indexcolvals in zip(*self.df.index):
                    for idx, indexcolval in enumerate(indexcolvals):
                        yield ExcelCell(self.rowcounter + idx, gcolidx,
                                        indexcolval, self.header_style)
                    gcolidx += 1

        for cell in self._generate_body(gcolidx):
            yield cell

    def _generate_body(self, coloffset):
        if self.styler is None:
            styles = None
        else:
            styles = self.styler._compute().ctx
            if not styles:
                styles = None
        xlstyle = None

        # Write the body of the frame data series by series.
        for colidx in range(len(self.columns)):
            series = self.df.iloc[:, colidx]
            for i, val in enumerate(series):
                if styles is not None:
                    xlstyle = self.style_converter(';'.join(styles[i, colidx]))
                yield ExcelCell(self.rowcounter + i, colidx + coloffset, val,
                                xlstyle)

    def get_formatted_cells(self):
        for cell in itertools.chain(self._format_header(),
                                    self._format_body()):
            cell.val = self._format_value(cell.val)
            yield cell

    def write(self, writer, sheet_name='Sheet1', startrow=0,
              startcol=0, freeze_panes=None, engine=None):
        """
        writer : string or ExcelWriter object
            File path or existing ExcelWriter
        sheet_name : string, default 'Sheet1'
            Name of sheet which will contain DataFrame
        startrow :
            upper left cell row to dump data frame
        startcol :
            upper left cell column to dump data frame
        freeze_panes : tuple of integer (length 2), default None
            Specifies the one-based bottommost row and rightmost column that
            is to be frozen
        engine : string, default None
            write engine to use if writer is a path - you can also set this
            via the options ``io.excel.xlsx.writer``, ``io.excel.xls.writer``,
            and ``io.excel.xlsm.writer``.
        """
        from pandas.io.excel import ExcelWriter
        from pandas.io.common import _stringify_path

        if isinstance(writer, ExcelWriter):
            need_save = False
        else:
            writer = ExcelWriter(_stringify_path(writer), engine=engine)
            need_save = True

        formatted_cells = self.get_formatted_cells()
        writer.write_cells(formatted_cells, sheet_name,
                           startrow=startrow, startcol=startcol,
                           freeze_panes=freeze_panes)
        if need_save:
            writer.save()