diff --git a/asv_bench/benchmarks/parser_vb.py b/asv_bench/benchmarks/parser_vb.py index 18cd4de6cc9c5..04f25034638cd 100644 --- a/asv_bench/benchmarks/parser_vb.py +++ b/asv_bench/benchmarks/parser_vb.py @@ -23,18 +23,42 @@ class read_csv_default_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_default_converter(self): read_csv(StringIO(self.data), sep=',', header=None, float_precision=None) +class read_csv_default_converter_with_decimal(object): + goal_time = 0.2 + + def setup(self): + self.data = """0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n +0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n +0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n +0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n +0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n""" + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',') + + class read_csv_precise_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_precise_converter(self): @@ -45,7 +69,11 @@ class read_csv_roundtrip_converter(object): goal_time = 0.2 def setup(self): - self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = """0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n +0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n +0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n +0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n +0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n""" self.data = (self.data * 200) def time_read_csv_roundtrip_converter(self): @@ -109,4 +137,28 @@ def setup(self): self.data = (self.data * 200) def time_read_table_multiple_date_baseline(self): - read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) \ No newline at end of file + read_table(StringIO(self.data), sep=',', header=None, parse_dates=[1]) + + +class read_csv_default_converter_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0.1213700904466425978256438611,0.0525708283766902484401839501,0.4174092731488769913994474336\n 0.4096341697147408700274695547,0.1587830198973579909349496119,0.1292545832485494372576795285\n 0.8323255650024565799327547210,0.9694902427379478160318626578,0.6295047811546814475747169126\n 0.4679375305798131323697930383,0.2963942381834381301075609371,0.5268936082160610157032465394\n 0.6685382761849776311890991564,0.6721207066140679753374342908,0.6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter(self): + read_csv(StringIO(self.data), sep=',', header=None, + float_precision=None, engine='python') + + +class read_csv_default_converter_with_decimal_python_engine(object): + goal_time = 0.2 + + def setup(self): + self.data = '0,1213700904466425978256438611;0,0525708283766902484401839501;0,4174092731488769913994474336\n 0,4096341697147408700274695547;0,1587830198973579909349496119;0,1292545832485494372576795285\n 0,8323255650024565799327547210;0,9694902427379478160318626578;0,6295047811546814475747169126\n 0,4679375305798131323697930383;0,2963942381834381301075609371;0,5268936082160610157032465394\n 0,6685382761849776311890991564;0,6721207066140679753374342908;0,6519975277021627935170045020\n ' + self.data = (self.data * 200) + + def time_read_csv_default_converter_with_decimal(self): + read_csv(StringIO(self.data), sep=';', header=None, + float_precision=None, decimal=',', engine='python') diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 4b3c96da10efd..cbf95a10447d5 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -47,6 +47,8 @@ Other enhancements pd.Timestamp(year=2012, month=1, day=1, hour=8, minute=30) +- The ``pd.read_csv()`` with ``engine='python'`` has gained support for the ``decimal`` option (:issue:`12933`) + .. _whatsnew_0182.api: API changes diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 25639984e4ccf..07b92fd6bfd28 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -348,6 +348,7 @@ def _read(filepath_or_buffer, kwds): 'keep_default_na': True, 'thousands': None, 'comment': None, + 'decimal': b'.', # 'engine': 'c', 'parse_dates': False, @@ -383,7 +384,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines': True, 'warn_bad_lines': True, 'dtype': None, - 'decimal': b'.', 'float_precision': None } @@ -404,7 +404,6 @@ def _read(filepath_or_buffer, kwds): 'error_bad_lines', 'warn_bad_lines', 'dtype', - 'decimal', 'float_precision', ]) @@ -1582,6 +1581,7 @@ def __init__(self, f, **kwds): self.converters = kwds['converters'] self.thousands = kwds['thousands'] + self.decimal = kwds['decimal'] self.comment = kwds['comment'] self._comment_lines = [] @@ -1639,6 +1639,15 @@ def __init__(self, f, **kwds): else: self._no_thousands_columns = None + if len(self.decimal) != 1: + raise ValueError('Only length-1 decimal markers supported') + + if self.thousands is None: + self.nonnum = re.compile('[^-^0-9^%s]+' % self.decimal) + else: + self.nonnum = re.compile('[^-^0-9^%s^%s]+' % (self.thousands, + self.decimal)) + def _set_no_thousands_columns(self): # Create a set of column ids that are not to be stripped of thousands # operators. @@ -2050,22 +2059,35 @@ def _check_empty(self, lines): def _check_thousands(self, lines): if self.thousands is None: return lines - nonnum = re.compile('[^-^0-9^%s^.]+' % self.thousands) + + return self._search_replace_num_columns(lines=lines, + search=self.thousands, + replace='') + + def _search_replace_num_columns(self, lines, search, replace): ret = [] for l in lines: rl = [] for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or - self.thousands not in x or + search not in x or (self._no_thousands_columns and i in self._no_thousands_columns) or - nonnum.search(x.strip())): + self.nonnum.search(x.strip())): rl.append(x) else: - rl.append(x.replace(self.thousands, '')) + rl.append(x.replace(search, replace)) ret.append(rl) return ret + def _check_decimal(self, lines): + if self.decimal == _parser_defaults['decimal']: + return lines + + return self._search_replace_num_columns(lines=lines, + search=self.decimal, + replace='.') + def _clear_buffer(self): self.buf = [] @@ -2249,7 +2271,8 @@ def _get_lines(self, rows=None): lines = self._check_comments(lines) if self.skip_blank_lines: lines = self._check_empty(lines) - return self._check_thousands(lines) + lines = self._check_thousands(lines) + return self._check_decimal(lines) def _make_date_converter(date_parser=None, dayfirst=False, diff --git a/pandas/io/tests/parser/c_parser_only.py b/pandas/io/tests/parser/c_parser_only.py index 24c670abe8158..8e44802adf744 100644 --- a/pandas/io/tests/parser/c_parser_only.py +++ b/pandas/io/tests/parser/c_parser_only.py @@ -353,17 +353,6 @@ def test_disable_bool_parsing(self): result = self.read_csv(StringIO(data), dtype=object, na_filter=False) self.assertEqual(result['B'][2], '') - def test_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - - df2 = self.read_csv(StringIO(data), sep=';', decimal=',') - self.assertEqual(df2['Number1'].dtype, float) - self.assertEqual(df2['Number2'].dtype, float) - self.assertEqual(df2['Number3'].dtype, float) - def test_custom_lineterminator(self): data = 'a,b,c~1,2,3~4,5,6' @@ -444,40 +433,6 @@ def test_raise_on_no_columns(self): data = "\n\n\n" self.assertRaises(ValueError, self.read_csv, StringIO(data)) - def test_1000_sep_with_decimal(self): - data = """A|B|C -1|2,334.01|5 -10|13|10. -""" - expected = DataFrame({ - 'A': [1, 10], - 'B': [2334.01, 13], - 'C': [5, 10.] - }) - - tm.assert_equal(expected.A.dtype, 'int64') - tm.assert_equal(expected.B.dtype, 'float') - tm.assert_equal(expected.C.dtype, 'float') - - df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data), sep='|', - thousands=',', decimal='.') - tm.assert_frame_equal(df, expected) - - data_with_odd_sep = """A|B|C -1|2.334,01|5 -10|13|10, -""" - df = self.read_csv(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - - df = self.read_table(StringIO(data_with_odd_sep), - sep='|', thousands='.', decimal=',') - tm.assert_frame_equal(df, expected) - def test_grow_boundary_at_cap(self): # See gh-12494 # diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py index 4d9ce922184d9..57ab9477302c1 100644 --- a/pandas/io/tests/parser/common.py +++ b/pandas/io/tests/parser/common.py @@ -41,10 +41,10 @@ def test_empty_decimal_marker(self): 1|2,334|5 10|13|10. """ - # C parser: supports only length-1 decimals - # Python parser: 'decimal' not supported yet - self.assertRaises(ValueError, self.read_csv, - StringIO(data), decimal='') + # Parsers support only length-1 decimals + msg = 'Only length-1 decimal markers supported' + with tm.assertRaisesRegexp(ValueError, msg): + self.read_csv(StringIO(data), decimal='') def test_read_csv(self): if not compat.PY3: @@ -1236,3 +1236,48 @@ def test_iteration_open_handle(self): result = self.read_table(f, squeeze=True, header=None) expected = Series(['DDD', 'EEE', 'FFF', 'GGG'], name=0) tm.assert_series_equal(result, expected) + + def test_1000_sep_with_decimal(self): + data = """A|B|C +1|2,334.01|5 +10|13|10. +""" + expected = DataFrame({ + 'A': [1, 10], + 'B': [2334.01, 13], + 'C': [5, 10.] + }) + + tm.assert_equal(expected.A.dtype, 'int64') + tm.assert_equal(expected.B.dtype, 'float') + tm.assert_equal(expected.C.dtype, 'float') + + df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') + tm.assert_frame_equal(df, expected) + + data_with_odd_sep = """A|B|C +1|2.334,01|5 +10|13|10, +""" + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') + tm.assert_frame_equal(df, expected) + + def test_euro_decimal_format(self): + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,738797819 +2;121,12;14897,76;DEF;uyt;0,377320872 +3;878,158;108013,434;GHI;rez;2,735694704""" + + df2 = self.read_csv(StringIO(data), sep=';', decimal=',') + self.assertEqual(df2['Number1'].dtype, float) + self.assertEqual(df2['Number2'].dtype, float) + self.assertEqual(df2['Number3'].dtype, float)