diff --git a/pandas/tests/io/parser/comment.py b/pandas/tests/io/parser/comment.py deleted file mode 100644 index fc2310ca1daaf..0000000000000 --- a/pandas/tests/io/parser/comment.py +++ /dev/null @@ -1,119 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that comments are properly handled during parsing -for all of the parsers defined in parsers.py -""" - -import numpy as np - -from pandas.compat import StringIO - -from pandas import DataFrame -import pandas.util.testing as tm - - -class CommentTests(object): - - def test_comment(self): - data = """A,B,C -1,2.,4.#hello world -5.,NaN,10.0 -""" - expected = np.array([[1., 2., 4.], - [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#') - tm.assert_numpy_array_equal(df.values, expected) - - df = self.read_table(StringIO(data), sep=',', comment='#', - na_values=['NaN']) - tm.assert_numpy_array_equal(df.values, expected) - - def test_line_comment(self): - data = """# empty -A,B,C -1,2.,4.#hello world -#ignore this line -5.,NaN,10.0 -""" - expected = np.array([[1., 2., 4.], - [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#') - tm.assert_numpy_array_equal(df.values, expected) - - # check with delim_whitespace=True - df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', - delim_whitespace=True) - tm.assert_almost_equal(df.values, expected) - - # custom line terminator is not supported - # with the Python parser yet - if self.engine == 'c': - expected = np.array([[1., 2., 4.], - [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data.replace('\n', '*')), - comment='#', lineterminator='*') - tm.assert_numpy_array_equal(df.values, expected) - - def test_comment_skiprows(self): - data = """# empty -random line -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # this should ignore the first four lines (including comments) - expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#', skiprows=4) - tm.assert_numpy_array_equal(df.values, expected) - - def test_comment_header(self): - data = """# empty -# second empty line -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # header should begin at the second non-comment line - expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#', header=1) - tm.assert_numpy_array_equal(df.values, expected) - - def test_comment_skiprows_header(self): - data = """# empty -# second empty line -# third empty line -X,Y,Z -1,2,3 -A,B,C -1,2.,4. -5.,NaN,10.0 -""" - # skiprows should skip the first 4 lines (including comments), while - # header should start from the second non-commented line starting - # with line 5 - expected = np.array([[1., 2., 4.], [5., np.nan, 10.]]) - df = self.read_csv(StringIO(data), comment='#', skiprows=4, header=1) - tm.assert_numpy_array_equal(df.values, expected) - - def test_custom_comment_char(self): - data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" - - result = self.read_csv(StringIO(data), comment='#') - expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) - tm.assert_frame_equal(result, expected) - - def test_commment_first_line(self): - # see gh-4623 - data = '# notes\na,b,c\n# more notes\n1,2,3' - - expected = DataFrame([[1, 2, 3]], columns=['a', 'b', 'c']) - result = self.read_csv(StringIO(data), comment='#') - tm.assert_frame_equal(result, expected) - - expected = DataFrame({0: ['a', '1'], 1: ['b', '2'], 2: ['c', '3']}) - result = self.read_csv(StringIO(data), comment='#', header=None) - tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/compression.py b/pandas/tests/io/parser/compression.py deleted file mode 100644 index e5ada41c06762..0000000000000 --- a/pandas/tests/io/parser/compression.py +++ /dev/null @@ -1,136 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests compressed data parsing functionality for all -of the parsers defined in parsers.py -""" - -import bz2 -import gzip - -import pytest - -import pandas.compat as compat -import pandas.util._test_decorators as td - -import pandas as pd -import pandas.util.testing as tm - -try: - lzma = compat.import_lzma() -except ImportError: - lzma = None - - -class CompressionTests(object): - - def test_zip(self): - import zipfile - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean('test_file.zip') as path: - with zipfile.ZipFile(path, mode='w') as tmp: - tmp.writestr('test_file', data) - - result = self.read_csv(path, compression='zip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - if self.engine is not 'python': - with open(path, 'rb') as f: - result = self.read_csv(f, compression='zip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('combined_zip.zip') as path: - inner_file_names = ['test_file', 'second_file'] - with zipfile.ZipFile(path, mode='w') as tmp: - for file_name in inner_file_names: - tmp.writestr(file_name, data) - - with pytest.raises(ValueError, match='Multiple files'): - self.read_csv(path, compression='zip') - - with pytest.raises(ValueError, match='Multiple files'): - self.read_csv(path, compression='infer') - - with tm.ensure_clean() as path: - with zipfile.ZipFile(path, mode='w'): - pass - - with pytest.raises(ValueError, match='Zero files'): - self.read_csv(path, compression='zip') - - with tm.ensure_clean() as path: - with open(path, 'wb') as f: - pytest.raises(zipfile.BadZipfile, self.read_csv, - f, compression='zip') - - @pytest.mark.parametrize('compress_type, compress_method, ext', [ - ('gzip', gzip.GzipFile, 'gz'), - ('bz2', bz2.BZ2File, 'bz2'), - pytest.param('xz', getattr(lzma, 'LZMAFile', None), 'xz', - marks=td.skip_if_no_lzma) - ]) - def test_other_compression(self, compress_type, compress_method, ext): - - with open(self.csv1, 'rb') as data_file: - data = data_file.read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - with compress_method(path, mode='wb') as tmp: - tmp.write(data) - - result = self.read_csv(path, compression=compress_type) - tm.assert_frame_equal(result, expected) - - if compress_type == 'bz2': - pytest.raises(ValueError, self.read_csv, - path, compression='bz3') - - with open(path, 'rb') as fin: - result = self.read_csv(fin, compression=compress_type) - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean('test.{}'.format(ext)) as path: - with compress_method(path, mode='wb') as tmp: - tmp.write(data) - result = self.read_csv(path, compression='infer') - tm.assert_frame_equal(result, expected) - - def test_read_csv_infer_compression(self): - # see gh-9770 - expected = self.read_csv(self.csv1, index_col=0, parse_dates=True) - - with open(self.csv1) as f: - inputs = [self.csv1, self.csv1 + '.gz', - self.csv1 + '.bz2', f] - - for inp in inputs: - df = self.read_csv(inp, index_col=0, parse_dates=True, - compression='infer') - - tm.assert_frame_equal(expected, df) - - def test_read_csv_compressed_utf16_example(self, datapath): - # GH18071 - path = datapath('io', 'parser', 'data', 'utf16_ex_small.zip') - - result = self.read_csv(path, encoding='utf-16', - compression='zip', sep='\t') - expected = pd.DataFrame({ - u'Country': [u'Venezuela', u'Venezuela'], - u'Twitter': [u'Hugo Chávez Frías', u'Henrique Capriles R.'] - }) - - tm.assert_frame_equal(result, expected) - - def test_invalid_compression(self): - msg = 'Unrecognized compression type: sfark' - with pytest.raises(ValueError, match=msg): - self.read_csv('test_file.zip', compression='sfark') diff --git a/pandas/tests/io/parser/conftest.py b/pandas/tests/io/parser/conftest.py index a82e3e47c6931..857cdea942459 100644 --- a/pandas/tests/io/parser/conftest.py +++ b/pandas/tests/io/parser/conftest.py @@ -1,3 +1,5 @@ +import os + import pytest from pandas import read_csv, read_table @@ -49,6 +51,11 @@ def csv_dir_path(datapath): return datapath("io", "parser", "data") +@pytest.fixture +def csv1(csv_dir_path): + return os.path.join(csv_dir_path, "test1.csv") + + _cParserHighMemory = CParserHighMemory() _cParserLowMemory = CParserLowMemory() _pythonParser = PythonParser() diff --git a/pandas/tests/io/parser/converters.py b/pandas/tests/io/parser/converters.py deleted file mode 100644 index f8a498172eaf9..0000000000000 --- a/pandas/tests/io/parser/converters.py +++ /dev/null @@ -1,153 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests column conversion functionality during parsing -for all of the parsers defined in parsers.py -""" - -from datetime import datetime - -import numpy as np -import pytest - -from pandas._libs.tslib import Timestamp -from pandas.compat import StringIO, lmap, parse_date - -import pandas as pd -from pandas import DataFrame, Index -import pandas.util.testing as tm - - -class ConverterTests(object): - - def test_converters_type_must_be_dict(self): - data = """index,A,B,C,D -foo,2,3,4,5 -""" - with pytest.raises(TypeError, match='Type converters.+'): - self.read_csv(StringIO(data), converters=0) - - def test_converters(self): - data = """A,B,C,D -a,1,2,01/01/2009 -b,3,4,01/02/2009 -c,4,5,01/03/2009 -""" - result = self.read_csv(StringIO(data), converters={'D': parse_date}) - result2 = self.read_csv(StringIO(data), converters={3: parse_date}) - - expected = self.read_csv(StringIO(data)) - expected['D'] = expected['D'].map(parse_date) - - assert isinstance(result['D'][0], (datetime, Timestamp)) - tm.assert_frame_equal(result, expected) - tm.assert_frame_equal(result2, expected) - - # produce integer - converter = lambda x: int(x.split('/')[2]) - result = self.read_csv(StringIO(data), converters={'D': converter}) - expected = self.read_csv(StringIO(data)) - expected['D'] = expected['D'].map(converter) - tm.assert_frame_equal(result, expected) - - def test_converters_no_implicit_conv(self): - # see gh-2184 - data = """000102,1.2,A\n001245,2,B""" - f = lambda x: x.strip() - converter = {0: f} - df = self.read_csv(StringIO(data), header=None, converters=converter) - assert df[0].dtype == object - - def test_converters_euro_decimal_format(self): - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - f = lambda x: float(x.replace(",", ".")) - converter = {'Number1': f, 'Number2': f, 'Number3': f} - df2 = self.read_csv(StringIO(data), sep=';', converters=converter) - assert df2['Number1'].dtype == float - assert df2['Number2'].dtype == float - assert df2['Number3'].dtype == float - - def test_converter_return_string_bug(self): - # see gh-583 - data = """Id;Number1;Number2;Text1;Text2;Number3 -1;1521,1541;187101,9543;ABC;poi;4,738797819 -2;121,12;14897,76;DEF;uyt;0,377320872 -3;878,158;108013,434;GHI;rez;2,735694704""" - f = lambda x: float(x.replace(",", ".")) - converter = {'Number1': f, 'Number2': f, 'Number3': f} - df2 = self.read_csv(StringIO(data), sep=';', converters=converter) - assert df2['Number1'].dtype == float - - def test_converters_corner_with_nas(self): - # skip aberration observed on Win64 Python 3.2.2 - if hash(np.int64(-1)) != -2: - pytest.skip("skipping because of windows hash on Python" - " 3.2.2") - - data = """id,score,days -1,2,12 -2,2-5, -3,,14+ -4,6-12,2""" - - def convert_days(x): - x = x.strip() - if not x: - return np.nan - - is_plus = x.endswith('+') - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - return x - - def convert_days_sentinel(x): - x = x.strip() - if not x: - return np.nan - - is_plus = x.endswith('+') - if is_plus: - x = int(x[:-1]) + 1 - else: - x = int(x) - return x - - def convert_score(x): - x = x.strip() - if not x: - return np.nan - if x.find('-') > 0: - valmin, valmax = lmap(int, x.split('-')) - val = 0.5 * (valmin + valmax) - else: - val = float(x) - - return val - - fh = StringIO(data) - result = self.read_csv(fh, converters={'score': convert_score, - 'days': convert_days}, - na_values=['', None]) - assert pd.isna(result['days'][1]) - - fh = StringIO(data) - result2 = self.read_csv(fh, converters={'score': convert_score, - 'days': convert_days_sentinel}, - na_values=['', None]) - tm.assert_frame_equal(result, result2) - - def test_converter_index_col_bug(self): - # see gh-1835 - data = "A;B\n1;2\n3;4" - - rs = self.read_csv(StringIO(data), sep=';', index_col='A', - converters={'A': lambda x: x}) - - xp = DataFrame({'B': [2, 4]}, index=Index([1, 3], name='A')) - tm.assert_frame_equal(rs, xp) - assert rs.index.name == xp.index.name diff --git a/pandas/tests/io/parser/dialect.py b/pandas/tests/io/parser/dialect.py deleted file mode 100644 index aa89f3167788a..0000000000000 --- a/pandas/tests/io/parser/dialect.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- - -""" -Tests that dialects are properly handled during parsing -for all of the parsers defined in parsers.py -""" - -import csv - -import pytest - -from pandas.compat import StringIO -from pandas.errors import ParserWarning - -from pandas import DataFrame -import pandas.util.testing as tm - - -class DialectTests(object): - - def test_dialect(self): - data = """\ -label1,label2,label3 -index1,"a,c,e -index2,b,d,f -""" - - dia = csv.excel() - dia.quoting = csv.QUOTE_NONE - with tm.assert_produces_warning(ParserWarning): - df = self.read_csv(StringIO(data), dialect=dia) - - data = '''\ -label1,label2,label3 -index1,a,c,e -index2,b,d,f -''' - exp = self.read_csv(StringIO(data)) - exp.replace('a', '"a', inplace=True) - tm.assert_frame_equal(df, exp) - - def test_dialect_str(self): - data = """\ -fruit:vegetable -apple:brocolli -pear:tomato -""" - exp = DataFrame({ - 'fruit': ['apple', 'pear'], - 'vegetable': ['brocolli', 'tomato'] - }) - csv.register_dialect('mydialect', delimiter=':') - with tm.assert_produces_warning(ParserWarning): - df = self.read_csv(StringIO(data), dialect='mydialect') - - tm.assert_frame_equal(df, exp) - csv.unregister_dialect('mydialect') - - def test_invalid_dialect(self): - class InvalidDialect(object): - pass - - data = 'a\n1' - msg = 'Invalid dialect' - - with pytest.raises(ValueError, match=msg): - self.read_csv(StringIO(data), dialect=InvalidDialect) - - def test_dialect_conflict(self): - data = 'a,b\n1,2' - dialect = 'excel' - exp = DataFrame({'a': [1], 'b': [2]}) - - with tm.assert_produces_warning(None): - df = self.read_csv(StringIO(data), delimiter=',', dialect=dialect) - tm.assert_frame_equal(df, exp) - - with tm.assert_produces_warning(ParserWarning): - df = self.read_csv(StringIO(data), delimiter='.', dialect=dialect) - tm.assert_frame_equal(df, exp) diff --git a/pandas/tests/io/parser/test_comment.py b/pandas/tests/io/parser/test_comment.py new file mode 100644 index 0000000000000..299a04f876bd1 --- /dev/null +++ b/pandas/tests/io/parser/test_comment.py @@ -0,0 +1,136 @@ +# -*- coding: utf-8 -*- + +""" +Tests that comments are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pytest + +from pandas.compat import StringIO + +from pandas import DataFrame +import pandas.util.testing as tm + + +@pytest.mark.parametrize("na_values", [None, ["NaN"]]) +def test_comment(all_parsers, na_values): + parser = all_parsers + data = """A,B,C +1,2.,4.#hello world +5.,NaN,10.0 +""" + expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data), comment="#", + na_values=na_values) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("read_kwargs", [ + dict(), + dict(lineterminator="*"), + dict(delim_whitespace=True), +]) +def test_line_comment(all_parsers, read_kwargs): + parser = all_parsers + data = """# empty +A,B,C +1,2.,4.#hello world +#ignore this line +5.,NaN,10.0 +""" + if read_kwargs.get("delim_whitespace"): + data = data.replace(",", " ") + elif read_kwargs.get("lineterminator"): + if parser.engine != "c": + pytest.skip("Custom terminator not supported with Python engine") + + data = data.replace("\n", read_kwargs.get("lineterminator")) + + read_kwargs["comment"] = "#" + result = parser.read_csv(StringIO(data), **read_kwargs) + + expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + tm.assert_frame_equal(result, expected) + + +def test_comment_skiprows(all_parsers): + parser = all_parsers + data = """# empty +random line +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # This should ignore the first four lines (including comments). + expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data), comment="#", skiprows=4) + tm.assert_frame_equal(result, expected) + + +def test_comment_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Header should begin at the second non-comment line. + expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data), comment="#", header=1) + tm.assert_frame_equal(result, expected) + + +def test_comment_skiprows_header(all_parsers): + parser = all_parsers + data = """# empty +# second empty line +# third empty line +X,Y,Z +1,2,3 +A,B,C +1,2.,4. +5.,NaN,10.0 +""" + # Skiprows should skip the first 4 lines (including comments), + # while header should start from the second non-commented line, + # starting with line 5. + expected = DataFrame([[1., 2., 4.], [5., np.nan, 10.]], + columns=["A", "B", "C"]) + result = parser.read_csv(StringIO(data), comment="#", skiprows=4, header=1) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("comment_char", ["#", "~", "&", "^", "*", "@"]) +def test_custom_comment_char(all_parsers, comment_char): + parser = all_parsers + data = "a,b,c\n1,2,3#ignore this!\n4,5,6#ignorethistoo" + result = parser.read_csv(StringIO(data.replace("#", comment_char)), + comment=comment_char) + + expected = DataFrame([[1, 2, 3], [4, 5, 6]], columns=["a", "b", "c"]) + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("header", ["infer", None]) +def test_comment_first_line(all_parsers, header): + # see gh-4623 + parser = all_parsers + data = "# notes\na,b,c\n# more notes\n1,2,3" + + if header is None: + expected = DataFrame({0: ["a", "1"], 1: ["b", "2"], 2: ["c", "3"]}) + else: + expected = DataFrame([[1, 2, 3]], columns=["a", "b", "c"]) + + result = parser.read_csv(StringIO(data), comment="#", header=header) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_compression.py b/pandas/tests/io/parser/test_compression.py new file mode 100644 index 0000000000000..9922e1bbf1613 --- /dev/null +++ b/pandas/tests/io/parser/test_compression.py @@ -0,0 +1,212 @@ +# -*- coding: utf-8 -*- + +""" +Tests compressed data parsing functionality for all +of the parsers defined in parsers.py +""" + +import bz2 +import gzip +import os +import zipfile + +import pytest + +import pandas.compat as compat + +import pandas as pd +import pandas.util.testing as tm + + +def lzma_file(): + """ + Try to load the `LZMAFile` class from `backports.lzma`. + + Returns + ------- + klass : type or None + """ + try: + lzma = compat.import_lzma() + except ImportError: + lzma = None + + return getattr(lzma, "LZMAFile", None) + + +def write_to_compressed(compress_type, path, data, dest="test"): + """ + Write data to a compressed file. + + Parameters + ---------- + compress_type : type + The compression type (or class) to use. + path : str + The file path to write the data. + data : str + The data to write. + dest : str, default "test" + The destination file (for ZIP only) + """ + # compression --> compression_method + compression_mappings = { + "zip": zipfile.ZipFile, + "gzip": gzip.GzipFile, + "bz2": bz2.BZ2File, + "xz": lzma_file(), + } + + compress_method = compression_mappings[compress_type] + + if compress_type == "zip": + mode = "w" + args = (dest, data) + method = "writestr" + else: + mode = "wb" + args = (data,) + method = "write" + + with compress_method(path, mode=mode) as f: + getattr(f, method)(*args) + + +@pytest.fixture(params=[True, False]) +def buffer(request): + return request.param + + +@pytest.fixture +def parser_and_data(all_parsers, csv1): + parser = all_parsers + + with open(csv1, "rb") as f: + data = f.read() + expected = parser.read_csv(csv1) + + return parser, data, expected + + +@pytest.mark.parametrize("compression", ["zip", "infer", "zip2"]) +def test_zip(parser_and_data, compression): + parser, data, expected = parser_and_data + + with tm.ensure_clean("test_file.zip") as path: + with zipfile.ZipFile(path, mode="w") as tmp: + tmp.writestr("test_file", data) + + if compression == "zip2": + with open(path, "rb") as f: + result = parser.read_csv(f, compression="zip") + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("compression", ["zip", "infer"]) +def test_zip_error_multiple_files(parser_and_data, compression): + parser, data, expected = parser_and_data + + with tm.ensure_clean("combined_zip.zip") as path: + inner_file_names = ["test_file", "second_file"] + + with zipfile.ZipFile(path, mode="w") as tmp: + for file_name in inner_file_names: + tmp.writestr(file_name, data) + + with pytest.raises(ValueError, match="Multiple files"): + parser.read_csv(path, compression=compression) + + +def test_zip_error_no_files(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with zipfile.ZipFile(path, mode="w"): + pass + + with pytest.raises(ValueError, match="Zero files"): + parser.read_csv(path, compression="zip") + + +def test_zip_error_invalid_zip(parser_and_data): + parser, _, _ = parser_and_data + + with tm.ensure_clean() as path: + with open(path, "wb") as f: + with pytest.raises(zipfile.BadZipfile, + match="File is not a zip file"): + parser.read_csv(f, compression="zip") + + +@pytest.mark.parametrize("filename", [None, "test.{ext}"]) +def test_compression(parser_and_data, compression_only, buffer, filename): + parser, data, expected = parser_and_data + compress_type = compression_only + + ext = "gz" if compress_type == "gzip" else compress_type + filename = filename if filename is None else filename.format(ext=ext) + + if filename and buffer: + pytest.skip("Cannot deduce compression from " + "buffer of compressed data.") + + with tm.ensure_clean(filename=filename) as path: + write_to_compressed(compress_type, path, data) + compression = "infer" if filename else compress_type + + if buffer: + with open(path, "rb") as f: + result = parser.read_csv(f, compression=compression) + else: + result = parser.read_csv(path, compression=compression) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("ext", [None, "gz", "bz2"]) +def test_infer_compression(all_parsers, csv1, buffer, ext): + # see gh-9770 + parser = all_parsers + kwargs = dict(index_col=0, parse_dates=True) + + expected = parser.read_csv(csv1, **kwargs) + kwargs["compression"] = "infer" + + if buffer: + with open(csv1) as f: + result = parser.read_csv(f, **kwargs) + else: + ext = "." + ext if ext else "" + result = parser.read_csv(csv1 + ext, **kwargs) + + tm.assert_frame_equal(result, expected) + + +def test_compression_utf16_encoding(all_parsers, csv_dir_path): + # see gh-18071 + parser = all_parsers + path = os.path.join(csv_dir_path, "utf16_ex_small.zip") + + result = parser.read_csv(path, encoding="utf-16", + compression="zip", sep="\t") + expected = pd.DataFrame({ + u"Country": [u"Venezuela", u"Venezuela"], + u"Twitter": [u"Hugo Chávez Frías", u"Henrique Capriles R."] + }) + + tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("invalid_compression", ["sfark", "bz3", "zipper"]) +def test_invalid_compression(all_parsers, invalid_compression): + parser = all_parsers + compress_kwargs = dict(compression=invalid_compression) + + msg = ("Unrecognized compression " + "type: {compression}".format(**compress_kwargs)) + + with pytest.raises(ValueError, match=msg): + parser.read_csv("test_file.zip", **compress_kwargs) diff --git a/pandas/tests/io/parser/test_converters.py b/pandas/tests/io/parser/test_converters.py new file mode 100644 index 0000000000000..47bbae0274fd3 --- /dev/null +++ b/pandas/tests/io/parser/test_converters.py @@ -0,0 +1,158 @@ +# -*- coding: utf-8 -*- + +""" +Tests column conversion functionality during parsing +for all of the parsers defined in parsers.py +""" + +import numpy as np +import pytest + +from pandas.compat import StringIO, lmap, parse_date + +import pandas as pd +from pandas import DataFrame, Index +import pandas.util.testing as tm + + +def test_converters_type_must_be_dict(all_parsers): + parser = all_parsers + data = """index,A,B,C,D +foo,2,3,4,5 +""" + + with pytest.raises(TypeError, match="Type converters.+"): + parser.read_csv(StringIO(data), converters=0) + + +@pytest.mark.parametrize("column", [3, "D"]) +@pytest.mark.parametrize("converter", [ + parse_date, + lambda x: int(x.split("/")[2]) # Produce integer. +]) +def test_converters(all_parsers, column, converter): + parser = all_parsers + data = """A,B,C,D +a,1,2,01/01/2009 +b,3,4,01/02/2009 +c,4,5,01/03/2009 +""" + result = parser.read_csv(StringIO(data), converters={column: converter}) + + expected = parser.read_csv(StringIO(data)) + expected["D"] = expected["D"].map(converter) + + tm.assert_frame_equal(result, expected) + + +def test_converters_no_implicit_conv(all_parsers): + # see gh-2184 + parser = all_parsers + data = """000102,1.2,A\n001245,2,B""" + + converters = {0: lambda x: x.strip()} + result = parser.read_csv(StringIO(data), header=None, + converters=converters) + + # Column 0 should not be casted to numeric and should remain as object. + expected = DataFrame([["000102", 1.2, "A"], ["001245", 2, "B"]]) + tm.assert_frame_equal(result, expected) + + +def test_converters_euro_decimal_format(all_parsers): + # see gh-583 + converters = dict() + parser = all_parsers + + data = """Id;Number1;Number2;Text1;Text2;Number3 +1;1521,1541;187101,9543;ABC;poi;4,7387 +2;121,12;14897,76;DEF;uyt;0,3773 +3;878,158;108013,434;GHI;rez;2,7356""" + converters["Number1"] = converters["Number2"] =\ + converters["Number3"] = lambda x: float(x.replace(",", ".")) + + result = parser.read_csv(StringIO(data), sep=";", converters=converters) + expected = DataFrame([[1, 1521.1541, 187101.9543, "ABC", "poi", 4.7387], + [2, 121.12, 14897.76, "DEF", "uyt", 0.3773], + [3, 878.158, 108013.434, "GHI", "rez", 2.7356]], + columns=["Id", "Number1", "Number2", + "Text1", "Text2", "Number3"]) + tm.assert_frame_equal(result, expected) + + +def test_converters_corner_with_nans(all_parsers): + parser = all_parsers + data = """id,score,days +1,2,12 +2,2-5, +3,,14+ +4,6-12,2""" + + # Example converters. + def convert_days(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_days_sentinel(x): + x = x.strip() + + if not x: + return np.nan + + is_plus = x.endswith("+") + + if is_plus: + x = int(x[:-1]) + 1 + else: + x = int(x) + + return x + + def convert_score(x): + x = x.strip() + + if not x: + return np.nan + + if x.find("-") > 0: + val_min, val_max = lmap(int, x.split("-")) + val = 0.5 * (val_min + val_max) + else: + val = float(x) + + return val + + results = [] + + for day_converter in [convert_days, convert_days_sentinel]: + result = parser.read_csv(StringIO(data), + converters={"score": convert_score, + "days": day_converter}, + na_values=["", None]) + assert pd.isna(result["days"][1]) + results.append(result) + + tm.assert_frame_equal(results[0], results[1]) + + +def test_converter_index_col_bug(all_parsers): + # see gh-1835 + parser = all_parsers + data = "A;B\n1;2\n3;4" + + rs = parser.read_csv(StringIO(data), sep=";", index_col="A", + converters={"A": lambda x: x}) + + xp = DataFrame({"B": [2, 4]}, index=Index([1, 3], name="A")) + tm.assert_frame_equal(rs, xp) diff --git a/pandas/tests/io/parser/test_dialect.py b/pandas/tests/io/parser/test_dialect.py new file mode 100644 index 0000000000000..b005bf4f2d212 --- /dev/null +++ b/pandas/tests/io/parser/test_dialect.py @@ -0,0 +1,91 @@ +# -*- coding: utf-8 -*- + +""" +Tests that dialects are properly handled during parsing +for all of the parsers defined in parsers.py +""" + +import csv + +import pytest + +from pandas.compat import StringIO +from pandas.errors import ParserWarning + +from pandas import DataFrame +import pandas.util.testing as tm + + +def test_dialect(all_parsers): + parser = all_parsers + data = """\ +label1,label2,label3 +index1,"a,c,e +index2,b,d,f +""" + + dia = csv.excel() + dia.quoting = csv.QUOTE_NONE + + # Conflicting dialect quoting. + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(StringIO(data), dialect=dia) + + data = """\ +label1,label2,label3 +index1,a,c,e +index2,b,d,f +""" + exp = parser.read_csv(StringIO(data)) + exp.replace("a", "\"a", inplace=True) + tm.assert_frame_equal(df, exp) + + +def test_dialect_str(all_parsers): + dialect_name = "mydialect" + parser = all_parsers + data = """\ +fruit:vegetable +apple:broccoli +pear:tomato +""" + exp = DataFrame({ + "fruit": ["apple", "pear"], + "vegetable": ["broccoli", "tomato"] + }) + csv.register_dialect(dialect_name, delimiter=":") + + # Conflicting dialect delimiter. + with tm.assert_produces_warning(ParserWarning): + df = parser.read_csv(StringIO(data), dialect=dialect_name) + + tm.assert_frame_equal(df, exp) + csv.unregister_dialect(dialect_name) + + +def test_invalid_dialect(all_parsers): + class InvalidDialect(object): + pass + + data = "a\n1" + parser = all_parsers + msg = "Invalid dialect" + + with pytest.raises(ValueError, match=msg): + parser.read_csv(StringIO(data), dialect=InvalidDialect) + + +@pytest.mark.parametrize("delimiter", [",", "."]) +def test_dialect_conflict(all_parsers, delimiter): + data = "a,b\n1,2" + dialect = "excel" + parser = all_parsers + + expected = DataFrame({"a": [1], "b": [2]}) + warning_klass = None if delimiter == "," else ParserWarning + + with tm.assert_produces_warning(warning_klass): + result = parser.read_csv(StringIO(data), + delimiter=delimiter, + dialect=dialect) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/parser/test_parsers.py b/pandas/tests/io/parser/test_parsers.py index 13704e2f542ab..2dfcec161342c 100644 --- a/pandas/tests/io/parser/test_parsers.py +++ b/pandas/tests/io/parser/test_parsers.py @@ -11,11 +11,7 @@ from pandas import DataFrame, read_csv, read_table import pandas.util.testing as tm -from .comment import CommentTests from .common import ParserTests -from .compression import CompressionTests -from .converters import ConverterTests -from .dialect import DialectTests from .dtypes import DtypeTests from .header import HeaderTests from .index_col import IndexColTests @@ -29,9 +25,7 @@ from .usecols import UsecolsTests -class BaseParser(CommentTests, CompressionTests, - ConverterTests, DialectTests, - DtypeTests, DupeColumnTests, +class BaseParser(DtypeTests, DupeColumnTests, HeaderTests, IndexColTests, MultithreadTests, NAvaluesTests, ParseDatesTests, ParserTests,