Skip to content

Commit

Permalink
BUG: Some sas7bdat files with many columns are not parseable by read_…
Browse files Browse the repository at this point in the history
  • Loading branch information
troels authored and victor committed Sep 30, 2018
1 parent e32b255 commit 9cf7b60
Show file tree
Hide file tree
Showing 7 changed files with 62 additions and 31 deletions.
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,8 @@ I/O
- :func:`read_excel()` will correctly show the deprecation warning for previously deprecated ``sheetname`` (:issue:`17994`)
- :func:`read_csv()` will correctly parse timezone-aware datetimes (:issue:`22256`)
- :func:`read_sas()` will parse numbers in sas7bdat-files that have width less than 8 bytes correctly. (:issue:`21616`)
- :func:`read_sas()` will correctly parse sas7bdat files with many columns (:issue:`22628`)
- :func:`read_sas()` will correctly parse sas7bdat files whose data page type also has bit 7 set (so the page type is 128 + 256 = 384) (:issue:`16615`)

Plotting
^^^^^^^^
Expand Down
10 changes: 5 additions & 5 deletions pandas/io/sas/sas.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,8 @@ cdef class Parser(object):
self.parser = parser
self.header_length = self.parser.header_length
self.column_count = parser.column_count
self.lengths = parser._column_data_lengths
self.offsets = parser._column_data_offsets
self.lengths = parser.column_data_lengths()
self.offsets = parser.column_data_offsets()
self.byte_chunk = parser._byte_chunk
self.string_chunk = parser._string_chunk
self.row_length = parser.row_length
Expand All @@ -257,7 +257,7 @@ cdef class Parser(object):
# page indicators
self.update_next_page()

column_types = parser.column_types
column_types = parser.column_types()

# map column types
for j in range(self.column_count):
Expand Down Expand Up @@ -375,7 +375,7 @@ cdef class Parser(object):
if done:
return True
return False
elif self.current_page_type == page_data_type:
elif self.current_page_type & page_data_type == page_data_type:
self.process_byte_array_with_data(
bit_offset + subheader_pointers_offset +
self.current_row_on_page_index * self.row_length,
Expand Down Expand Up @@ -437,7 +437,7 @@ cdef class Parser(object):
elif column_types[j] == column_type_string:
# string
string_chunk[js, current_row] = np.array(source[start:(
start + lngt)]).tostring().rstrip()
start + lngt)]).tostring().rstrip(b"\x00 ")
js += 1

self.current_row_on_page_index += 1
Expand Down
61 changes: 35 additions & 26 deletions pandas/io/sas/sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,15 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
self.compression = ""
self.column_names_strings = []
self.column_names = []
self.column_types = []
self.column_formats = []
self.columns = []

self._current_page_data_subheader_pointers = []
self._cached_page = None
self._column_data_lengths = []
self._column_data_offsets = []
self._column_types = []

self._current_row_in_file_index = 0
self._current_row_on_page_index = 0
self._current_row_in_file_index = 0
Expand All @@ -102,6 +103,19 @@ def __init__(self, path_or_buf, index=None, convert_dates=True,
self._get_properties()
self._parse_metadata()

def column_data_lengths(self):
    """Return the column data lengths as a numpy int64 array.

    The values are taken from ``self._column_data_lengths`` and converted
    with ``np.asarray``.
    """
    lengths = self._column_data_lengths
    return np.asarray(lengths, dtype=np.int64)

def column_data_offsets(self):
    """Return the column offsets as a numpy int64 array.

    The values are taken from ``self._column_data_offsets`` and converted
    with ``np.asarray``.
    """
    offsets = self._column_data_offsets
    return np.asarray(offsets, dtype=np.int64)

def column_types(self):
    """Return the column types as a numpy character array.

    Each element is a single byte: s (string) or d (double). The values
    are taken from ``self._column_types``.
    """
    one_byte_string = np.dtype('S1')
    return np.asarray(self._column_types, dtype=one_byte_string)

def close(self):
try:
self.handle.close()
Expand Down Expand Up @@ -287,8 +301,10 @@ def _process_page_meta(self):
pt = [const.page_meta_type, const.page_amd_type] + const.page_mix_types
if self._current_page_type in pt:
self._process_page_metadata()
return ((self._current_page_type in [256] + const.page_mix_types) or
(self._current_page_data_subheader_pointers is not None))
is_data_page = self._current_page_type & const.page_data_type
is_mix_page = self._current_page_type in const.page_mix_types
return (is_data_page or is_mix_page
or self._current_page_data_subheader_pointers != [])

def _read_page_header(self):
bit_offset = self._page_bit_offset
Expand Down Expand Up @@ -503,12 +519,6 @@ def _process_columnattributes_subheader(self, offset, length):
int_len = self._int_length
column_attributes_vectors_count = (
length - 2 * int_len - 12) // (int_len + 8)
self.column_types = np.empty(
column_attributes_vectors_count, dtype=np.dtype('S1'))
self._column_data_lengths = np.empty(
column_attributes_vectors_count, dtype=np.int64)
self._column_data_offsets = np.empty(
column_attributes_vectors_count, dtype=np.int64)
for i in range(column_attributes_vectors_count):
col_data_offset = (offset + int_len +
const.column_data_offset_offset +
Expand All @@ -520,16 +530,13 @@ def _process_columnattributes_subheader(self, offset, length):
const.column_type_offset + i * (int_len + 8))

x = self._read_int(col_data_offset, int_len)
self._column_data_offsets[i] = x
self._column_data_offsets.append(x)

x = self._read_int(col_data_len, const.column_data_length_length)
self._column_data_lengths[i] = x
self._column_data_lengths.append(x)

x = self._read_int(col_types, const.column_type_length)
if x == 1:
self.column_types[i] = b'd'
else:
self.column_types[i] = b's'
self._column_types.append(b'd' if x == 1 else b's')

def _process_columnlist_subheader(self, offset, length):
# unknown purpose
Expand Down Expand Up @@ -586,7 +593,7 @@ def _process_format_subheader(self, offset, length):
col.name = self.column_names[current_column_number]
col.label = column_label
col.format = column_format
col.ctype = self.column_types[current_column_number]
col.ctype = self._column_types[current_column_number]
col.length = self._column_data_lengths[current_column_number]

self.column_formats.append(column_format)
Expand All @@ -599,7 +606,7 @@ def read(self, nrows=None):
elif nrows is None:
nrows = self.row_count

if len(self.column_types) == 0:
if len(self._column_types) == 0:
self.close()
raise EmptyDataError("No columns to parse from file")

Expand All @@ -610,8 +617,8 @@ def read(self, nrows=None):
if nrows > m:
nrows = m

nd = (self.column_types == b'd').sum()
ns = (self.column_types == b's').sum()
nd = self._column_types.count(b'd')
ns = self._column_types.count(b's')

self._string_chunk = np.empty((ns, nrows), dtype=np.object)
self._byte_chunk = np.zeros((nd, 8 * nrows), dtype=np.uint8)
Expand Down Expand Up @@ -639,11 +646,13 @@ def _read_next_page(self):
self._page_length))

self._read_page_header()
if self._current_page_type == const.page_meta_type:
page_type = self._current_page_type
if page_type == const.page_meta_type:
self._process_page_metadata()
pt = [const.page_meta_type, const.page_data_type]
pt += [const.page_mix_types]
if self._current_page_type not in pt:

is_data_page = page_type & const.page_data_type
pt = [const.page_meta_type] + const.page_mix_types
if not is_data_page and self._current_page_type not in pt:
return self._read_next_page()

return False
Expand All @@ -660,7 +669,7 @@ def _chunk_to_dataframe(self):

name = self.column_names[j]

if self.column_types[j] == b'd':
if self._column_types[j] == b'd':
rslt[name] = self._byte_chunk[jb, :].view(
dtype=self.byte_order + 'd')
rslt[name] = np.asarray(rslt[name], dtype=np.float64)
Expand All @@ -674,7 +683,7 @@ def _chunk_to_dataframe(self):
rslt[name] = pd.to_datetime(rslt[name], unit=unit,
origin="1960-01-01")
jb += 1
elif self.column_types[j] == b's':
elif self._column_types[j] == b's':
rslt[name] = self._string_chunk[js, :]
if self.convert_text and (self.encoding is not None):
rslt[name] = rslt[name].str.decode(
Expand All @@ -686,6 +695,6 @@ def _chunk_to_dataframe(self):
else:
self.close()
raise ValueError("unknown column type %s" %
self.column_types[j])
self._column_types[j])

return rslt
Binary file added pandas/tests/io/sas/data/load_log.sas7bdat
Binary file not shown.
4 changes: 4 additions & 0 deletions pandas/tests/io/sas/data/many_columns.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
DATASRC,PDDOCID,age,agegt89,ASSESSA,ASSESS1,ASSESS3,ASSESS4,ASSESS5,ASSESS6,ASSESS7,week,BECK,conf1,conf2,conf3,demo3,demo4,demo5,demo6,demo7,demo11a,demo11b,demo11c,demo11d,derm1b,derm2,derm3,derm4,derm5a,derm5b,derm7,derm7a,derm7b,derm8,derm9,ECG3,ecgrtxt,ecgrhr,ecgrpr,ecgrqrs,ecgrqrsaxis,ecgrqt,ecgrqtc,ecgrrep,ecgrtime,mmse1,mmse2,mmse3,mmse4,mmse5,mmse6,mmse7,mmse8,mmse9,mmse10,mmse11,mmse12,mmse13,mmse14,mmse15,mmse16,mmse17,mmse18,mmse19,mmse20,mmse,mmsescor,mrf1,mrf2,mrf3,mrf4,mrf5,mrf6,mrf7,mrf8,mrf9,mrf10,mrf11,mrf12,mrf13,nvitl1s,nvitl1d,nvitl1r,nvitl2s,nvitl2d,nvitl2r,nvitl3s,nvitl3d,nvitl3r,nvitl4s,nvitl4d,nvitl4r,nvitl5,nvitl1,nvitl2,nvitl3,nvitl4,phys1,phys1a,phys14,phys15a,phys15b,phys15c,phys15d,phys16a,phys16b,phys16c,phys16d,phys17a,phys17b,phys17c,phys17d,phys18a,phys18b,phys18c,phys18d,phys19a,phys19b,phys20,phys22,phys24,phys26,phys28,PREG1,PREG2,updrsa,updrs1,updrs2,updrs3,updrs4,updrs5a,updrs6a,updrs7a,updrs8a,updrs9a,updrs10a,updrs11a,updrs12a,updrs13a,updrs14a,updrs15a,updrs16a,updrs17a,updrs18a,updrs19a,updrs20a1,updrs20b1,updrs20c1,updrs20d1,updrs20e1,updrs21a1,updrs21b1,updrs22a1,updrs22b1,updrs22c1,updrs22d1,updrs22e1,updrs23a1,updrs23b1,updrs24a1,updrs24b1,updrs25a1,updrs25b1,updrs26a1,updrs26b1,updrs26c1,updrs26d1,updrs27a,updrs28a,updrs29a,updrs30a,updrs31a,updrs32a,updrs33a,updrs34a,updrs35,updrs36,updrs37,updrs38,updrs39,updrs5b,updrs6b,updrs7b,updrs8b,updrs9b,updrs10b,updrs11b,updrs12b,updrs13b,updrs14b,updrs15b,updrs16b,updrs17b,updrs18b,updrs19b,updrs20a2,updrs20b2,updrs20c2,updrs20d2,updrs20e2,updrs21a2,updrs21b2,updrs22a2,updrs22b2,updrs22c2,updrs22d2,updrs22e2,updrs23a2,updrs23b2,updrs24a2,updrs24b2,updrs25a2,updrs25b2,updrs26a2,updrs26b2,updrs26c2,updrs26d2,updrs27b,updrs28b,updrs29b,updrs30b,updrs31b,updrs32b,updrs33b,updrs34b,updrs5c,updrs6c,updrs7c,updrs8c,updrs9c,updrs10c,updrs11c,updrs12c,updrs13c,updrs14c,updrs15c,updrs16c,updrs17c,updrs32c,updrs33c,updrs34c,updrsmental,updrsadl,updrsadlon,updrsadloff,updrsadlmin,updrst
remor,updrstremortreat,updrstremormin,updrsrigid,updrsrigidtreat,updrsrigidmin,updrsmotor,updrsmotortreat,updrsmotormin,updrs,updrstrt,updrsmin,updrs4a,updrs41,updrs42,updrs43,updrs44,updrs45,updrs46,updrs47,updrs48,updrs49,updrs410,updrs411,vitl1s,vitl1d,vitl2,vitl3s,vitl3d,vitl4,vitl5,vitl6,assess,fbeck,conf,demo1,derm,ecg,ecgr,mrf,nvitl,fphys1,fpreg,fupdrs,fupdrs4,vitl,site,race,rImaged,rPD,rPDlt5,rAgeGt30,rHY,rMed,rMelanoma,rPreclude,rNeed,rEligible,gender,incsae,incsusp,incterm,increlated,inctermat,increason,incafter24,incendp,incres,disp2,disp3,disp4,disp6,inex1,inex2,inex3,inex4,inex5,inex6,inex7,inex8,inex9,inex10,inex11,inex12,inex13,inex14,inex15,inex16,inex17,inex18,inex19,inex20,inex21,inex22,inex23,inex24,inex25,inex26,inex27,inex28,treatment,treat,disp,inex,classify,enrollyr,demoyear,dob_yr,inexdays,demodays,onsetdays,diagdays,medstartdays,physdays,phys21dys,phys23dys,phys25dys,phys27dys,phys29dys,confdays,pregdays,nvitldays,nvitlscandays,vitldays,labdays,ecgdays,ecgtestdays,mrfdays,dermdays,dermexamdays,dermbiopdays,mmsedays,beckdays,updrdays,updr4days,assessdays,daystotherapy,dispdays,endpdys,termdys,SAEdys,resdys,lmeddys,wddays,VISIT_NO
a030,ab304,43.0,0.0,0.0,0.0,,,,,,-2.0,0.0,1.0,1.0,,2.0,1.0,19.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,,,,,,,0.0,2.0,ABNORMAL,75.0,150.0,100.0,-3.0,410.0,460.0,2.0,1000.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,1.0,1.0,0.0,3.0,1.0,1.0,1.0,26.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,150.0,94.0,73.0,155.0,96.0,71.0,148.0,91.0,69.0,146.0,67.0,72.0,1.0,42840.0,46080.0,46980.0,30600.0,100.0,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,4.0,4.0,4.0,4.0,2.0,1.0,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.5,0.0,0.0,0.0,1.0,1.0,2.0,2.0,1.0,1.5,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.5,95.0,95.0,7.0,,2.0,1.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,1.5,,1.5,7.5,,7.5,20.0,,20.0,25.0,,25.0,,,,,,,,,,,,,138.0,86.0,72.0,130.0,80.0,80.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,abc,1.0,1.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,2002.0,1914.0,-28.0,-28.0,-404.0,-28.0,0.0,-28.0,,,,,-6.0,-28.0,-13.0,-13.0,-12.0,-28.0,-28.0,-28.0,-28.0,-28.0,-14.0,-14.0,,-28.0,-28.0,-28.0,,-28.0,,659.0,426.0,659.0,,,658.0,100.0,ab
a030,ab304,43.0,0.0,0.0,0.0,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,0.0,1.0,2.0,95.0,95.0,7.0,,2.0,1.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,3.0,,,3.0,0.0,,0.0,3.0,,3.0,13.0,,13.0,16.0,,16.0,,,,,,,,,,,,,140.0,86.0,76.0,132.0,80.0,84.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,0.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,0.0,0.0,,,,,,,,,0.0,,0.0,,659.0,426.0,659.0,,,658.0,100.0,ab
a030,ab304,43.0,0.0,0.0,0.0,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1000.0,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,2.0,0.0,1.0,1.0,0.5,1.0,2.0,90.0,95.0,7.0,,2.0,2.0,2.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,5.0,,,5.0,0.5,,0.5,2.0,,2.0,16.0,,16.0,21.0,,21.0,0.0,,,,,,,,,,,,149.0,88.0,80.0,136.0,90.0,82.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,abc,0.0,0.0,1.0,1.0,1.0,34.0,5.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,,0.0,3.0,0.0,1.0,0.0,4.0,3.0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Placebo,1.0,1.0,1.0,1.0,2002.0,,1914.0,-28.0,,,,0.0,,,,,,,,,,,29.0,29.0,,,,,,,,,29.0,29.0,29.0,,659.0,426.0,659.0,,,658.0,100.0,ab
Binary file added pandas/tests/io/sas/data/many_columns.sas7bdat
Binary file not shown.
16 changes: 16 additions & 0 deletions pandas/tests/io/sas/test_sas7bdat.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,22 @@ def test_compact_numerical_values(datapath):
tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Check that column information is looked up in more places, so a
    # file with many columns parses to the same frame as its CSV export
    # (PR #22628)
    sas_path = datapath("io", "sas", "data", "many_columns.sas7bdat")
    result = pd.read_sas(sas_path, encoding='latin-1')

    csv_path = datapath("io", "sas", "data", "many_columns.csv")
    expected = pd.read_csv(csv_path, encoding='latin-1')

    tm.assert_frame_equal(result, expected)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    sas_path = datapath("io", "sas", "data", "load_log.sas7bdat")
    frame = pd.read_sas(sas_path, encoding='latin-1')
    row_count = len(frame)
    assert row_count == 2097


def test_zero_variables(datapath):
# Check if the SAS file has zero variables (PR #18184)
fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
Expand Down

0 comments on commit 9cf7b60

Please sign in to comment.