Add extra conversion test for incomplete files
 - Include a new, complete 1.5MB sas7bdat file to test against
 - Also include a truncated 512KB version of the same file to verify
   partial-read behavior (the head one-liner used to create it is in the
   new test)

 - Add a test that reads the incomplete file and demonstrates that all
   of the important SAS file metadata is contained at the beginning of
   the file. Reading the incomplete stream was expected to generate some
   kind of error -- it doesn't. Instead, the result is a partial CSV
   file containing garbage data for the missing rows, with no clear
   indication of where parsing failed.

   The test verifies the last correctly written row and the first
   garbage row thereafter.

 - Requires regenerating the checksums in columnize_checksums.json: set
   generateColumnize to true in columnize_test.go and re-run
   TestGenerateColumnize (see the first sketch after this list)

 - Also capture CSV conversions of project.sas7bdat and
   project_incomplete.sas7bdat to reflect what these conversions
   actually look like.

   NOTE: the original upstream source data is formatted a little
   differently within its CSV. In the original source data, values are
   quoted, precision differs for the RAD field, and the columns appear
   in a different order. Another conversion tool similarly uses values
   like 96 rather than 96.000000 for the RAD field. It's unclear whether
   this is a bug in this version of the library.

   project-source.csv is from the AHS source
   project-converted.csv is converted by https://dumbmatter.com/sas7bdat/

   NOTE: project_incomplete.csv contains exactly the converted output,
   which includes a bunch of extraneous data that could not have come
   from the original file. This clearly seems like a bug, but it
   documents current behavior for partial files accurately; a future PR
   will try to address it. (The second sketch after this list shows how
   a caller can detect the problem today.)

 - Good sources of test data files:

   https://github.com/xiaodaigh/sas7bdat-resources
   https://github.com/olivia76/cpp-sas7bdat/tree/main/test
   https://github.com/tk3369/SASLib.jl/tree/master/test

   project.sas7bdat is from:

   https://www.census.gov/programs-surveys/ahs/data/2013/ahs-2013-public-use-file--puf-/ahs-2013-national-public-use-file--puf-.html
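
 - A minimal sketch of the checksum regeneration flow described above
   (the flag and test names come from this commit message; the exact
   placement of the flag in columnize_test.go may differ):

     // columnize_test.go -- hypothetical placement of the flag
     var generateColumnize = true // set true to rewrite columnize_checksums.json

     // then, from the repository root:
     //   go test -run TestGenerateColumnize
     // flip the flag back to false afterwards so normal runs verify
     // checksums instead of regenerating them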
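
 - Because conversion currently succeeds on truncated input, a caller
   has to detect truncation on their own. A minimal sketch using only
   APIs exercised by the new test below (the row-count comparison is an
   assumed caller-side check, not library behavior):

     package main

     import (
         "bytes"
         "encoding/csv"
         "log"
         "os"

         "github.com/dominodatalab/datareader"
     )

     func main() {
         f, err := os.Open("test_files/data/project_incomplete.sas7bdat")
         if err != nil {
             log.Fatal(err)
         }
         defer f.Close()

         sas, err := datareader.NewSAS7BDATReader(f)
         if err != nil {
             log.Fatal(err)
         }

         buf := new(bytes.Buffer)
         if err := datareader.ToCsv(sas, 1000, csv.NewWriter(buf)); err != nil {
             log.Fatal(err)
         }

         records, err := csv.NewReader(buf).ReadAll()
         if err != nil {
             log.Fatal(err)
         }

         // project_incomplete.sas7bdat declares 46641 rows in its header but
         // converts to 46889 CSV records; the mismatch is the only signal
         if len(records) != sas.RowCount() {
             log.Printf("row count mismatch: header declares %d, CSV has %d",
                 sas.RowCount(), len(records))
         }
     }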
ddl-ebrown committed May 27, 2024
1 parent 464f499 commit 9f529b5
Showing 8 changed files with 186,885 additions and 1 deletion.
69 changes: 69 additions & 0 deletions convert_test.go
@@ -6,6 +6,7 @@ import (
	"os"
	"path/filepath"
	"testing"
	"time"

	"github.com/dominodatalab/datareader"
	"github.com/stretchr/testify/assert"
@@ -50,6 +51,74 @@ func TestToCsvConvertsSAS(t *testing.T) {
	}
}

func TestToCsvConvertsTruncatedSAS(t *testing.T) {
	// project.sas7bdat is normally 1MB+; the truncated copy keeps only the
	// first 512KB, created like:
	// { head -c 524288 >project_incomplete.sas7bdat; } < project.sas7bdat
	f, err := os.Open("test_files/data/project_incomplete.sas7bdat")
	require.NoError(t, err)
	defer f.Close()

	sas, err := datareader.NewSAS7BDATReader(f)
	require.NoError(t, err)
	sas.ConvertDates = true
	sas.TrimStrings = true

	// verify that all the file header metadata processed correctly;
	// these values come from the complete file project.sas7bdat
	assert.Equal(t,
		"PROJECT \x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
		sas.Name)
	assert.Equal(t, "DATA ", sas.FileType)
	assert.Equal(t, "utf-8", sas.FileEncoding)
	assert.True(t, sas.U64)
	assert.Equal(t, "2.6.32-754.43.1.", sas.OSType)
	assert.Equal(t, "x86_64", sas.OSName)
	assert.Equal(t, "9.0401M6", sas.SASRelease)
	assert.Equal(t, "Linux", sas.ServerType)
	assert.Equal(t, 46641, sas.RowCount())
	assert.Equal(t, []string{
		"Type of alteration or repair",
		"Household member performed alteration or repair",
		"Cost of alteration or repair",
		"Edit flag for RAS",
		"Edit flag for RAD",
		"Control number",
	}, sas.ColumnLabels())
	assert.Equal(t, []string{"RAS", "RAH", "RAD", "JRAS", "JRAD", "CONTROL"}, sas.ColumnNames())
	assert.Equal(t, []datareader.ColumnTypeT{
		datareader.SASStringType,
		datareader.SASStringType,
		datareader.SASNumericType,
		datareader.SASStringType,
		datareader.SASStringType,
		datareader.SASStringType,
	}, sas.ColumnTypes())
	// SAS datetimes count seconds from the SAS epoch, 1960-01-01
	tv := float64(1969085979.342952)
	ts := time.Date(1960, 1, 1, 0, 0, 0, 0, time.UTC).Add(time.Duration(tv) * time.Second)
	assert.Equal(t, ts, sas.DateCreated)
	assert.Equal(t, ts, sas.DateModified)

	buf := new(bytes.Buffer)
	w := csv.NewWriter(buf)
	err = datareader.ToCsv(sas, 1000, w)
	require.NoError(t, err)

	r := csv.NewReader(buf)
	records, err := r.ReadAll()
	assert.NoError(t, err)
	assert.NotEmpty(t, records)

	// the complete file holds 46641 records, but because this copy is
	// truncated only 18889 rows parse correctly;
	// sanity-check the last correctly read row
	assert.Equal(t, []string{"57", "2", "4000.000000", "", "", "399074940346"}, records[18888])
	// the next row (and all subsequent rows) are garbage
	assert.Equal(t, []string{"", "", "0.000000", "", "", ""}, records[18889])

	// NOTE: bizarrely, the garbage runs past even the declared row count
	// (46889 records vs 46641 declared, i.e. 248 beyond it), but this
	// documents current behavior should the library be fixed
	assert.Len(t, records, 46889)
}

func TestToCsvConvertsStata(t *testing.T) {
	files, err := filepath.Glob("test_files/data/*.dta")
	require.NoError(t, err)
2 changes: 1 addition & 1 deletion test_files/columnize_checksums.json

Large diffs are not rendered by default.

Binary file added test_files/data/project.sas7bdat
Binary file not shown.
Binary file added test_files/data/project_incomplete.sas7bdat
Binary file not shown.
