From be01817df3a2cd00868cae49314a2bbdb2300c88 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 17:02:59 -0800 Subject: [PATCH 01/10] converted intermediate string_chunk/byte_chunk variables to Dict from 2-d array so we can reference them by name rather than by index --- src/SASLib.jl | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 48b72f4..1ff9ee4 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -106,8 +106,8 @@ mutable struct Handler column_count::Int64 # creator_proc::Union{Void, Vector{UInt8}} - byte_chunk::Array{UInt8, 2} - string_chunk::Array{String, 2} + byte_chunk::Dict{Symbol, Vector{UInt8}} + string_chunk::Dict{Symbol, Vector{Union{Missing, AbstractString}}} current_row_in_chunk_index::Int64 current_page::Int64 @@ -866,8 +866,20 @@ function read_chunk(handler, nrows=0) # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - handler.string_chunk = fill("", (Int64(ns), Int64(nrows))) - handler.byte_chunk = fill(UInt8(0), (Int64(nd), Int64(8 * nrows))) # 8-byte values + + # allocate column space + handler.byte_chunk = Dict() + handler.string_chunk = Dict() + for j in 1:nd+ns + name = Symbol(handler.column_names[j]) + if handler.column_types[j] == column_type_decimal + handler.byte_chunk[name] = fill(UInt8(0), Int64(8 * nrows)) # 8-byte values + elseif handler.column_types[j] == column_type_string + handler.string_chunk[name] = fill(missing, Int64(nrows)) + else + throw(FileFormatError("unknown column type: $(handler.column_types[j])")) + end + end # don't do this or else the state is polluted if user wants to # read lines separately. 
@@ -973,7 +985,7 @@ function _chunk_to_dataframe(handler) m = handler.current_row_in_file_index rslt = Dict() - js, jb = 1, 1 + # js, jb = 1, 1 # println("handler.column_names=$(handler.column_names)") for j in 1:handler.column_count @@ -982,7 +994,7 @@ function _chunk_to_dataframe(handler) if handler.column_types[j] == column_type_decimal # number, date, or datetime # println(" String: size=$(size(handler.byte_chunk))") # println(" Decimal: column $j, name $name, size=$(size(handler.byte_chunk[jb, :]))") - bytes = handler.byte_chunk[jb, :] + bytes = handler.byte_chunk[name] #if j == 1 && length(bytes) < 100 #debug only # println(" bytes=$bytes") #end @@ -999,12 +1011,12 @@ function _chunk_to_dataframe(handler) rslt[name] = datetime_from_float(rslt[name]) end end - jb += 1 + # jb += 1 elseif handler.column_types[j] == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") - rslt[name] = handler.string_chunk[js, :] - js += 1 + rslt[name] = handler.string_chunk[name] + # js += 1 else throw(FileFormatError("Unknown column type $(handler.column_types[j])")) end @@ -1177,6 +1189,7 @@ function process_byte_array_with_data(handler, offset, length) # println(" handler.file_endianness = $(handler.file_endianness)") for j in 1:handler.column_count + name = Symbol(handler.column_names[j]) lngt = lengths[j] # TODO commented out for perf reason. do we need this? 
# if lngt == 0 @@ -1201,10 +1214,10 @@ function process_byte_array_with_data(handler, offset, length) # for k in 1:lngt # byte_chunk[jb, m + k] = source[start + k] # end - @inbounds byte_chunk[jb, m+1:m+lngt] = source[start+1:start+lngt] + byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] jb += 1 elseif ct == column_type_string - @inbounds string_chunk[js, current_row+1] = + string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) #rstrip(decode(source[start+1:(start+lngt)], handler.config.encoding)) js += 1 From 22c485a34ac32d943d5fcace81d0cf551bce3a86 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 22:07:01 -0800 Subject: [PATCH 02/10] implemented include_columns config option --- src/SASLib.jl | 88 ++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 1ff9ee4..5750d92 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -22,6 +22,7 @@ struct ReaderConfig convert_dates::Bool convert_text::Bool convert_header_text::Bool + include_columns::Vector{Union{Symbol, Int64}} verbose_level::Int8 ReaderConfig(filename, config = Dict()) = new(filename, get(config, :encoding, default_encoding), @@ -29,6 +30,7 @@ struct ReaderConfig get(config, :convert_dates, default_convert_dates), get(config, :convert_text, default_convert_text), get(config, :convert_header_text, default_convert_header_text), + get(config, :include_columns, []), get(config, :verbose_level, default_verbose_level)) end @@ -56,9 +58,14 @@ mutable struct Handler compression::Vector{UInt8} column_names_strings::Vector{Vector{UInt8}} column_names::Vector{AbstractString} + column_symbols::Vector{Symbol} column_types::Vector{UInt8} column_formats::Vector{AbstractString} columns::Vector{Column} + + # column indices being read/returned + # tuple of column index, column symbol, column type + column_indices::Vector{Tuple{Int64, Symbol, UInt8}} 
current_page_data_subheader_pointers::Vector{SubHeaderPointer} cached_page::Vector{UInt8} @@ -123,6 +130,7 @@ function open(config::ReaderConfig) handler.compression = b"" handler.column_names_strings = [] handler.column_names = [] + handler.column_symbols = [] handler.columns = [] handler.column_formats = [] handler.current_page_data_subheader_pointers = [] @@ -725,6 +733,7 @@ function _process_columnname_subheader(handler, offset, length) # name = decode(name, handler.config.encoding) # end push!(handler.column_names, name) + push!(handler.column_symbols, Symbol(name)) println2(handler, " i=$i name=$name") end end @@ -863,21 +872,21 @@ function read_chunk(handler, nrows=0) # TODO not the most efficient but normally it should be ok for non-wide tables nd = count(x -> x == column_type_decimal, handler.column_types) ns = count(x -> x == column_type_string, handler.column_types) - # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - # allocate column space + fill_column_indices(handler) + + # allocate columns handler.byte_chunk = Dict() handler.string_chunk = Dict() - for j in 1:nd+ns - name = Symbol(handler.column_names[j]) - if handler.column_types[j] == column_type_decimal + for (k, name, ty) in handler.column_indices + if ty == column_type_decimal handler.byte_chunk[name] = fill(UInt8(0), Int64(8 * nrows)) # 8-byte values - elseif handler.column_types[j] == column_type_string + elseif ty == column_type_string handler.string_chunk[name] = fill(missing, Int64(nrows)) else - throw(FileFormatError("unknown column type: $(handler.column_types[j])")) + throw(FileFormatError("unknown column type: $ty for column $name")) end end @@ -894,6 +903,8 @@ function read_chunk(handler, nrows=0) rslt = _chunk_to_dataframe(handler) perf_chunk_to_data_frame = toq() + # construct column symbols/names from actual results since we may have + # read fewer columns than what's in the file column_symbols = [col for col in keys(rslt)] 
column_names = String.(column_symbols) @@ -985,13 +996,9 @@ function _chunk_to_dataframe(handler) m = handler.current_row_in_file_index rslt = Dict() - # js, jb = 1, 1 # println("handler.column_names=$(handler.column_names)") - for j in 1:handler.column_count - - name = Symbol(handler.column_names[j]) - - if handler.column_types[j] == column_type_decimal # number, date, or datetime + for (k, name, ty) in handler.column_indices + if ty == column_type_decimal # number, date, or datetime # println(" String: size=$(size(handler.byte_chunk))") # println(" Decimal: column $j, name $name, size=$(size(handler.byte_chunk[jb, :]))") bytes = handler.byte_chunk[name] @@ -1004,19 +1011,17 @@ function _chunk_to_dataframe(handler) #rslt[name] = bswap(rslt[name]) rslt[name] = values if handler.config.convert_dates - if handler.column_formats[j] in sas_date_formats + if handler.column_formats[k] in sas_date_formats rslt[name] = date_from_float(rslt[name]) - elseif handler.column_formats[j] in sas_datetime_formats + elseif handler.column_formats[k] in sas_datetime_formats # TODO probably have to deal with timezone somehow rslt[name] = datetime_from_float(rslt[name]) end end - # jb += 1 - elseif handler.column_types[j] == column_type_string + elseif ty == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") rslt[name] = handler.string_chunk[name] - # js += 1 else throw(FileFormatError("Unknown column type $(handler.column_types[j])")) end @@ -1176,31 +1181,11 @@ function process_byte_array_with_data(handler, offset, length) byte_chunk = handler.byte_chunk string_chunk = handler.string_chunk s = 8 * current_row - js = 1 - jb = 1 - - # if current_row == 1 - # println(" current_row = $current_row") - # println(" column_types = $column_types") - # println(" lengths = $lengths") - # println(" offsets = $offsets") - # end - # println(" s = $s") - # println(" handler.file_endianness 
= $(handler.file_endianness)") - - for j in 1:handler.column_count - name = Symbol(handler.column_names[j]) - lngt = lengths[j] - # TODO commented out for perf reason. do we need this? - # if lngt == 0 - # break - # end - #if j == 1 - # println(" lngt = $lngt") - #end - #println(lngt) - start = offsets[j] - ct = column_types[j] + + for (k, name, ty) in handler.column_indices + lngt = lengths[k] + start = offsets[k] + ct = column_types[k] if ct == column_type_decimal # The data may have 3,4,5,6,7, or 8 bytes (lngt) # and we need to copy into an 8-byte destination. @@ -1215,12 +1200,9 @@ function process_byte_array_with_data(handler, offset, length) # byte_chunk[jb, m + k] = source[start + k] # end byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] - jb += 1 elseif ct == column_type_string string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) - #rstrip(decode(source[start+1:(start+lngt)], handler.config.encoding)) - js += 1 end end @@ -1476,4 +1458,18 @@ function currentpos(handler) return d end +# fill column indices as a dictionary (key = column index, value = column symbol) +function fill_column_indices(handler) + handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() + for j in 1:length(handler.column_symbols) + name = handler.column_symbols[j] + if handler.config.include_columns == [] || + j in handler.config.include_columns || + name in handler.config.include_columns + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + end + println3(handler, "column_indices = $(handler.column_indices)") +end + end # module From 6f47c1d093f7cb559447a7faa1327494c897fb1d Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 22:28:05 -0800 Subject: [PATCH 03/10] added exclude_columns option; fixed ncols attribute in the return result. 
--- src/SASLib.jl | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 5750d92..0eba206 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -15,6 +15,10 @@ struct FileFormatError <: Exception message::AbstractString end +struct ConfigError <: Exception + message::AbstractString +end + struct ReaderConfig filename::AbstractString encoding::AbstractString @@ -23,6 +27,7 @@ struct ReaderConfig convert_text::Bool convert_header_text::Bool include_columns::Vector{Union{Symbol, Int64}} + exclude_columns::Vector{Union{Symbol, Int64}} verbose_level::Int8 ReaderConfig(filename, config = Dict()) = new(filename, get(config, :encoding, default_encoding), @@ -31,6 +36,7 @@ struct ReaderConfig get(config, :convert_text, default_convert_text), get(config, :convert_header_text, default_convert_header_text), get(config, :include_columns, []), + get(config, :exclude_columns, []), get(config, :verbose_level, default_verbose_level)) end @@ -911,7 +917,7 @@ function read_chunk(handler, nrows=0) return Dict( :data => rslt, :nrows => nrows, - :ncols => nd+ns, + :ncols => length(column_symbols), :filename => handler.config.filename, :page_count => handler.current_page, :page_length => Int64(handler.page_length), @@ -1461,15 +1467,24 @@ end # fill column indices as a dictionary (key = column index, value = column symbol) function fill_column_indices(handler) handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() + inflag = length(handler.config.include_columns) > 0 + exflag = length(handler.config.exclude_columns) > 0 + inflag && exflag && throw(ConfigError("You can specify either include_columns or exclude_columns but not both.")) for j in 1:length(handler.column_symbols) name = handler.column_symbols[j] - if handler.config.include_columns == [] || - j in handler.config.include_columns || - name in handler.config.include_columns + if inflag + if j in handler.config.include_columns || name in 
handler.config.include_columns + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + elseif exflag + if !(j in handler.config.exclude_columns || name in handler.config.exclude_columns) + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + else push!(handler.column_indices, (j, name, handler.column_types[j])) end end - println3(handler, "column_indices = $(handler.column_indices)") + println2(handler, "column_indices = $(handler.column_indices)") end end # module From ab4799e771b46caa8dca69fdd337af496c4a7fda Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 11:23:43 -0800 Subject: [PATCH 04/10] converted config Dict to keyword arguments --- src/SASLib.jl | 59 ++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 0eba206..43af9e6 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -22,22 +22,13 @@ end struct ReaderConfig filename::AbstractString encoding::AbstractString - chunksize::UInt8 + chunksize::Int64 convert_dates::Bool convert_text::Bool convert_header_text::Bool - include_columns::Vector{Union{Symbol, Int64}} - exclude_columns::Vector{Union{Symbol, Int64}} - verbose_level::Int8 - ReaderConfig(filename, config = Dict()) = new(filename, - get(config, :encoding, default_encoding), - get(config, :chunksize, default_chunksize), - get(config, :convert_dates, default_convert_dates), - get(config, :convert_text, default_convert_text), - get(config, :convert_header_text, default_convert_header_text), - get(config, :include_columns, []), - get(config, :exclude_columns, []), - get(config, :verbose_level, default_verbose_level)) + include_columns::Vector + exclude_columns::Vector + verbose_level::Int64 end struct Column @@ -150,12 +141,17 @@ function open(config::ReaderConfig) end """ -Open a SAS7BDAT data file. The `config` parameter accepts the same -settings as described in `SASLib.readsas()` function. 
Returns a -handler object. +Open a SAS7BDAT data file. Returns a handler object that can be used in +the `read` function. """ -function open(fname::AbstractString, config=Dict()) - return open(ReaderConfig(fname, config)) +function open(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + return open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + default_convert_header_text, include_columns, exclude_columns, verbose_level)) end """ @@ -179,14 +175,17 @@ end """ Read a SAS7BDAT file. -* `:encoding`: character encoding for strings (default: "UTF-8") -* `:convert_text`: convert text data to strings (default: true) -* `:convert_header_text`: convert header text data to strings (default: true) """ -function readsas(filename, config=Dict()) +function readsas(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) handler = nothing try - handler = open(ReaderConfig(filename, config)) + handler = open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + default_convert_header_text, include_columns, exclude_columns, verbose_level)) # println(push!(history, handler)) t1 = time() result = read(handler) @@ -1170,10 +1169,10 @@ function process_byte_array_with_data(handler, offset, length) # println(" handler.row_length=$(handler.row_length)") if length < handler.row_length if handler.compression == rle_compression - #println("decompress using rle_compression method, length=$length, row_length=$(handler.row_length)") + # println4(handler, "decompress using rle_compression method, length=$length, row_length=$(handler.row_length)") source = rle_decompress(handler.row_length, source) 
elseif handler.compression == rdc_compression - #println("decompress using rdc_compression method, length=$length, row_length=$(handler.row_length)") + # println4(handler, "decompress using rdc_compression method, length=$length, row_length=$(handler.row_length)") source = rdc_decompress(handler.row_length, source) else throw(FileFormatError("Unknown compression method: $(handler.compression)")) @@ -1206,6 +1205,7 @@ function process_byte_array_with_data(handler, offset, length) # byte_chunk[jb, m + k] = source[start + k] # end byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] + #println4(handler, "byte_chunk[$name][$(m+1):$(m+lngt)] = source[$(start+1):$(start+lngt)] => $(source[start+1:start+lngt])") elseif ct == column_type_string string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) @@ -1438,10 +1438,11 @@ end # ---- Debugging methods ---- -# verbose printing. 1=little verbose, 2=medium verbose, 3=very verbose -@inline println1(handler::Handler, msg) = handler.config.verbose_level >= 1 && println(msg) -@inline println2(handler::Handler, msg) = handler.config.verbose_level >= 2 && println(msg) -@inline println3(handler::Handler, msg) = handler.config.verbose_level >= 3 && println(msg) +# verbose printing. 
1=little verbose, 2=medium verbose, 3=very verbose, 4=very very verbose :-) +@inline println1(handler::Handler, msg::String) = handler.config.verbose_level >= 1 && println(msg) +@inline println2(handler::Handler, msg::String) = handler.config.verbose_level >= 2 && println(msg) +@inline println3(handler::Handler, msg::String) = handler.config.verbose_level >= 3 && println(msg) +@inline println4(handler::Handler, msg::String) = handler.config.verbose_level >= 4 && println(msg) # string representation of the SubHeaderPointer structure function tostring(x::SubHeaderPointer) From ccd9302692bb923ea76dc11d8fff0f9512e7f382 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 15:07:45 -0800 Subject: [PATCH 05/10] fixed test case for open() --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 41d0628..5679a45 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using Base.Test @testset "SASLib" begin @testset "open and close" begin - handler = SASLib.open(SASLib.ReaderConfig("test1.sas7bdat")) + handler = SASLib.open("test1.sas7bdat") @test typeof(handler) == SASLib.Handler @test handler.config.filename == "test1.sas7bdat" @test SASLib.close(handler) == nothing From ab2413f14d7167ca4960a423700337041a24456c Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 15:57:22 -0800 Subject: [PATCH 06/10] updated doc strings; prefixed some internal functions with underscore --- src/SASLib.jl | 57 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 43af9e6..82ce68e 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -121,7 +121,7 @@ mutable struct Handler config) end -function open(config::ReaderConfig) +function _open(config::ReaderConfig) # println("Opening $(config.filename)") handler = Handler(config) handler.compression = b"" @@ -141,8 +141,15 @@ function 
open(config::ReaderConfig) end """ -Open a SAS7BDAT data file. Returns a handler object that can be used in -the `read` function. +open(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + +Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in +the subsequent `SASLib.read` and `SASLib.close` functions. """ function open(filename::AbstractString; encoding::AbstractString = default_encoding, @@ -150,13 +157,15 @@ function open(filename::AbstractString; include_columns::Vector = [], exclude_columns::Vector = [], verbose_level::Int64 = 1) - return open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + return _open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, default_convert_header_text, include_columns, exclude_columns, verbose_level)) end """ -Read data from the `handler`. If `nrows` is not specified, read the -entire files content. When called again, fetch the next `nrows` rows. +read(handler::Handler, nrows=0) + +Read data from the `handler` (see `SASLib.open`). If `nrows` is not specified, +read the entire file content. When called again, fetch the next `nrows` rows. """ function read(handler::Handler, nrows=0) # println("Reading $(handler.config.filename)") @@ -164,8 +173,13 @@ function read(handler::Handler, nrows=0) end """ +close(handler::Handler) + Close the `handler` object. This function effectively closes the -underlying iostream. It must be called if `open` and `read` +underlying iostream. It must be called after the program +finished reading data. + +This function is needed only when `SASLib.open` and `SASLib.read` functions are used instead of the more convenient `readsas` function. 
""" function close(handler::Handler) @@ -174,7 +188,30 @@ function close(handler::Handler) end """ +readsas(filename::AbstractString; + encoding::AbstractString = "UTF-8", + convert_dates::Bool = true, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + Read a SAS7BDAT file. + +The `encoding` argument may be used if string data does not have UTF-8 +encoding. + +If `convert_dates == false` then no conversion is made +and you will get the number of days for Date columns (or number of +seconds for DateTime columns) since 1-JAN-1960. + +By default, all columns will be read. If you only need a subset of the +columns, you may specify +either `include_columns` or `exclude_columns` but not both. They are just +arrays of columns indices or symbols e.g. [1, 2, 3] or [:employeeid, :firstname, :lastname] + +For debugging purpose, `verbose_level` may be set to a value higher than 1. +Verbose level 0 will output nothing to the console, essentially a total quiet +option. 
""" function readsas(filename::AbstractString; encoding::AbstractString = default_encoding, @@ -184,7 +221,7 @@ function readsas(filename::AbstractString; verbose_level::Int64 = 1) handler = nothing try - handler = open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + handler = _open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, default_convert_header_text, include_columns, exclude_columns, verbose_level)) # println(push!(history, handler)) t1 = time() @@ -880,7 +917,7 @@ function read_chunk(handler, nrows=0) # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - fill_column_indices(handler) + _fill_column_indices(handler) # allocate columns handler.byte_chunk = Dict() @@ -1466,7 +1503,7 @@ function currentpos(handler) end # fill column indices as a dictionary (key = column index, value = column symbol) -function fill_column_indices(handler) +function _fill_column_indices(handler) handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() inflag = length(handler.config.include_columns) > 0 exflag = length(handler.config.exclude_columns) > 0 From 232258de54245a8e96c2fa50deafd1406bbf5d91 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:23:41 -0800 Subject: [PATCH 07/10] updated readme, perf test script, and new jl/py test results --- README.md | 117 ++++++++++++++++--------- test/perf_results_0.3.0/py_jl_test1.md | 62 +++++++++++++ test/perf_results_0.3.0/py_jl_test2.md | 60 +++++++++++++ test/perf_results_0.3.0/py_jl_test3.md | 60 +++++++++++++ test/perf_test1.jl | 2 +- 5 files changed, 257 insertions(+), 44 deletions(-) create mode 100644 test/perf_results_0.3.0/py_jl_test1.md create mode 100644 test/perf_results_0.3.0/py_jl_test2.md create mode 100644 test/perf_results_0.3.0/py_jl_test3.md diff --git a/README.md b/README.md index 0347c03..1c26d4b 100644 --- a/README.md +++ b/README.md @@ -20,25 +20,25 @@ Use the 
`readsas` function to read the file. The result is a dictionary of vari ```julia julia> using SASLib -julia> x = readsas("test1.sas7bdat") -Read data set of size 10 x 100 in 0.019 seconds +julia> x = readsas("productsales.sas7bdat") +Read data set of size 1440 x 10 in 2.0 seconds Dict{Symbol,Any} with 16 entries: - :filename => "test1.sas7bdat" - :page_length => 65536 - :file_encoding => "wlatin1" + :filename => "productsales.sas7bdat" + :page_length => 8192 + :file_encoding => "US-ASCII" :system_endianness => :LittleEndian - :ncols => 100 - :column_types => DataType[Float64, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64 … Float64, Float64… - :data => Dict{Any,Any}(Pair{Any,Any}(:Column60, [2987.0, 8194.0, 9820.0, 8252.0, 9640.0, 9168.0, 7547.0, 1419.0, 4884.0, NaN])… - :perf_type_conversion => 0.0052096 - :page_count => 1 - :column_names => String["Column60", "Column42", "Column68", "Column35", "Column33", "Column1", "Column41", "Column16", "Column72", "Co… - :column_symbols => Symbol[:Column60, :Column42, :Column68, :Column35, :Column33, :Column1, :Column41, :Column16, :Column72, :Column19 …… - :column_lengths => [8, 9, 8, 8, 8, 9, 8, 8, 8, 9 … 8, 8, 8, 5, 8, 8, 8, 9, 8, 8] + :ncols => 10 + :column_types => Type[Float64, Float64, Union{AbstractString, Missings.Missing}, Union{AbstractString, Missings.Missing}, Union{AbstractString,… + :data => Dict{Any,Any}(Pair{Any,Any}(:QUARTER, [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 … 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0,… + :perf_type_conversion => 0.0262305 + :page_count => 18 + :column_names => String["QUARTER", "YEAR", "COUNTRY", "DIVISION", "REGION", "MONTH", "PREDICT", "ACTUAL", "PRODTYPE", "PRODUCT"] + :column_symbols => Symbol[:QUARTER, :YEAR, :COUNTRY, :DIVISION, :REGION, :MONTH, :PREDICT, :ACTUAL, :PRODTYPE, :PRODUCT] + :column_lengths => [8, 8, 10, 10, 10, 10, 10, 8, 8, 8] :file_endianness => :LittleEndian - :nrows => 10 - :perf_read_data => 0.00612195 - :column_offsets => 
[0, 600, 8, 16, 24, 609, 32, 40, 48, 618 … 536, 544, 552, 795, 560, 568, 576, 800, 584, 592] + :nrows => 1440 + :perf_read_data => 0.00639309 + :column_offsets => [0, 8, 40, 50, 60, 70, 80, 16, 24, 32] ``` Number of columns and rows are returned as in `:ncols` and `:nrows` respectively. @@ -46,18 +46,22 @@ Number of columns and rows are returned as in `:ncols` and `:nrows` respectively The data, reference by `:data` key, is represented as a Dict object with the column symbol as the key. ```juia -julia> x[:data][:Column1] -10-element Array{Float64,1}: - 0.636 - 0.283 - 0.452 - 0.557 - 0.138 - 0.948 - 0.162 - 0.148 - NaN - 0.663 +julia> x[:data][:ACTUAL] +1440-element Array{Float64,1}: + 925.0 + 999.0 + 608.0 + 642.0 + 656.0 + 948.0 + 612.0 + 114.0 + 685.0 + 657.0 + 608.0 + 353.0 + 107.0 + ⋮ ``` If you really like DataFrame, you can easily convert as such: @@ -67,26 +71,53 @@ julia> using DataFrames julia> df = DataFrame(x[:data]); -julia> df[:, 1:5] -10×5 DataFrames.DataFrame -│ Row │ Column1 │ Column10 │ Column100 │ Column11 │ Column12 │ -├─────┼─────────┼─────────────┼───────────┼──────────┼────────────┤ -│ 1 │ 0.636 │ "apple" │ 3230.0 │ NaN │ 1986-07-20 │ -│ 2 │ 0.283 │ "apple" │ 4904.0 │ 22.0 │ 1983-07-15 │ -│ 3 │ 0.452 │ "apple" │ NaN │ 7.0 │ 1973-11-27 │ -│ 4 │ 0.557 │ "dog" │ 8566.0 │ 26.0 │ 1967-01-20 │ -│ 5 │ 0.138 │ "crocodile" │ 894.0 │ 11.0 │ 1970-11-29 │ -│ 6 │ 0.948 │ "crocodile" │ 6088.0 │ 27.0 │ 1963-01-09 │ -│ 7 │ 0.162 │ "" │ 6122.0 │ NaN │ 1979-10-18 │ -│ 8 │ 0.148 │ "crocodile" │ 2570.0 │ 5.0 │ 1961-03-15 │ -│ 9 │ NaN │ "pear" │ 2709.0 │ 12.0 │ 1964-06-15 │ -│ 10 │ 0.663 │ "pear" │ NaN │ 16.0 │ 1985-01-28 │ +julia> head(df, 5) +5×10 DataFrames.DataFrame +│ Row │ ACTUAL │ COUNTRY │ DIVISION │ MONTH │ PREDICT │ PRODTYPE │ PRODUCT │ QUARTER │ REGION │ YEAR │ +├─────┼────────┼─────────┼───────────┼────────────┼─────────┼───────────┼─────────┼─────────┼────────┼────────┤ +│ 1 │ 925.0 │ CANADA │ EDUCATION │ 1993-01-01 │ 850.0 │ FURNITURE │ SOFA │ 1.0 │ 
EAST │ 1993.0 │ +│ 2 │ 999.0 │ CANADA │ EDUCATION │ 1993-02-01 │ 297.0 │ FURNITURE │ SOFA │ 1.0 │ EAST │ 1993.0 │ +│ 3 │ 608.0 │ CANADA │ EDUCATION │ 1993-03-01 │ 846.0 │ FURNITURE │ SOFA │ 1.0 │ EAST │ 1993.0 │ +│ 4 │ 642.0 │ CANADA │ EDUCATION │ 1993-04-01 │ 533.0 │ FURNITURE │ SOFA │ 2.0 │ EAST │ 1993.0 │ +│ 5 │ 656.0 │ CANADA │ EDUCATION │ 1993-05-01 │ 646.0 │ FURNITURE │ SOFA │ 2.0 │ EAST │ 1993.0 │ +``` + +If you only need to read few columns, just pass an `include_columns` argument: + +``` +julia> head(DataFrame(readsas("productsales.sas7bdat", include_columns=[:YEAR, :MONTH, :PRODUCT, :ACTUAL])[:data])) +Read data set of size 1440 x 4 in 0.004 seconds +6×4 DataFrames.DataFrame +│ Row │ ACTUAL │ MONTH │ PRODUCT │ YEAR │ +├─────┼────────┼────────────┼─────────┼────────┤ +│ 1 │ 925.0 │ 1993-01-01 │ SOFA │ 1993.0 │ +│ 2 │ 999.0 │ 1993-02-01 │ SOFA │ 1993.0 │ +│ 3 │ 608.0 │ 1993-03-01 │ SOFA │ 1993.0 │ +│ 4 │ 642.0 │ 1993-04-01 │ SOFA │ 1993.0 │ +│ 5 │ 656.0 │ 1993-05-01 │ SOFA │ 1993.0 │ +│ 6 │ 948.0 │ 1993-06-01 │ SOFA │ 1993.0 │ +``` + +Likewise, you can read all columns except the ones you don't want as specified in `exclude_columns` argument: + +``` +julia> head(DataFrame(readsas("productsales.sas7bdat", exclude_columns=[:YEAR, :MONTH, :PRODUCT, :ACTUAL])[:data])) +Read data set of size 1440 x 6 in 0.031 seconds +6×6 DataFrames.DataFrame +│ Row │ COUNTRY │ DIVISION │ PREDICT │ PRODTYPE │ QUARTER │ REGION │ +├─────┼─────────┼───────────┼─────────┼───────────┼─────────┼────────┤ +│ 1 │ CANADA │ EDUCATION │ 850.0 │ FURNITURE │ 1.0 │ EAST │ +│ 2 │ CANADA │ EDUCATION │ 297.0 │ FURNITURE │ 1.0 │ EAST │ +│ 3 │ CANADA │ EDUCATION │ 846.0 │ FURNITURE │ 1.0 │ EAST │ +│ 4 │ CANADA │ EDUCATION │ 533.0 │ FURNITURE │ 2.0 │ EAST │ +│ 5 │ CANADA │ EDUCATION │ 646.0 │ FURNITURE │ 2.0 │ EAST │ +│ 6 │ CANADA │ EDUCATION │ 486.0 │ FURNITURE │ 2.0 │ EAST │ ``` If you need to read files incrementally: ```julia -handler = SASLib.open("test1.sas7bdat") +handler = 
SASLib.open("productsales.sas7bdat") results = SASLib.read(handler, 3) # read 3 rows results = SASLib.read(handler, 4) # read next 4 rows SASLib.close(handler) # remember to close the handler when done diff --git a/test/perf_results_0.3.0/py_jl_test1.md b/test/perf_results_0.3.0/py_jl_test1.md new file mode 100644 index 0000000..f4b43ed --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test1.md @@ -0,0 +1,62 @@ +# Performance Test 1 + +## Summary + +SASLib is ~4.3x faster than Pandas. + +## Test File + +Filename|Rows|Columns|Numeric Columns|String Columns +--------|----|-------|---------------|-------------- +numeric_1000000_2.sas7bdat|1,000,000|2|2|0 + +## Test Environment + +Test system information: +``` +julia> versioninfo() +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py numeric_1000000_2.sas7bdat +1: elapsed 1.976702 seconds +2: elapsed 1.984404 seconds +3: elapsed 2.266284 seconds +4: elapsed 1.978403 seconds +5: elapsed 1.946053 seconds +6: elapsed 1.919336 seconds +7: elapsed 1.918322 seconds +8: elapsed 1.926547 seconds +9: elapsed 1.962013 seconds +10: elapsed 1.939654 seconds +Average: 1.9818 seconds +``` + +## Julia +``` +$ julia perf_test1.jl numeric_1000000_2.sas7bdat +Loaded library in 0.343 seconds +Bootstrap elapsed 4.211 seconds +Elapsed 0.481 seconds +Elapsed 0.462 seconds +Elapsed 0.414 seconds +Elapsed 0.480 seconds +Elapsed 0.473 seconds +Elapsed 0.472 seconds +Elapsed 0.473 seconds +Elapsed 0.479 seconds +Elapsed 0.401 seconds +Elapsed 0.463 seconds +Average: 0.4598392924 seconds +``` diff --git a/test/perf_results_0.3.0/py_jl_test2.md b/test/perf_results_0.3.0/py_jl_test2.md new file mode 100644 index 
0000000..495ffdd --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test2.md @@ -0,0 +1,60 @@ +# Performance Test 2 + +## Summary + +SASLib is 16.9x faster than Pandas. + +## Test File + +Filename |Rows|Columns|Numeric Columns|String Columns +--------------|----|-------|---------------|-------------- +test1.sas7bdat|10 |100 |73 |27 + +## Test Environment + +``` +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py test1.sas7bdat +1: elapsed 0.099821 seconds +2: elapsed 0.116454 seconds +3: elapsed 0.095141 seconds +4: elapsed 0.100083 seconds +5: elapsed 0.100060 seconds +6: elapsed 0.098249 seconds +7: elapsed 0.101819 seconds +8: elapsed 0.099673 seconds +9: elapsed 0.096865 seconds +10: elapsed 0.109412 seconds +Average: 0.1018 seconds +``` + +## Julia +``` +$ julia perf_test1.jl test1.sas7bdat +Loaded library in 0.326 seconds +Bootstrap elapsed 3.606 seconds +Elapsed 0.011 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.010 seconds +Elapsed 0.013 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Average: 0.0060341937 seconds +``` diff --git a/test/perf_results_0.3.0/py_jl_test3.md b/test/perf_results_0.3.0/py_jl_test3.md new file mode 100644 index 0000000..641fa93 --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test3.md @@ -0,0 +1,60 @@ +# Performance Test 3 + +## Summary + +SASLib is 5.2x faster than Pandas. 
+ +## Test File + +Filename |Rows |Columns|Numeric Columns|String Columns +---------------------|------|-------|---------------|-------------- +productsales.sas7bdat|1440 |10 |4 |6 + +## Test Environment + +``` +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py productsales.sas7bdat +1: elapsed 0.035160 seconds +2: elapsed 0.031523 seconds +3: elapsed 0.041026 seconds +4: elapsed 0.033476 seconds +5: elapsed 0.045547 seconds +6: elapsed 0.030253 seconds +7: elapsed 0.038022 seconds +8: elapsed 0.032196 seconds +9: elapsed 0.046579 seconds +10: elapsed 0.033603 seconds +Average: 0.0367 seconds +``` + +## Julia +``` +$ julia perf_test1.jl productsales.sas7bdat +Loaded library in 0.328 seconds +Bootstrap elapsed 3.613 seconds +Elapsed 0.013 seconds +Elapsed 0.005 seconds +Elapsed 0.005 seconds +Elapsed 0.004 seconds +Elapsed 0.007 seconds +Elapsed 0.008 seconds +Elapsed 0.007 seconds +Elapsed 0.011 seconds +Elapsed 0.007 seconds +Elapsed 0.005 seconds +Average: 0.0071251584000000005 seconds +``` diff --git a/test/perf_test1.jl b/test/perf_test1.jl index 854b7f2..bce738a 100644 --- a/test/perf_test1.jl +++ b/test/perf_test1.jl @@ -25,4 +25,4 @@ function perf(f, n) println("Average: $(total / n) seconds") end -perf(() -> readsas(ARGS[1], Dict(:verbose_level => 0)), 10) +perf(() -> readsas(ARGS[1], verbose_level = 0), 10) From fa8840bec00bdd98df8236269082c9a0bc556a88 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:31:34 -0800 Subject: [PATCH 08/10] added perf result for half reads --- test/perf_results_0.3.0/half_columns.md | 47 +++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 
test/perf_results_0.3.0/half_columns.md diff --git a/test/perf_results_0.3.0/half_columns.md b/test/perf_results_0.3.0/half_columns.md new file mode 100644 index 0000000..7a5c76e --- /dev/null +++ b/test/perf_results_0.3.0/half_columns.md @@ -0,0 +1,47 @@ +# Read performance when reading only half of the data + +## Results + +Read time is reduced by 40% when reading half of the data. + +## Test Scenario + +This test file has just 2 numeric columns. We would like to know the performance +of reading only 1 column from this file. + +Filename|Rows|Columns|Numeric Columns|String Columns +--------|----|-------|---------------|-------------- +numeric_1000000_2.sas7bdat|1,000,000|2|2|0 + +## Test Log + +``` +julia> @benchmark readsas("numeric_1000000_2.sas7bdat", verbose_level=0) +BenchmarkTools.Trial: + memory estimate: 399.04 MiB + allocs estimate: 3031083 + -------------- + minimum time: 358.695 ms (9.31% GC) + median time: 442.709 ms (25.96% GC) + mean time: 427.870 ms (20.97% GC) + maximum time: 482.786 ms (25.29% GC) + -------------- + samples: 12 + evals/sample: 1 + +julia> @benchmark readsas("numeric_1000000_2.sas7bdat", include_columns=[:f], verbose_level=0) +BenchmarkTools.Trial: + memory estimate: 261.71 MiB + allocs estimate: 2031028 + -------------- + minimum time: 222.832 ms (9.67% GC) + median time: 235.396 ms (9.70% GC) + mean time: 261.782 ms (20.75% GC) + maximum time: 327.359 ms (33.53% GC) + -------------- + samples: 20 + evals/sample: 1 + +julia> 262/428 +0.6121495327102804 +``` From 53e046652a6b2f9a10b8f887634fc0432ad8ac89 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:48:40 -0800 Subject: [PATCH 09/10] put back @inbounds for now... 
will have to refactor later for performance tuning --- src/SASLib.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 82ce68e..a622834 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -1241,10 +1241,10 @@ function process_byte_array_with_data(handler, offset, length) # for k in 1:lngt # byte_chunk[jb, m + k] = source[start + k] # end - byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] + @inbounds byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] #println4(handler, "byte_chunk[$name][$(m+1):$(m+lngt)] = source[$(start+1):$(start+lngt)] => $(source[start+1:start+lngt])") elseif ct == column_type_string - string_chunk[name][current_row+1] = + @inbounds string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) end end From 71c1ad1c68757fcff2a93e8d712c8e60a00b656f Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 18:12:16 -0800 Subject: [PATCH 10/10] added unit tests for include/exclude column feature --- test/runtests.jl | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5679a45..656ae0c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ using Base.Test end end - @testset "manual" begin + @testset "incremental read" begin fname = "test1.sas7bdat" # 10 rows handler = SASLib.open(fname) @test handler.config.filename == fname @@ -31,7 +31,7 @@ using Base.Test @test result[:nrows] == 3 end - @testset "numeric" begin + @testset "various data types" begin result = readsas("test1.sas7bdat") df = result[:data] @test sum(df[:Column1][1:5]) == 2.066 @@ -40,7 +40,7 @@ using Base.Test @test df[:Column4][1:3] == [Date("1965-12-10"), Date("1977-03-07"), Date("1983-08-15")] end - @testset "datetime" begin + @testset "datetime with missing values" begin result = readsas("datetime.sas7bdat") df = result[:data] @test (result[:nrows], result[:ncols]) == (5, 4) 
@@ -50,6 +50,28 @@ using Base.Test @test count(ismissing, result[:data][:dt]) == 3 end + @testset "include/exclude columns" begin + result = readsas("productsales.sas7bdat", include_columns=[:MONTH, :YEAR]) + @test result[:ncols] == 2 + @test sort(result[:column_symbols]) == sort([:MONTH, :YEAR]) + + result = readsas("productsales.sas7bdat", include_columns=[1, 2, 7]) + @test result[:ncols] == 3 + @test sort(result[:column_symbols]) == sort([:ACTUAL, :PREDICT, :PRODUCT]) + + result = readsas("productsales.sas7bdat", exclude_columns=[:DIVISION]) + @test result[:ncols] == 9 + @test !(:DIVISION in result[:column_symbols]) + + result = readsas("productsales.sas7bdat", exclude_columns=collect(2:10)) + @test result[:ncols] == 1 + @test sort(result[:column_symbols]) == sort([:ACTUAL]) + + # error handling + @test_throws SASLib.ConfigError readsas("productsales.sas7bdat", + include_columns=[1], exclude_columns=[1]) + end + @testset "misc" begin result = readsas("productsales.sas7bdat") df = result[:data]