From be01817df3a2cd00868cae49314a2bbdb2300c88 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 17:02:59 -0800 Subject: [PATCH 01/10] converted intermediate string_chunk/byte_chunk variables to Dict from 2-d array so we can reference them by name rather than by index --- src/SASLib.jl | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 48b72f4..1ff9ee4 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -106,8 +106,8 @@ mutable struct Handler column_count::Int64 # creator_proc::Union{Void, Vector{UInt8}} - byte_chunk::Array{UInt8, 2} - string_chunk::Array{String, 2} + byte_chunk::Dict{Symbol, Vector{UInt8}} + string_chunk::Dict{Symbol, Vector{Union{Missing, AbstractString}}} current_row_in_chunk_index::Int64 current_page::Int64 @@ -866,8 +866,20 @@ function read_chunk(handler, nrows=0) # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - handler.string_chunk = fill("", (Int64(ns), Int64(nrows))) - handler.byte_chunk = fill(UInt8(0), (Int64(nd), Int64(8 * nrows))) # 8-byte values + + # allocate column space + handler.byte_chunk = Dict() + handler.string_chunk = Dict() + for j in 1:nd+ns + name = Symbol(handler.column_names[j]) + if handler.column_types[j] == column_type_decimal + handler.byte_chunk[name] = fill(UInt8(0), Int64(8 * nrows)) # 8-byte values + elseif handler.column_types[j] == column_type_string + handler.string_chunk[name] = fill(missing, Int64(nrows)) + else + throw(FileFormatError("unknown column type: $(handler.column_types[j])")) + end + end # don't do this or else the state is polluted if user wants to # read lines separately. 
@@ -973,7 +985,7 @@ function _chunk_to_dataframe(handler) m = handler.current_row_in_file_index rslt = Dict() - js, jb = 1, 1 + # js, jb = 1, 1 # println("handler.column_names=$(handler.column_names)") for j in 1:handler.column_count @@ -982,7 +994,7 @@ function _chunk_to_dataframe(handler) if handler.column_types[j] == column_type_decimal # number, date, or datetime # println(" String: size=$(size(handler.byte_chunk))") # println(" Decimal: column $j, name $name, size=$(size(handler.byte_chunk[jb, :]))") - bytes = handler.byte_chunk[jb, :] + bytes = handler.byte_chunk[name] #if j == 1 && length(bytes) < 100 #debug only # println(" bytes=$bytes") #end @@ -999,12 +1011,12 @@ function _chunk_to_dataframe(handler) rslt[name] = datetime_from_float(rslt[name]) end end - jb += 1 + # jb += 1 elseif handler.column_types[j] == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") - rslt[name] = handler.string_chunk[js, :] - js += 1 + rslt[name] = handler.string_chunk[name] + # js += 1 else throw(FileFormatError("Unknown column type $(handler.column_types[j])")) end @@ -1177,6 +1189,7 @@ function process_byte_array_with_data(handler, offset, length) # println(" handler.file_endianness = $(handler.file_endianness)") for j in 1:handler.column_count + name = Symbol(handler.column_names[j]) lngt = lengths[j] # TODO commented out for perf reason. do we need this? 
# if lngt == 0 @@ -1201,10 +1214,10 @@ function process_byte_array_with_data(handler, offset, length) # for k in 1:lngt # byte_chunk[jb, m + k] = source[start + k] # end - @inbounds byte_chunk[jb, m+1:m+lngt] = source[start+1:start+lngt] + byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] jb += 1 elseif ct == column_type_string - @inbounds string_chunk[js, current_row+1] = + string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) #rstrip(decode(source[start+1:(start+lngt)], handler.config.encoding)) js += 1 From 22c485a34ac32d943d5fcace81d0cf551bce3a86 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 22:07:01 -0800 Subject: [PATCH 02/10] implemented include_columns config option --- src/SASLib.jl | 88 ++++++++++++++++++++++++--------------------------- 1 file changed, 42 insertions(+), 46 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 1ff9ee4..5750d92 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -22,6 +22,7 @@ struct ReaderConfig convert_dates::Bool convert_text::Bool convert_header_text::Bool + include_columns::Vector{Union{Symbol, Int64}} verbose_level::Int8 ReaderConfig(filename, config = Dict()) = new(filename, get(config, :encoding, default_encoding), @@ -29,6 +30,7 @@ struct ReaderConfig get(config, :convert_dates, default_convert_dates), get(config, :convert_text, default_convert_text), get(config, :convert_header_text, default_convert_header_text), + get(config, :include_columns, []), get(config, :verbose_level, default_verbose_level)) end @@ -56,9 +58,14 @@ mutable struct Handler compression::Vector{UInt8} column_names_strings::Vector{Vector{UInt8}} column_names::Vector{AbstractString} + column_symbols::Vector{Symbol} column_types::Vector{UInt8} column_formats::Vector{AbstractString} columns::Vector{Column} + + # column indices being read/returned + # tuple of column index, column symbol, column type + column_indices::Vector{Tuple{Int64, Symbol, UInt8}} 
current_page_data_subheader_pointers::Vector{SubHeaderPointer} cached_page::Vector{UInt8} @@ -123,6 +130,7 @@ function open(config::ReaderConfig) handler.compression = b"" handler.column_names_strings = [] handler.column_names = [] + handler.column_symbols = [] handler.columns = [] handler.column_formats = [] handler.current_page_data_subheader_pointers = [] @@ -725,6 +733,7 @@ function _process_columnname_subheader(handler, offset, length) # name = decode(name, handler.config.encoding) # end push!(handler.column_names, name) + push!(handler.column_symbols, Symbol(name)) println2(handler, " i=$i name=$name") end end @@ -863,21 +872,21 @@ function read_chunk(handler, nrows=0) # TODO not the most efficient but normally it should be ok for non-wide tables nd = count(x -> x == column_type_decimal, handler.column_types) ns = count(x -> x == column_type_string, handler.column_types) - # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - # allocate column space + fill_column_indices(handler) + + # allocate columns handler.byte_chunk = Dict() handler.string_chunk = Dict() - for j in 1:nd+ns - name = Symbol(handler.column_names[j]) - if handler.column_types[j] == column_type_decimal + for (k, name, ty) in handler.column_indices + if ty == column_type_decimal handler.byte_chunk[name] = fill(UInt8(0), Int64(8 * nrows)) # 8-byte values - elseif handler.column_types[j] == column_type_string + elseif ty == column_type_string handler.string_chunk[name] = fill(missing, Int64(nrows)) else - throw(FileFormatError("unknown column type: $(handler.column_types[j])")) + throw(FileFormatError("unknown column type: $ty for column $name")) end end @@ -894,6 +903,8 @@ function read_chunk(handler, nrows=0) rslt = _chunk_to_dataframe(handler) perf_chunk_to_data_frame = toq() + # construct column symbols/names from actual results since we may have + # read fewer columns than what's in the file column_symbols = [col for col in keys(rslt)] 
column_names = String.(column_symbols) @@ -985,13 +996,9 @@ function _chunk_to_dataframe(handler) m = handler.current_row_in_file_index rslt = Dict() - # js, jb = 1, 1 # println("handler.column_names=$(handler.column_names)") - for j in 1:handler.column_count - - name = Symbol(handler.column_names[j]) - - if handler.column_types[j] == column_type_decimal # number, date, or datetime + for (k, name, ty) in handler.column_indices + if ty == column_type_decimal # number, date, or datetime # println(" String: size=$(size(handler.byte_chunk))") # println(" Decimal: column $j, name $name, size=$(size(handler.byte_chunk[jb, :]))") bytes = handler.byte_chunk[name] @@ -1004,19 +1011,17 @@ function _chunk_to_dataframe(handler) #rslt[name] = bswap(rslt[name]) rslt[name] = values if handler.config.convert_dates - if handler.column_formats[j] in sas_date_formats + if handler.column_formats[k] in sas_date_formats rslt[name] = date_from_float(rslt[name]) - elseif handler.column_formats[j] in sas_datetime_formats + elseif handler.column_formats[k] in sas_datetime_formats # TODO probably have to deal with timezone somehow rslt[name] = datetime_from_float(rslt[name]) end end - # jb += 1 - elseif handler.column_types[j] == column_type_string + elseif ty == column_type_string # println(" String: size=$(size(handler.string_chunk))") # println(" String: column $j, name $name, size=$(size(handler.string_chunk[js, :]))") rslt[name] = handler.string_chunk[name] - # js += 1 else throw(FileFormatError("Unknown column type $(handler.column_types[j])")) end @@ -1176,31 +1181,11 @@ function process_byte_array_with_data(handler, offset, length) byte_chunk = handler.byte_chunk string_chunk = handler.string_chunk s = 8 * current_row - js = 1 - jb = 1 - - # if current_row == 1 - # println(" current_row = $current_row") - # println(" column_types = $column_types") - # println(" lengths = $lengths") - # println(" offsets = $offsets") - # end - # println(" s = $s") - # println(" handler.file_endianness 
= $(handler.file_endianness)") - - for j in 1:handler.column_count - name = Symbol(handler.column_names[j]) - lngt = lengths[j] - # TODO commented out for perf reason. do we need this? - # if lngt == 0 - # break - # end - #if j == 1 - # println(" lngt = $lngt") - #end - #println(lngt) - start = offsets[j] - ct = column_types[j] + + for (k, name, ty) in handler.column_indices + lngt = lengths[k] + start = offsets[k] + ct = column_types[k] if ct == column_type_decimal # The data may have 3,4,5,6,7, or 8 bytes (lngt) # and we need to copy into an 8-byte destination. @@ -1215,12 +1200,9 @@ function process_byte_array_with_data(handler, offset, length) # byte_chunk[jb, m + k] = source[start + k] # end byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] - jb += 1 elseif ct == column_type_string string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) - #rstrip(decode(source[start+1:(start+lngt)], handler.config.encoding)) - js += 1 end end @@ -1476,4 +1458,18 @@ function currentpos(handler) return d end +# fill column indices as a dictionary (key = column index, value = column symbol) +function fill_column_indices(handler) + handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() + for j in 1:length(handler.column_symbols) + name = handler.column_symbols[j] + if handler.config.include_columns == [] || + j in handler.config.include_columns || + name in handler.config.include_columns + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + end + println3(handler, "column_indices = $(handler.column_indices)") +end + end # module From 6f47c1d093f7cb559447a7faa1327494c897fb1d Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Thu, 28 Dec 2017 22:28:05 -0800 Subject: [PATCH 03/10] added exclude_columns option; fixed ncols attribute in the return result. 
--- src/SASLib.jl | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 5750d92..0eba206 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -15,6 +15,10 @@ struct FileFormatError <: Exception message::AbstractString end +struct ConfigError <: Exception + message::AbstractString +end + struct ReaderConfig filename::AbstractString encoding::AbstractString @@ -23,6 +27,7 @@ struct ReaderConfig convert_text::Bool convert_header_text::Bool include_columns::Vector{Union{Symbol, Int64}} + exclude_columns::Vector{Union{Symbol, Int64}} verbose_level::Int8 ReaderConfig(filename, config = Dict()) = new(filename, get(config, :encoding, default_encoding), @@ -31,6 +36,7 @@ struct ReaderConfig get(config, :convert_text, default_convert_text), get(config, :convert_header_text, default_convert_header_text), get(config, :include_columns, []), + get(config, :exclude_columns, []), get(config, :verbose_level, default_verbose_level)) end @@ -911,7 +917,7 @@ function read_chunk(handler, nrows=0) return Dict( :data => rslt, :nrows => nrows, - :ncols => nd+ns, + :ncols => length(column_symbols), :filename => handler.config.filename, :page_count => handler.current_page, :page_length => Int64(handler.page_length), @@ -1461,15 +1467,24 @@ end # fill column indices as a dictionary (key = column index, value = column symbol) function fill_column_indices(handler) handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() + inflag = length(handler.config.include_columns) > 0 + exflag = length(handler.config.exclude_columns) > 0 + inflag && exflag && throw(ConfigError("You can specify either include_columns or exclude_columns but not both.")) for j in 1:length(handler.column_symbols) name = handler.column_symbols[j] - if handler.config.include_columns == [] || - j in handler.config.include_columns || - name in handler.config.include_columns + if inflag + if j in handler.config.include_columns || name in 
handler.config.include_columns + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + elseif exflag + if !(j in handler.config.exclude_columns || name in handler.config.exclude_columns) + push!(handler.column_indices, (j, name, handler.column_types[j])) + end + else push!(handler.column_indices, (j, name, handler.column_types[j])) end end - println3(handler, "column_indices = $(handler.column_indices)") + println2(handler, "column_indices = $(handler.column_indices)") end end # module From ab4799e771b46caa8dca69fdd337af496c4a7fda Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 11:23:43 -0800 Subject: [PATCH 04/10] converted config Dict to keyword arguments --- src/SASLib.jl | 59 ++++++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 0eba206..43af9e6 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -22,22 +22,13 @@ end struct ReaderConfig filename::AbstractString encoding::AbstractString - chunksize::UInt8 + chunksize::Int64 convert_dates::Bool convert_text::Bool convert_header_text::Bool - include_columns::Vector{Union{Symbol, Int64}} - exclude_columns::Vector{Union{Symbol, Int64}} - verbose_level::Int8 - ReaderConfig(filename, config = Dict()) = new(filename, - get(config, :encoding, default_encoding), - get(config, :chunksize, default_chunksize), - get(config, :convert_dates, default_convert_dates), - get(config, :convert_text, default_convert_text), - get(config, :convert_header_text, default_convert_header_text), - get(config, :include_columns, []), - get(config, :exclude_columns, []), - get(config, :verbose_level, default_verbose_level)) + include_columns::Vector + exclude_columns::Vector + verbose_level::Int64 end struct Column @@ -150,12 +141,17 @@ function open(config::ReaderConfig) end """ -Open a SAS7BDAT data file. The `config` parameter accepts the same -settings as described in `SASLib.readsas()` function. 
Returns a -handler object. +Open a SAS7BDAT data file. Returns a handler object that can be used in +the `read` function. """ -function open(fname::AbstractString, config=Dict()) - return open(ReaderConfig(fname, config)) +function open(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + return open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + default_convert_header_text, include_columns, exclude_columns, verbose_level)) end """ @@ -179,14 +175,17 @@ end """ Read a SAS7BDAT file. -* `:encoding`: character encoding for strings (default: "UTF-8") -* `:convert_text`: convert text data to strings (default: true) -* `:convert_header_text`: convert header text data to strings (default: true) """ -function readsas(filename, config=Dict()) +function readsas(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) handler = nothing try - handler = open(ReaderConfig(filename, config)) + handler = open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + default_convert_header_text, include_columns, exclude_columns, verbose_level)) # println(push!(history, handler)) t1 = time() result = read(handler) @@ -1170,10 +1169,10 @@ function process_byte_array_with_data(handler, offset, length) # println(" handler.row_length=$(handler.row_length)") if length < handler.row_length if handler.compression == rle_compression - #println("decompress using rle_compression method, length=$length, row_length=$(handler.row_length)") + # println4(handler, "decompress using rle_compression method, length=$length, row_length=$(handler.row_length)") source = rle_decompress(handler.row_length, source) 
elseif handler.compression == rdc_compression - #println("decompress using rdc_compression method, length=$length, row_length=$(handler.row_length)") + # println4(handler, "decompress using rdc_compression method, length=$length, row_length=$(handler.row_length)") source = rdc_decompress(handler.row_length, source) else throw(FileFormatError("Unknown compression method: $(handler.compression)")) @@ -1206,6 +1205,7 @@ function process_byte_array_with_data(handler, offset, length) # byte_chunk[jb, m + k] = source[start + k] # end byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] + #println4(handler, "byte_chunk[$name][$(m+1):$(m+lngt)] = source[$(start+1):$(start+lngt)] => $(source[start+1:start+lngt])") elseif ct == column_type_string string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) @@ -1438,10 +1438,11 @@ end # ---- Debugging methods ---- -# verbose printing. 1=little verbose, 2=medium verbose, 3=very verbose -@inline println1(handler::Handler, msg) = handler.config.verbose_level >= 1 && println(msg) -@inline println2(handler::Handler, msg) = handler.config.verbose_level >= 2 && println(msg) -@inline println3(handler::Handler, msg) = handler.config.verbose_level >= 3 && println(msg) +# verbose printing. 
1=little verbose, 2=medium verbose, 3=very verbose, 4=very very verbose :-) +@inline println1(handler::Handler, msg::String) = handler.config.verbose_level >= 1 && println(msg) +@inline println2(handler::Handler, msg::String) = handler.config.verbose_level >= 2 && println(msg) +@inline println3(handler::Handler, msg::String) = handler.config.verbose_level >= 3 && println(msg) +@inline println4(handler::Handler, msg::String) = handler.config.verbose_level >= 4 && println(msg) # string representation of the SubHeaderPointer structure function tostring(x::SubHeaderPointer) From ccd9302692bb923ea76dc11d8fff0f9512e7f382 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 15:07:45 -0800 Subject: [PATCH 05/10] fixed test case for open() --- test/runtests.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/runtests.jl b/test/runtests.jl index 41d0628..5679a45 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -4,7 +4,7 @@ using Base.Test @testset "SASLib" begin @testset "open and close" begin - handler = SASLib.open(SASLib.ReaderConfig("test1.sas7bdat")) + handler = SASLib.open("test1.sas7bdat") @test typeof(handler) == SASLib.Handler @test handler.config.filename == "test1.sas7bdat" @test SASLib.close(handler) == nothing From ab2413f14d7167ca4960a423700337041a24456c Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 15:57:22 -0800 Subject: [PATCH 06/10] updated doc strings; prefixed some internal functions with underscore --- src/SASLib.jl | 57 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 10 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 43af9e6..82ce68e 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -121,7 +121,7 @@ mutable struct Handler config) end -function open(config::ReaderConfig) +function _open(config::ReaderConfig) # println("Opening $(config.filename)") handler = Handler(config) handler.compression = b"" @@ -141,8 +141,15 @@ function 
open(config::ReaderConfig) end """ -Open a SAS7BDAT data file. Returns a handler object that can be used in -the `read` function. +open(filename::AbstractString; + encoding::AbstractString = default_encoding, + convert_dates::Bool = default_convert_dates, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + +Open a SAS7BDAT data file. Returns a `SASLib.Handler` object that can be used in +the subsequent `SASLib.read` and `SASLib.close` functions. """ function open(filename::AbstractString; encoding::AbstractString = default_encoding, @@ -150,13 +157,15 @@ function open(filename::AbstractString; include_columns::Vector = [], exclude_columns::Vector = [], verbose_level::Int64 = 1) - return open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + return _open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, default_convert_header_text, include_columns, exclude_columns, verbose_level)) end """ -Read data from the `handler`. If `nrows` is not specified, read the -entire files content. When called again, fetch the next `nrows` rows. +read(handler::Handler, nrows=0) + +Read data from the `handler` (see `SASLib.open`). If `nrows` is not specified, +read the entire file content. When called again, fetch the next `nrows` rows. """ function read(handler::Handler, nrows=0) # println("Reading $(handler.config.filename)") @@ -164,8 +173,13 @@ function read(handler::Handler, nrows=0) end """ +close(handler::Handler) + Close the `handler` object. This function effectively closes the -underlying iostream. It must be called if `open` and `read` +underlying iostream. It must be called after the program +finished reading data. + +This function is needed only when `SASLib.open` and `SASLib.read` functions are used instead of the more convenient `readsas` function. 
""" function close(handler::Handler) @@ -174,7 +188,30 @@ function close(handler::Handler) end """ +readsas(filename::AbstractString; + encoding::AbstractString = "UTF-8", + convert_dates::Bool = true, + include_columns::Vector = [], + exclude_columns::Vector = [], + verbose_level::Int64 = 1) + Read a SAS7BDAT file. + +The `encoding` argument may be used if string data does not have UTF-8 +encoding. + +If `convert_dates == false` then no conversion is made +and you will get the number of days for Date columns (or number of +seconds for DateTime columns) since 1-JAN-1960. + +By default, all columns will be read. If you only need a subset of the +columns, you may specify +either `include_columns` or `exclude_columns` but not both. They are just +arrays of columns indices or symbols e.g. [1, 2, 3] or [:employeeid, :firstname, :lastname] + +For debugging purpose, `verbose_level` may be set to a value higher than 1. +Verbose level 0 will output nothing to the console, essentially a total quiet +option. 
""" function readsas(filename::AbstractString; encoding::AbstractString = default_encoding, @@ -184,7 +221,7 @@ function readsas(filename::AbstractString; verbose_level::Int64 = 1) handler = nothing try - handler = open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, + handler = _open(ReaderConfig(filename, encoding, default_chunksize, convert_dates, default_convert_text, default_convert_header_text, include_columns, exclude_columns, verbose_level)) # println(push!(history, handler)) t1 = time() @@ -880,7 +917,7 @@ function read_chunk(handler, nrows=0) # println("nd = $nd (number of decimal columns)") # println("ns = $ns (number of string columns)") - fill_column_indices(handler) + _fill_column_indices(handler) # allocate columns handler.byte_chunk = Dict() @@ -1466,7 +1503,7 @@ function currentpos(handler) end # fill column indices as a dictionary (key = column index, value = column symbol) -function fill_column_indices(handler) +function _fill_column_indices(handler) handler.column_indices = Vector{Tuple{Int64, Symbol, UInt8}}() inflag = length(handler.config.include_columns) > 0 exflag = length(handler.config.exclude_columns) > 0 From 232258de54245a8e96c2fa50deafd1406bbf5d91 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:23:41 -0800 Subject: [PATCH 07/10] updated readme, perf test script, and new jl/py test results --- README.md | 117 ++++++++++++++++--------- test/perf_results_0.3.0/py_jl_test1.md | 62 +++++++++++++ test/perf_results_0.3.0/py_jl_test2.md | 60 +++++++++++++ test/perf_results_0.3.0/py_jl_test3.md | 60 +++++++++++++ test/perf_test1.jl | 2 +- 5 files changed, 257 insertions(+), 44 deletions(-) create mode 100644 test/perf_results_0.3.0/py_jl_test1.md create mode 100644 test/perf_results_0.3.0/py_jl_test2.md create mode 100644 test/perf_results_0.3.0/py_jl_test3.md diff --git a/README.md b/README.md index 0347c03..1c26d4b 100644 --- a/README.md +++ b/README.md @@ -20,25 +20,25 @@ Use the 
`readsas` function to read the file. The result is a dictionary of vari ```julia julia> using SASLib -julia> x = readsas("test1.sas7bdat") -Read data set of size 10 x 100 in 0.019 seconds +julia> x = readsas("productsales.sas7bdat") +Read data set of size 1440 x 10 in 2.0 seconds Dict{Symbol,Any} with 16 entries: - :filename => "test1.sas7bdat" - :page_length => 65536 - :file_encoding => "wlatin1" + :filename => "productsales.sas7bdat" + :page_length => 8192 + :file_encoding => "US-ASCII" :system_endianness => :LittleEndian - :ncols => 100 - :column_types => DataType[Float64, String, Float64, Float64, Float64, Float64, Float64, Float64, Float64, Float64 … Float64, Float64… - :data => Dict{Any,Any}(Pair{Any,Any}(:Column60, [2987.0, 8194.0, 9820.0, 8252.0, 9640.0, 9168.0, 7547.0, 1419.0, 4884.0, NaN])… - :perf_type_conversion => 0.0052096 - :page_count => 1 - :column_names => String["Column60", "Column42", "Column68", "Column35", "Column33", "Column1", "Column41", "Column16", "Column72", "Co… - :column_symbols => Symbol[:Column60, :Column42, :Column68, :Column35, :Column33, :Column1, :Column41, :Column16, :Column72, :Column19 …… - :column_lengths => [8, 9, 8, 8, 8, 9, 8, 8, 8, 9 … 8, 8, 8, 5, 8, 8, 8, 9, 8, 8] + :ncols => 10 + :column_types => Type[Float64, Float64, Union{AbstractString, Missings.Missing}, Union{AbstractString, Missings.Missing}, Union{AbstractString,… + :data => Dict{Any,Any}(Pair{Any,Any}(:QUARTER, [1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 4.0 … 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0,… + :perf_type_conversion => 0.0262305 + :page_count => 18 + :column_names => String["QUARTER", "YEAR", "COUNTRY", "DIVISION", "REGION", "MONTH", "PREDICT", "ACTUAL", "PRODTYPE", "PRODUCT"] + :column_symbols => Symbol[:QUARTER, :YEAR, :COUNTRY, :DIVISION, :REGION, :MONTH, :PREDICT, :ACTUAL, :PRODTYPE, :PRODUCT] + :column_lengths => [8, 8, 10, 10, 10, 10, 10, 8, 8, 8] :file_endianness => :LittleEndian - :nrows => 10 - :perf_read_data => 0.00612195 - :column_offsets => 
[0, 600, 8, 16, 24, 609, 32, 40, 48, 618 … 536, 544, 552, 795, 560, 568, 576, 800, 584, 592] + :nrows => 1440 + :perf_read_data => 0.00639309 + :column_offsets => [0, 8, 40, 50, 60, 70, 80, 16, 24, 32] ``` Number of columns and rows are returned as in `:ncols` and `:nrows` respectively. @@ -46,18 +46,22 @@ Number of columns and rows are returned as in `:ncols` and `:nrows` respectively The data, reference by `:data` key, is represented as a Dict object with the column symbol as the key. ```juia -julia> x[:data][:Column1] -10-element Array{Float64,1}: - 0.636 - 0.283 - 0.452 - 0.557 - 0.138 - 0.948 - 0.162 - 0.148 - NaN - 0.663 +julia> x[:data][:ACTUAL] +1440-element Array{Float64,1}: + 925.0 + 999.0 + 608.0 + 642.0 + 656.0 + 948.0 + 612.0 + 114.0 + 685.0 + 657.0 + 608.0 + 353.0 + 107.0 + ⋮ ``` If you really like DataFrame, you can easily convert as such: @@ -67,26 +71,53 @@ julia> using DataFrames julia> df = DataFrame(x[:data]); -julia> df[:, 1:5] -10×5 DataFrames.DataFrame -│ Row │ Column1 │ Column10 │ Column100 │ Column11 │ Column12 │ -├─────┼─────────┼─────────────┼───────────┼──────────┼────────────┤ -│ 1 │ 0.636 │ "apple" │ 3230.0 │ NaN │ 1986-07-20 │ -│ 2 │ 0.283 │ "apple" │ 4904.0 │ 22.0 │ 1983-07-15 │ -│ 3 │ 0.452 │ "apple" │ NaN │ 7.0 │ 1973-11-27 │ -│ 4 │ 0.557 │ "dog" │ 8566.0 │ 26.0 │ 1967-01-20 │ -│ 5 │ 0.138 │ "crocodile" │ 894.0 │ 11.0 │ 1970-11-29 │ -│ 6 │ 0.948 │ "crocodile" │ 6088.0 │ 27.0 │ 1963-01-09 │ -│ 7 │ 0.162 │ "" │ 6122.0 │ NaN │ 1979-10-18 │ -│ 8 │ 0.148 │ "crocodile" │ 2570.0 │ 5.0 │ 1961-03-15 │ -│ 9 │ NaN │ "pear" │ 2709.0 │ 12.0 │ 1964-06-15 │ -│ 10 │ 0.663 │ "pear" │ NaN │ 16.0 │ 1985-01-28 │ +julia> head(df, 5) +5×10 DataFrames.DataFrame +│ Row │ ACTUAL │ COUNTRY │ DIVISION │ MONTH │ PREDICT │ PRODTYPE │ PRODUCT │ QUARTER │ REGION │ YEAR │ +├─────┼────────┼─────────┼───────────┼────────────┼─────────┼───────────┼─────────┼─────────┼────────┼────────┤ +│ 1 │ 925.0 │ CANADA │ EDUCATION │ 1993-01-01 │ 850.0 │ FURNITURE │ SOFA │ 1.0 │ 
EAST │ 1993.0 │ +│ 2 │ 999.0 │ CANADA │ EDUCATION │ 1993-02-01 │ 297.0 │ FURNITURE │ SOFA │ 1.0 │ EAST │ 1993.0 │ +│ 3 │ 608.0 │ CANADA │ EDUCATION │ 1993-03-01 │ 846.0 │ FURNITURE │ SOFA │ 1.0 │ EAST │ 1993.0 │ +│ 4 │ 642.0 │ CANADA │ EDUCATION │ 1993-04-01 │ 533.0 │ FURNITURE │ SOFA │ 2.0 │ EAST │ 1993.0 │ +│ 5 │ 656.0 │ CANADA │ EDUCATION │ 1993-05-01 │ 646.0 │ FURNITURE │ SOFA │ 2.0 │ EAST │ 1993.0 │ +``` + +If you only need to read few columns, just pass an `include_columns` argument: + +``` +julia> head(DataFrame(readsas("productsales.sas7bdat", include_columns=[:YEAR, :MONTH, :PRODUCT, :ACTUAL])[:data])) +Read data set of size 1440 x 4 in 0.004 seconds +6×4 DataFrames.DataFrame +│ Row │ ACTUAL │ MONTH │ PRODUCT │ YEAR │ +├─────┼────────┼────────────┼─────────┼────────┤ +│ 1 │ 925.0 │ 1993-01-01 │ SOFA │ 1993.0 │ +│ 2 │ 999.0 │ 1993-02-01 │ SOFA │ 1993.0 │ +│ 3 │ 608.0 │ 1993-03-01 │ SOFA │ 1993.0 │ +│ 4 │ 642.0 │ 1993-04-01 │ SOFA │ 1993.0 │ +│ 5 │ 656.0 │ 1993-05-01 │ SOFA │ 1993.0 │ +│ 6 │ 948.0 │ 1993-06-01 │ SOFA │ 1993.0 │ +``` + +Likewise, you can read all columns except the ones you don't want as specified in `exclude_columns` argument: + +``` +julia> head(DataFrame(readsas("productsales.sas7bdat", exclude_columns=[:YEAR, :MONTH, :PRODUCT, :ACTUAL])[:data])) +Read data set of size 1440 x 6 in 0.031 seconds +6×6 DataFrames.DataFrame +│ Row │ COUNTRY │ DIVISION │ PREDICT │ PRODTYPE │ QUARTER │ REGION │ +├─────┼─────────┼───────────┼─────────┼───────────┼─────────┼────────┤ +│ 1 │ CANADA │ EDUCATION │ 850.0 │ FURNITURE │ 1.0 │ EAST │ +│ 2 │ CANADA │ EDUCATION │ 297.0 │ FURNITURE │ 1.0 │ EAST │ +│ 3 │ CANADA │ EDUCATION │ 846.0 │ FURNITURE │ 1.0 │ EAST │ +│ 4 │ CANADA │ EDUCATION │ 533.0 │ FURNITURE │ 2.0 │ EAST │ +│ 5 │ CANADA │ EDUCATION │ 646.0 │ FURNITURE │ 2.0 │ EAST │ +│ 6 │ CANADA │ EDUCATION │ 486.0 │ FURNITURE │ 2.0 │ EAST │ ``` If you need to read files incrementally: ```julia -handler = SASLib.open("test1.sas7bdat") +handler = 
SASLib.open("productsales.sas7bdat") results = SASLib.read(handler, 3) # read 3 rows results = SASLib.read(handler, 4) # read next 4 rows SASLib.close(handler) # remember to close the handler when done diff --git a/test/perf_results_0.3.0/py_jl_test1.md b/test/perf_results_0.3.0/py_jl_test1.md new file mode 100644 index 0000000..f4b43ed --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test1.md @@ -0,0 +1,62 @@ +# Performance Test 1 + +## Summary + +SASLib is ~4.3x faster than Pandas. + +## Test File + +Filename|Rows|Columns|Numeric Columns|String Columns +--------|----|-------|---------------|-------------- +numeric_1000000_2.sas7bdat|1,000,000|2|2|0 + +## Test Environment + +Test system information: +``` +julia> versioninfo() +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py numeric_1000000_2.sas7bdat +1: elapsed 1.976702 seconds +2: elapsed 1.984404 seconds +3: elapsed 2.266284 seconds +4: elapsed 1.978403 seconds +5: elapsed 1.946053 seconds +6: elapsed 1.919336 seconds +7: elapsed 1.918322 seconds +8: elapsed 1.926547 seconds +9: elapsed 1.962013 seconds +10: elapsed 1.939654 seconds +Average: 1.9818 seconds +``` + +## Julia +``` +$ julia perf_test1.jl numeric_1000000_2.sas7bdat +Loaded library in 0.343 seconds +Bootstrap elapsed 4.211 seconds +Elapsed 0.481 seconds +Elapsed 0.462 seconds +Elapsed 0.414 seconds +Elapsed 0.480 seconds +Elapsed 0.473 seconds +Elapsed 0.472 seconds +Elapsed 0.473 seconds +Elapsed 0.479 seconds +Elapsed 0.401 seconds +Elapsed 0.463 seconds +Average: 0.4598392924 seconds +``` diff --git a/test/perf_results_0.3.0/py_jl_test2.md b/test/perf_results_0.3.0/py_jl_test2.md new file mode 100644 index 
0000000..495ffdd --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test2.md @@ -0,0 +1,60 @@ +# Performance Test 2 + +## Summary + +SASLib is 16.9x faster than Pandas. + +## Test File + +Filename |Rows|Columns|Numeric Columns|String Columns +--------------|----|-------|---------------|-------------- +test1.sas7bdat|10 |100 |73 |27 + +## Test Environment + +``` +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py test1.sas7bdat +1: elapsed 0.099821 seconds +2: elapsed 0.116454 seconds +3: elapsed 0.095141 seconds +4: elapsed 0.100083 seconds +5: elapsed 0.100060 seconds +6: elapsed 0.098249 seconds +7: elapsed 0.101819 seconds +8: elapsed 0.099673 seconds +9: elapsed 0.096865 seconds +10: elapsed 0.109412 seconds +Average: 0.1018 seconds +``` + +## Julia +``` +$ julia perf_test1.jl test1.sas7bdat +Loaded library in 0.326 seconds +Bootstrap elapsed 3.606 seconds +Elapsed 0.011 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Elapsed 0.010 seconds +Elapsed 0.013 seconds +Elapsed 0.004 seconds +Elapsed 0.004 seconds +Average: 0.0060341937 seconds +``` diff --git a/test/perf_results_0.3.0/py_jl_test3.md b/test/perf_results_0.3.0/py_jl_test3.md new file mode 100644 index 0000000..641fa93 --- /dev/null +++ b/test/perf_results_0.3.0/py_jl_test3.md @@ -0,0 +1,60 @@ +# Performance Test 3 + +## Summary + +SASLib is 5.2x faster than Pandas. 
+ +## Test File + +Filename |Rows |Columns|Numeric Columns|String Columns +---------------------|------|-------|---------------|-------------- +productsales.sas7bdat|1440 |10 |4 |6 + +## Test Environment + +``` +Julia Version 0.6.2 +Commit d386e40c17 (2017-12-13 18:08 UTC) +Platform Info: + OS: macOS (x86_64-apple-darwin14.5.0) + CPU: Intel(R) Core(TM) i5-4258U CPU @ 2.40GHz + WORD_SIZE: 64 + BLAS: libopenblas (USE64BITINT DYNAMIC_ARCH NO_AFFINITY Haswell) + LAPACK: libopenblas64_ + LIBM: libopenlibm + LLVM: libLLVM-3.9.1 (ORCJIT, haswell) +``` + +## Python +``` +$ python perf_test1.py productsales.sas7bdat +1: elapsed 0.035160 seconds +2: elapsed 0.031523 seconds +3: elapsed 0.041026 seconds +4: elapsed 0.033476 seconds +5: elapsed 0.045547 seconds +6: elapsed 0.030253 seconds +7: elapsed 0.038022 seconds +8: elapsed 0.032196 seconds +9: elapsed 0.046579 seconds +10: elapsed 0.033603 seconds +Average: 0.0367 seconds +``` + +## Julia +``` +$ julia perf_test1.jl productsales.sas7bdat +Loaded library in 0.328 seconds +Bootstrap elapsed 3.613 seconds +Elapsed 0.013 seconds +Elapsed 0.005 seconds +Elapsed 0.005 seconds +Elapsed 0.004 seconds +Elapsed 0.007 seconds +Elapsed 0.008 seconds +Elapsed 0.007 seconds +Elapsed 0.011 seconds +Elapsed 0.007 seconds +Elapsed 0.005 seconds +Average: 0.0071251584000000005 seconds +``` diff --git a/test/perf_test1.jl b/test/perf_test1.jl index 854b7f2..bce738a 100644 --- a/test/perf_test1.jl +++ b/test/perf_test1.jl @@ -25,4 +25,4 @@ function perf(f, n) println("Average: $(total / n) seconds") end -perf(() -> readsas(ARGS[1], Dict(:verbose_level => 0)), 10) +perf(() -> readsas(ARGS[1], verbose_level = 0), 10) From fa8840bec00bdd98df8236269082c9a0bc556a88 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:31:34 -0800 Subject: [PATCH 08/10] added perf result for half reads --- test/perf_results_0.3.0/half_columns.md | 47 +++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 
test/perf_results_0.3.0/half_columns.md diff --git a/test/perf_results_0.3.0/half_columns.md b/test/perf_results_0.3.0/half_columns.md new file mode 100644 index 0000000..7a5c76e --- /dev/null +++ b/test/perf_results_0.3.0/half_columns.md @@ -0,0 +1,47 @@ +# Read performance when reading only half of the data + +## Results + +Read time is reduced by 40% when reading half of the data. + +## Test Scenario + +This test file has just 2 numeric columns. We would like to know the performance +of reading only 1 column from this file. + +Filename|Rows|Columns|Numeric Columns|String Columns +--------|----|-------|---------------|-------------- +numeric_1000000_2.sas7bdat|1,000,000|2|2|0 + +## Test Log + +``` +julia> @benchmark readsas("numeric_1000000_2.sas7bdat", verbose_level=0) +BenchmarkTools.Trial: + memory estimate: 399.04 MiB + allocs estimate: 3031083 + -------------- + minimum time: 358.695 ms (9.31% GC) + median time: 442.709 ms (25.96% GC) + mean time: 427.870 ms (20.97% GC) + maximum time: 482.786 ms (25.29% GC) + -------------- + samples: 12 + evals/sample: 1 + +julia> @benchmark readsas("numeric_1000000_2.sas7bdat", include_columns=[:f], verbose_level=0) +BenchmarkTools.Trial: + memory estimate: 261.71 MiB + allocs estimate: 2031028 + -------------- + minimum time: 222.832 ms (9.67% GC) + median time: 235.396 ms (9.70% GC) + mean time: 261.782 ms (20.75% GC) + maximum time: 327.359 ms (33.53% GC) + -------------- + samples: 20 + evals/sample: 1 + +julia> 262/428 +0.6121495327102804 +``` From 53e046652a6b2f9a10b8f887634fc0432ad8ac89 Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 16:48:40 -0800 Subject: [PATCH 09/10] put back @inbounds for now... 
will have to refactor later for performance tuning --- src/SASLib.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/SASLib.jl b/src/SASLib.jl index 82ce68e..a622834 100644 --- a/src/SASLib.jl +++ b/src/SASLib.jl @@ -1241,10 +1241,10 @@ function process_byte_array_with_data(handler, offset, length) # for k in 1:lngt # byte_chunk[jb, m + k] = source[start + k] # end - byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] + @inbounds byte_chunk[name][m+1:m+lngt] = source[start+1:start+lngt] #println4(handler, "byte_chunk[$name][$(m+1):$(m+lngt)] = source[$(start+1):$(start+lngt)] => $(source[start+1:start+lngt])") elseif ct == column_type_string - string_chunk[name][current_row+1] = + @inbounds string_chunk[name][current_row+1] = rstrip(transcode(handler, source[start+1:(start+lngt)])) end end From 71c1ad1c68757fcff2a93e8d712c8e60a00b656f Mon Sep 17 00:00:00 2001 From: Tom Kwong Date: Fri, 29 Dec 2017 18:12:16 -0800 Subject: [PATCH 10/10] added unit tests for include/exclude column feature --- test/runtests.jl | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5679a45..656ae0c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -19,7 +19,7 @@ using Base.Test end end - @testset "manual" begin + @testset "incremental read" begin fname = "test1.sas7bdat" # 10 rows handler = SASLib.open(fname) @test handler.config.filename == fname @@ -31,7 +31,7 @@ using Base.Test @test result[:nrows] == 3 end - @testset "numeric" begin + @testset "various data types" begin result = readsas("test1.sas7bdat") df = result[:data] @test sum(df[:Column1][1:5]) == 2.066 @@ -40,7 +40,7 @@ using Base.Test @test df[:Column4][1:3] == [Date("1965-12-10"), Date("1977-03-07"), Date("1983-08-15")] end - @testset "datetime" begin + @testset "datetime with missing values" begin result = readsas("datetime.sas7bdat") df = result[:data] @test (result[:nrows], result[:ncols]) == (5, 4) 
@@ -50,6 +50,28 @@ using Base.Test @test count(ismissing, result[:data][:dt]) == 3 end + @testset "include/exclude columns" begin + result = readsas("productsales.sas7bdat", include_columns=[:MONTH, :YEAR]) + @test result[:ncols] == 2 + @test sort(result[:column_symbols]) == sort([:MONTH, :YEAR]) + + result = readsas("productsales.sas7bdat", include_columns=[1, 2, 7]) + @test result[:ncols] == 3 + @test sort(result[:column_symbols]) == sort([:ACTUAL, :PREDICT, :PRODUCT]) + + result = readsas("productsales.sas7bdat", exclude_columns=[:DIVISION]) + @test result[:ncols] == 9 + @test !(:DIVISION in result[:column_symbols]) + + result = readsas("productsales.sas7bdat", exclude_columns=collect(2:10)) + @test result[:ncols] == 1 + @test sort(result[:column_symbols]) == sort([:ACTUAL]) + + # error handling + @test_throws SASLib.ConfigError readsas("productsales.sas7bdat", + include_columns=[1], exclude_columns=[1]) + end + @testset "misc" begin result = readsas("productsales.sas7bdat") df = result[:data]