From f724a02d9a6beb7ba9a043f95cba33bee75ddfb6 Mon Sep 17 00:00:00 2001 From: tan Date: Thu, 20 Jun 2013 18:51:52 +0530 Subject: [PATCH] readdlm improvements: use mmap by default. options to read header and ignore invalid characters added tests fixed travis tests --- base/ascii.jl | 18 ++++++++ base/datafmt.jl | 100 ++++++++++++++++++++++++++++++-------------- base/utf8.jl | 21 ++++++++++ doc/helpdb.jl | 32 ++++++++++---- doc/stdlib/base.rst | 17 ++++++-- test/Makefile | 2 +- test/readdlm.jl | 10 +++++ test/runtests.jl | 2 +- 8 files changed, 157 insertions(+), 45 deletions(-) create mode 100644 test/readdlm.jl diff --git a/base/ascii.jl b/base/ascii.jl index 7c5e0df632a35..0602a10d16860 100644 --- a/base/ascii.jl +++ b/base/ascii.jl @@ -81,4 +81,22 @@ ascii(x) = convert(ASCIIString, x) convert(::Type{ASCIIString}, s::ASCIIString) = s convert(::Type{ASCIIString}, s::UTF8String) = ascii(s.data) convert(::Type{ASCIIString}, a::Array{Uint8,1}) = is_valid_ascii(a) ? ASCIIString(a) : error("invalid ASCII sequence") +function convert(::Type{ASCIIString}, a::Array{Uint8,1}, invalids_as::ASCIIString) + l = length(a) + idx = 1 + iscopy = false + while idx <= l + (a[idx] < 0x80) && (idx +=1; continue) + !iscopy && (a = copy(a); iscopy = true) + endn = idx + while endn <= l + (a[endn] < 0x80) && break + endn += 1 + end + (endn > idx) && (endn -= 1) + splice!(a, idx:endn, invalids_as.data) + l = length(a) + end + convert(ASCIIString, a) +end convert(::Type{ASCIIString}, s::String) = ascii(bytestring(s)) diff --git a/base/datafmt.jl b/base/datafmt.jl index 54d9b4a653385..5ccf80e5921e6 100644 --- a/base/datafmt.jl +++ b/base/datafmt.jl @@ -32,27 +32,59 @@ function countlines(io::IO, eol::Char) nl end -readdlm(input, T::Type) = readdlm(input, invalid_dlm, T, '\n') -readdlm(input, dlm::Char, T::Type) = readdlm(input, dlm, T, '\n') - -readdlm(input) = readdlm(input, invalid_dlm, '\n') -readdlm(input, dlm::Char) = readdlm(input, dlm, '\n') +readdlm(input, T::Type; opts...) 
= readdlm(input, invalid_dlm, T, '\n'; opts...) +readdlm(input, dlm::Char, T::Type; opts...) = readdlm(input, dlm, T, '\n'; opts...) + +readdlm(input; opts...) = readdlm(input, invalid_dlm, '\n'; opts...) +readdlm(input, dlm::Char; opts...) = readdlm(input, dlm, '\n'; opts...) + +readdlm(input, dlm::Char, eol::Char; opts...) = readdlm_auto(input, dlm, Float64, eol, true; opts...) +readdlm(input, dlm::Char, T::Type, eol::Char; opts...) = readdlm_auto(input, dlm, T, eol, false; opts...) + +function readdlm_auto(input, dlm::Char, T::Type, eol::Char, auto::Bool; opts...) + optsd = val_opts(opts) + isa(input, String) && (input = get(optsd, :use_mmap, true) ? mmap_array(Uint8, (filesize(input),), open(input, "r")) : readall(input)) + sinp = isa(input, Vector{Uint8}) ? ccall(:jl_array_to_string, ByteString, (Array{Uint8,1},), input) : + isa(input, IO) ? readall(input) : + input + readdlm_string(sinp, dlm, T, eol, auto, optsd) +end -readdlm(input, dlm::Char, eol::Char) = readdlm_auto(input, dlm, Float64, eol, true) -readdlm(input, dlm::Char, T::Type, eol::Char) = readdlm_auto(input, dlm, T, eol, false) +function ascii_if_possible(sbuff::String) + isa(sbuff, ASCIIString) && return sbuff -readdlm_auto(input, dlm::Char, T::Type, eol::Char, auto::Bool=false) = readdlm_string(readall(input), dlm, T, eol, auto) -function readdlm_auto(input::Vector{Uint8}, dlm::Char, T::Type, eol::Char, auto::Bool=false) - s = ccall(:jl_array_to_string, ByteString, (Array{Uint8,1},), input) - readdlm_string(s, dlm, T, eol, auto) + asci = true + d = sbuff.data + for idx in 1:length(d) + (d[idx] < 0x80) ? continue : (asci = false; break) + end + asci ? 
ASCIIString(sbuff.data) : sbuff end -function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool=false) - nrows,ncols = dlm_dims(sbuff, eol, dlm) +function readdlm_string(sbuff::String, dlm::Char, T::Type, eol::Char, auto::Bool, optsd::Dict) + nrows,ncols = try + dlm_dims(sbuff, eol, dlm) + catch ex + !get(optsd, :ignore_invalid_chars, false) && throw(ex) + sbuff = ascii_if_possible(convert(typeof(sbuff), sbuff.data, "")) + dlm_dims(sbuff, eol, dlm) + end offsets = zeros(Int, nrows, ncols) - cells = Array(T, nrows, ncols) + has_header = get(optsd, :has_header, false) + cells = Array(T, has_header ? nrows-1 : nrows, ncols) dlm_offsets(sbuff, dlm, eol, offsets) - dlm_fill(cells, offsets, sbuff, auto) + has_header ? (dlm_fill(cells, offsets, sbuff, auto, 1), dlm_fill(Array(String, 1, ncols), offsets, sbuff, auto, 0)) : dlm_fill(cells, offsets, sbuff, auto, 0) +end + +const valid_opts = [:has_header, :ignore_invalid_chars, :use_mmap] +function val_opts(opts) + d = Dict{Symbol,Bool}() + for opt in opts + !contains(valid_opts, opt[1]) && error("unknown option $(opt[1])") + !isa(opt[2], Bool) && error("$(opt[1]) can only be boolean") + d[opt[1]] = opt[2] + end + d end function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int) @@ -64,10 +96,12 @@ function dlm_col_begin(ncols::Int, offsets::Array{Int,2}, row::Int, col::Int) (ret == 0) ? 
dlm_col_begin(ncols, offsets, pp_row, pp_col) : (ret+2) end -function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool) +function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, auto::Bool, row_offset::Int) maxrow,maxcol = size(cells) tmp64 = Array(Float64,1) - for row in 1:maxrow + + for row in (1+row_offset):(maxrow+row_offset) + cell_row = row-row_offset for col in 1:maxcol start_pos = dlm_col_begin(maxcol, offsets, row, col) end_pos = offsets[row,col] @@ -75,19 +109,19 @@ function dlm_fill{T}(cells::Array{T,2}, offsets::Array{Int,2}, sbuff::String, au if T <: Char (length(sval) != 1) && error("file entry \"$(sval)\" is not a Char") - cells[row,col] = sval + cells[cell_row,col] = sval elseif T <: Number if(float64_isvalid(sval, tmp64)) - cells[row,col] = tmp64[1] + cells[cell_row,col] = tmp64[1] elseif auto - return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false) + return dlm_fill(Array(Any,maxrow,maxcol), offsets, sbuff, false, row_offset) else - cells[row,col] = NaN + cells[cell_row,col] = NaN end elseif T <: String - cells[row,col] = sval + cells[cell_row,col] = sval elseif T == Any - cells[row,col] = float64_isvalid(sval, tmp64) ? tmp64[1] : sval + cells[cell_row,col] = float64_isvalid(sval, tmp64) ? tmp64[1] : sval else error("file entry \"$(sval)\" cannot be converted to $T") end @@ -102,7 +136,7 @@ function dlm_offsets(sbuff::UTF8String, dlm, eol, offsets::Array{Int,2}) row = 1 maxrow,maxcol = size(offsets) idx = 1 - while(idx < length(sbuff.data)) + while(idx <= length(sbuff.data)) val,idx = next(sbuff, idx) (val != eol) && ((dlm == invalid_dlm) ? !contains(_default_delims, val) : (val != dlm)) && continue col += 1 @@ -131,19 +165,23 @@ end dlm_dims(s::ASCIIString, eol, dlm) = dlm_dims(s.data, uint8(eol), uint8(dlm)) function dlm_dims(dbuff, eol, dlm) ncols = nrows = col = 0 - for val in dbuff - (val != eol) && ((dlm == invalid_dlm) ? 
!contains(_default_delims, val) : (val != dlm)) && continue - col += 1 - (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0) + try + for val in dbuff + (val != eol) && ((dlm == invalid_dlm) ? !contains(_default_delims, val) : (val != dlm)) && continue + col += 1 + (val == eol) && (nrows += 1; ncols = max(ncols, col); col = 0) + end + catch ex + error("at row $nrows, column $col : $ex)") end - (col > 0) && (nrow += 1) + (col > 0) && (nrows += 1) ncols = max(ncols, col, 1) nrows = max(nrows, 1) return (nrows, ncols) end -readcsv(io) = readdlm(io, ',') -readcsv(io, T::Type) = readdlm(io, ',', T) +readcsv(io; opts...) = readdlm(io, ','; opts...) +readcsv(io, T::Type; opts...) = readdlm(io, ',', T; opts...) # todo: keyword argument for # of digits to print function writedlm(io::IO, a::Matrix, dlm::Char) diff --git a/base/utf8.jl b/base/utf8.jl index 8dac4b9af3f32..795a04aac5be8 100644 --- a/base/utf8.jl +++ b/base/utf8.jl @@ -119,4 +119,25 @@ utf8(x) = convert(UTF8String, x) convert(::Type{UTF8String}, s::UTF8String) = s convert(::Type{UTF8String}, s::ASCIIString) = UTF8String(s.data) convert(::Type{UTF8String}, a::Array{Uint8,1}) = is_valid_utf8(a) ? 
UTF8String(a) : error("invalid UTF-8 sequence") +function convert(::Type{UTF8String}, a::Array{Uint8,1}, invalids_as::String) + l = length(a) + idx = 1 + iscopy = false + while idx <= l + if is_utf8_start(a[idx]) + nextidx = idx+1+utf8_trailing[a[idx]+1] + (nextidx <= (l+1)) && (idx = nextidx; continue) + end + !iscopy && (a = copy(a); iscopy = true) + endn = idx + while endn <= l + is_utf8_start(a[endn]) && break + endn += 1 + end + (endn > idx) && (endn -= 1) + splice!(a, idx:endn, invalids_as.data) + l = length(a) + end + UTF8String(a) +end convert(::Type{UTF8String}, s::String) = utf8(bytestring(s)) diff --git a/doc/helpdb.jl b/doc/helpdb.jl index e84f60f2428b9..71614d14102e9 100644 --- a/doc/helpdb.jl +++ b/doc/helpdb.jl @@ -1537,18 +1537,34 @@ "), -("Text I/O","Base","readdlm","readdlm(filename, delim::Char) +("Text I/O","Base","readdlm","readdlm(source, delim::Char; has_header=false, use_mmap=true, ignore_invalid_chars=false) - Read a matrix from a text file where each line gives one row, with - elements separated by the given delimeter. If all data is numeric, - the result will be a numeric array. If some elements cannot be - parsed as numbers, a cell array of numbers and strings is returned. + Read a matrix from the source where each line gives one row, with + elements separated by the given delimeter. The source can be a + text file, stream or byte array. Memory mapped filed can be used + by passing the byte array representation of the mapped segment as + source. + + If \"has_header\" is \"true\" the first row of data would be read + as headers and the tuple \"(data_cells, header_cells)\" is + returned instead of only \"data_cells\". + + If \"use_mmap\" is \"true\" the file specified by \"source\" is + memory mapped for potential speedups. + + If \"ignore_invalid_chars\" is \"true\" bytes in \"source\" with + invalid character encoding will be ignored. Otherwise an error is + thrown indicating the offending character position. 
+ + If all data is numeric, \"data_cells\" will be a numeric array. If + some elements cannot be parsed as numbers, a cell array of numbers + and strings is returned for \"data_cells\". "), -("Text I/O","Base","readdlm","readdlm(filename, delim::Char, T::Type) +("Text I/O","Base","readdlm","readdlm(source, delim::Char, T::Type; options...) - Read a matrix from a text file with a given element type. If \"T\" + Read a matrix from the source with a given element type. If \"T\" is a numeric type, the result is an array of that type, with any non-numeric elements as \"NaN\" for floating-point types, or zero. Other useful values of \"T\" include \"ASCIIString\", \"String\", @@ -1563,7 +1579,7 @@ "), -("Text I/O","Base","readcsv","readcsv(filename[, T::Type]) +("Text I/O","Base","readcsv","readcsv(filename[, T::Type]; options...) Equivalent to \"readdlm\" with \"delim\" set to comma. diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst index 5e6fd43b41b7d..3b5d987d64d1b 100644 --- a/doc/stdlib/base.rst +++ b/doc/stdlib/base.rst @@ -1006,11 +1006,20 @@ Text I/O Create an iterable object that will yield each line from a stream. -.. function:: readdlm(source, delim::Char) +.. function:: readdlm(source, delim::Char; has_header=false, use_mmap=true, ignore_invalid_chars=false) - Read a matrix from the source where each line gives one row, with elements separated by the given delimeter. The source can be a text file, stream or byte array. Memory mapped filed can be used by passing the byte array representation of the mapped segment as source. If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned. + Read a matrix from the source where each line gives one row, with elements separated by the given delimiter. The source can be a text file, stream or byte array. Memory mapped files can be used by passing the byte array representation of the mapped segment as source. -.. 
function:: readdlm(source, delim::Char, T::Type) + If ``has_header`` is ``true`` the first row of data would be read as headers and the tuple ``(data_cells, header_cells)`` is returned instead of only ``data_cells``. + + If ``use_mmap`` is ``true`` the file specified by ``source`` is memory mapped for potential speedups. + + If ``ignore_invalid_chars`` is ``true`` bytes in ``source`` with invalid character encoding will be ignored. Otherwise an error is thrown indicating the offending character position. + + If all data is numeric, the result will be a numeric array. If some elements cannot be parsed as numbers, a cell array of numbers and strings is returned. + + +.. function:: readdlm(source, delim::Char, T::Type; options...) Read a matrix from the source with a given element type. If ``T`` is a numeric type, the result is an array of that type, with any non-numeric elements as ``NaN`` for floating-point types, or zero. Other useful values of ``T`` include ``ASCIIString``, ``String``, and ``Any``. @@ -1018,7 +1027,7 @@ Text I/O Write an array to a text file using the given delimeter (defaults to comma). -.. function:: readcsv(source, [T::Type]) +.. function:: readcsv(source, [T::Type]; options...) Equivalent to ``readdlm`` with ``delim`` set to comma. 
diff --git a/test/Makefile b/test/Makefile index 1b6bfae0c5911..8add27bb7dc22 100644 --- a/test/Makefile +++ b/test/Makefile @@ -5,7 +5,7 @@ TESTS = all core keywordargs numbers strings unicode collections hashing \ remote iostring arrayops linalg blas fft dsp sparse bitarray \ random math functional bigint sorting statistics spawn parallel \ arpack file git pkg pkg2 resolve suitesparse complex version \ - pollfd mpfr broadcast socket floatapprox priorityqueue + pollfd mpfr broadcast socket floatapprox priorityqueue readdlm $(TESTS) :: $(QUIET_JULIA) $(call spawn,$(JULIA_EXECUTABLE)) ./runtests.jl $@ diff --git a/test/readdlm.jl b/test/readdlm.jl new file mode 100644 index 0000000000000..817a0447a75ff --- /dev/null +++ b/test/readdlm.jl @@ -0,0 +1,10 @@ +dlm_data = try + readdlm(joinpath(JULIA_HOME, split("../../test/perf2/imdb-1.tsv", '/')...), '\t') + catch + readdlm(joinpath(JULIA_HOME, split("../../julia/share/julia/test/perf2/imdb-1.tsv", '/')...), '\t') + end + +@test size(dlm_data) == (31383,3) +@test dlm_data[12345,2] == "Gladiator" +@test dlm_data[31383,3] == 2005 +@test dlm_data[1,1] == "McClure, Marc (I)" diff --git a/test/runtests.jl b/test/runtests.jl index 0d9a25af87aa8..96d22e4b1bce3 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,7 +5,7 @@ testnames = ["core", "keywordargs", "numbers", "strings", "unicode", "statistics", "spawn", "parallel", "priorityqueue", "arpack", "file", "perf", "suitesparse", "version", "resolve", "pollfd", "mpfr", "broadcast", "complex", - "socket", "floatapprox"] + "socket", "floatapprox", "readdlm"] tests = ARGS==["all"] ? testnames : ARGS n = min(8, CPU_CORES, length(tests))