Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start support for ListView, BinaryView, Utf8View, etc. #512

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 1 addition & 16 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,19 +1,3 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

name = "Arrow"
uuid = "69666777-d1a9-59fb-9406-91d4454c9d45"
authors = ["quinnj <quinn.jacobd@gmail.com>"]
Expand All @@ -32,6 +16,7 @@ LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
Mmap = "a63ad114-7e13-5084-954f-fe012c677804"
PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
SentinelArrays = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
StringViews = "354b36f9-a18e-4713-926e-db85100087ba"
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
TimeZones = "f269a46b-ccf7-5d73-abea-4c690281aa53"
TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
Expand Down
3 changes: 2 additions & 1 deletion src/Arrow.jl
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ using DataAPI,
CodecZstd,
TimeZones,
BitIntegers,
ConcurrentUtilities
ConcurrentUtilities,
StringViews

export ArrowTypes

Expand Down
1 change: 1 addition & 0 deletions src/arraytypes/arraytypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -271,3 +271,4 @@ include("map.jl")
include("struct.jl")
include("unions.jl")
include("dictencoding.jl")
include("views.jl")
62 changes: 62 additions & 0 deletions src/arraytypes/views.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# One fixed-size entry of a view array's `data` buffer: four `Int32` fields, 16 bytes in total.
# `Base.getindex(::View, i)` treats entries with `length < 13` as inlined and reads their
# payload from the raw bytes that follow `length` (so the last three fields are then payload
# bytes, not numbers); otherwise it locates the payload via `bufindex` and `offset`.
struct ViewElement
length::Int32 # payload size in bytes
prefix::Int32 # NOTE(review): presumably the first 4 payload bytes for non-inlined entries; not read by `getindex` — confirm
bufindex::Int32 # 0-based index into `View.buffers` (`getindex` adds 1)
offset::Int32 # 0-based byte offset of the payload inside that buffer (`getindex` adds 1)
end

"""
    Arrow.View

An `ArrowVector` whose elements are variable-sized byte sequences stored in a "view" layout
(used for `BinaryView` / `Utf8View` columns). Each element is described by a 16-byte
`ViewElement`: elements shorter than 13 bytes are stored inline in that entry, longer ones
are slices of one of the additional `buffers`. Indexing returns `missing` for elements whose
validity bit is unset.
"""
struct View{T} <: ArrowVector{T}
arrow::Vector{UInt8} # need to hold a reference to arrow memory blob
validity::ValidityBitmap # `getindex` returns `missing` where `validity[i]` is false
data::Vector{ViewElement} # one 16-byte view entry per element
inline::Vector{UInt8} # `data` field reinterpreted as a byte array
buffers::Vector{Vector{UInt8}} # holds non-inlined data
ℓ::Int # number of elements; returned by `Base.size`
metadata::Union{Nothing,Base.ImmutableDict{String,String}} # NOTE(review): presumably custom column metadata, `nothing` when absent — confirm
end

# A `View` is one-dimensional; its length is the stored element count `ℓ`.
function Base.size(l::View)
    return (l.ℓ,)
end

# Return element `i` of the view array, or `missing` when its validity bit is unset.
#
# Elements shorter than 13 bytes are read from `l.inline`, i.e. from the bytes that follow
# the 4-byte length inside the element's 16-byte view entry. Longer elements are read from
# `l.buffers[bufindex + 1]`, starting at the 0-based `offset`.
#
# When the non-missing part of `T` is `Base.CodeUnits` (BinaryView) the bytes are returned as
# `Base.CodeUnits`; otherwise (Utf8View) they are wrapped in a `StringView` and converted with
# `ArrowTypes.fromarrow(T, ...)`.
@propagate_inbounds function Base.getindex(l::View{T}, i::Integer) where {T}
    @boundscheck checkbounds(l, i)
    @inbounds v = l.data[i]
    l.validity[i] || return missing
    # Widen before doing arithmetic: the view fields are `Int32`, and `offset + length`
    # evaluated in `Int32` wraps around silently when the slice ends past 2^31 - 1.
    len = Int64(v.length)
    bytes = if len < 13
        # inline payload: skip the 4-byte length at the start of the 16-byte entry
        start = (i - 1) * 16 + 5
        @view l.inline[start:(start + len - 1)]
    else
        off = Int64(v.offset)
        @view l.buffers[v.bufindex + 1][(off + 1):(off + len)]
    end
    if Base.nonmissingtype(T) <: Base.CodeUnits
        # BinaryView
        return Base.CodeUnits(StringView(bytes))
    else
        # Utf8View
        return ArrowTypes.fromarrow(T, StringView(bytes))
    end
end

# TODO: `Base.setindex!(l::View{T}, v, i::Integer) where {T}` is not implemented yet.
6 changes: 3 additions & 3 deletions src/eltypes.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,12 @@ function arrowtype(b, ::Type{T}) where {T<:AbstractFloat}
return Meta.FloatingPoint, Meta.floatingPointEnd(b), nothing
end

# Utf8, LargeUtf8 and Utf8View columns are all exposed as Julia `String`s.
function juliaeltype(
    f::Meta.Field, b::Union{Meta.Utf8,Meta.LargeUtf8,Meta.Utf8View}, convert
)
    return String
end

# Number of data bytes occupied by `x`: `sizeof(x)` for a single value, and the sum of
# `datasizeof` over all elements (recursively) for a vector.
datasizeof(x) = sizeof(x)
# `init=0` keeps empty vectors well-defined (0 bytes); reducing an empty collection with a
# custom mapping function and no `init` throws an `ArgumentError`.
datasizeof(x::AbstractVector) = mapreduce(datasizeof, +, x; init=0)

# Binary, LargeBinary and BinaryView columns are all exposed as `Base.CodeUnits`.
function juliaeltype(
    f::Meta.Field, b::Union{Meta.Binary,Meta.LargeBinary,Meta.BinaryView}, convert
)
    return Base.CodeUnits
end

# Fixed-size binary values map to a tuple of `byteWidth` bytes.
function juliaeltype(f::Meta.Field, x::Meta.FixedSizeBinary, convert)
    width = Int(x.byteWidth)
    return NTuple{width,UInt8}
end
Expand Down Expand Up @@ -428,7 +428,7 @@ ArrowTypes.JuliaType(::Val{PERIOD_SYMBOL}, ::Type{Duration{U}}) where {U} = peri
# Turn an Arrow `Duration` into the requested `Dates.Period` subtype via `convert`.
function ArrowTypes.fromarrow(::Type{P}, x::Duration{U}) where {P<:Dates.Period,U}
    return convert(P, x)
end

# nested types; call juliaeltype recursively on nested children
# List-like types (List, LargeList, ListView, LargeListView): the Julia element type is a
# `Vector` of whatever element type the single child field resolves to.
function juliaeltype(
    f::Meta.Field,
    list::Union{Meta.List,Meta.LargeList,Meta.ListView,Meta.LargeListView},
    convert,
)
    child = f.children[1]
    return Vector{juliaeltype(child, buildmetadata(child), convert)}
end

Expand Down
7 changes: 6 additions & 1 deletion src/metadata/Message.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ struct RecordBatch <: FlatBuffers.Table
pos::Base.Int
end

# Property names exposed for a `RecordBatch` table, including `variadicBufferCounts`.
function Base.propertynames(x::RecordBatch)
    return (:length, :nodes, :buffers, :compression, :variadicBufferCounts)
end

function Base.getproperty(x::RecordBatch, field::Symbol)
if field === :length
Expand All @@ -97,6 +97,11 @@ function Base.getproperty(x::RecordBatch, field::Symbol)
y = FlatBuffers.indirect(x, o + FlatBuffers.pos(x))
return FlatBuffers.init(BodyCompression, FlatBuffers.bytes(x), y)
end
elseif field === :variadicBufferCounts
o = FlatBuffers.offset(x, 12)
if o != 0
return FlatBuffers.Array{Int32}(x, o)
end
end
return nothing
end
Expand Down
90 changes: 90 additions & 0 deletions src/metadata/Schema.jl
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,91 @@ durationAddUnit(b::FlatBuffers.Builder, unit::TimeUnit.T) =
FlatBuffers.prependslot!(b, 0, unit, 1)
# Finish the `Duration` table being built on `b`.
function durationEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Contains two child arrays, run_ends and values.
# /// The run_ends child array must be a 16/32/64-bit integer array
# /// which encodes the indices at which the run with the value in
# /// each corresponding index in the values child array ends.
# /// Like list/struct types, the value array can be of any type.
# table RunEndEncoded {
# }
# FlatBuffers table handle for the field-less `RunEndEncoded` type.
struct RunEndEncoded <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The table declares no fields, so no property names are reported.
function Base.propertynames(x::RunEndEncoded)
    return ()
end

# Begin a `RunEndEncoded` table on builder `b`.
function runEndEncodedStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

# Finish the `RunEndEncoded` table being built on `b`.
function runEndEncodedEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Logically the same as Binary, but the internal representation uses a view
# /// struct that contains the string length and either the string's entire data
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
# /// and an offset pointing to a slice in that buffer (for non-small strings).
# ///
# /// Since it uses a variable number of data buffers, each Field with this type
# /// must have a corresponding entry in `variadicBufferCounts`.
# table BinaryView {
# }
# FlatBuffers table handle for the field-less `BinaryView` type.
struct BinaryView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The table declares no fields, so no property names are reported.
function Base.propertynames(x::BinaryView)
    return ()
end

# Begin a `BinaryView` table on builder `b`.
function binaryViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

# Finish the `BinaryView` table being built on `b`.
function binaryViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Logically the same as Utf8, but the internal representation uses a view
# /// struct that contains the string length and either the string's entire data
# /// inline (for small strings) or an inlined prefix, an index of another buffer,
# /// and an offset pointing to a slice in that buffer (for non-small strings).
# ///
# /// Since it uses a variable number of data buffers, each Field with this type
# /// must have a corresponding entry in `variadicBufferCounts`.
# table Utf8View {
# }
# FlatBuffers table handle for the field-less `Utf8View` type.
struct Utf8View <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The table declares no fields, so no property names are reported.
function Base.propertynames(x::Utf8View)
    return ()
end

# Begin a `Utf8View` table on builder `b`.
function utf8ViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

# Finish the `Utf8View` table being built on `b`.
function utf8ViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Represents the same logical types that List can, but contains offsets and
# /// sizes allowing for writes in any order and sharing of child values among
# /// list values.
# table ListView {
# }
# FlatBuffers table handle for the field-less `ListView` type.
struct ListView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The table declares no fields, so no property names are reported.
function Base.propertynames(x::ListView)
    return ()
end

# Begin a `ListView` table on builder `b`.
function listViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

# Finish the `ListView` table being built on `b`.
function listViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

# /// Represents the same logical types that LargeList can, but contains offsets
# /// and sizes allowing for writes in any order and sharing of child values among
# /// list values.
# table LargeListView {
# }
# FlatBuffers table handle for the field-less `LargeListView` type.
struct LargeListView <: FlatBuffers.Table
    bytes::Vector{UInt8}
    pos::Base.Int
end

# The table declares no fields, so no property names are reported.
function Base.propertynames(x::LargeListView)
    return ()
end

# Begin a `LargeListView` table on builder `b`.
function largeListViewStart(b::FlatBuffers.Builder)
    return FlatBuffers.startobject!(b, 0)
end

# Finish the `LargeListView` table being built on `b`.
function largeListViewEnd(b::FlatBuffers.Builder)
    return FlatBuffers.endobject!(b)
end

function Type(b::UInt8)
b == 1 && return Null
b == 2 && return Int
Expand All @@ -423,6 +508,11 @@ function Type(b::UInt8)
b == 19 && return LargeBinary
b == 20 && return LargeUtf8
b == 21 && return LargeList
b == 22 && return RunEndEncoded
b == 23 && return BinaryView
b == 24 && return Utf8View
b == 25 && return ListView
b == 26 && return LargeListView
return nothing
end

Expand Down
Loading
Loading