Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable multithreading tests of joins only on 64-bit machines #3183

Merged
merged 2 commits into from
Sep 27, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
289 changes: 53 additions & 236 deletions test/join.jl
Original file line number Diff line number Diff line change
Expand Up @@ -1510,171 +1510,53 @@ end
@test m1[!, :a] == m2[!, :a]
end

@testset "threaded correctness" begin
try
if Sys.WORD_SIZE == 64
@testset "threaded correctness" begin
df1 = DataFrame(id=[1:10^6; 10^7+1:10^7+2])
df1.left_row = axes(df1, 1)
df2 = DataFrame(id=[1:10^6; 10^8+1:10^8+4])
df2.right_row = axes(df2, 1)

@test try
innerjoin(df1, df2, on=:id) ≅
DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping innerjoin test."
true
else
rethrow(e)
end
end

@test try
leftjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
right_row=[1:10^6; missing; missing])
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping leftjoin test."
true
else
rethrow(e)
end
end

@test try
rightjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^8+1:10^8+4],
left_row=[1:10^6; fill(missing, 4)],
right_row=1:10^6+4)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping rightjoin test."
true
else
rethrow(e)
end
end

@test try
outerjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
left_row=[1:10^6+2; fill(missing, 4)],
right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping outerjoin test."
true
else
rethrow(e)
end
end

@test try
semijoin(df1, df2, on=:id) ≅
DataFrame(id=1:10^6, left_row=1:10^6)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping semijoin test."
true
else
rethrow(e)
end
end

@test try
antijoin(df1, df2, on=:id) ≅
DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping antijoin test."
true
else
rethrow(e)
end
end
@test innerjoin(df1, df2, on=:id) ≅
DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
@test leftjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
right_row=[1:10^6; missing; missing])
@test rightjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^8+1:10^8+4],
left_row=[1:10^6; fill(missing, 4)],
right_row=1:10^6+4)
@test outerjoin(df1, df2, on=:id) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
left_row=[1:10^6+2; fill(missing, 4)],
right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
@test semijoin(df1, df2, on=:id) ≅
DataFrame(id=1:10^6, left_row=1:10^6)
@test antijoin(df1, df2, on=:id) ≅
DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)

Random.seed!(1234)
for i in 1:4
df1 = df1[shuffle(axes(df1, 1)), :]
df2 = df2[shuffle(axes(df2, 1)), :]

@test try
sort!(innerjoin(df1, df2, on=:id)) ≅
DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping innerjoin test."
true
else
rethrow(e)
end
end

@test try
sort!(leftjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
right_row=[1:10^6; missing; missing])
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping leftjoin test."
true
else
rethrow(e)
end
end

@test try
sort!(rightjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^8+1:10^8+4],
left_row=[1:10^6; fill(missing, 4)],
right_row=1:10^6+4)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping rightjoin test."
true
else
rethrow(e)
end
end

@test try
sort!(outerjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
left_row=[1:10^6+2; fill(missing, 4)],
right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping outerjoin test."
true
else
rethrow(e)
end
end

@test try
sort!(semijoin(df1, df2, on=:id)) ≅
DataFrame(id=1:10^6, left_row=1:10^6)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping semijoin test."
true
else
rethrow(e)
end
end

@test try
sort!(antijoin(df1, df2, on=:id)) ≅
DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping antijoin test."
true
else
rethrow(e)
end
end
@test sort!(innerjoin(df1, df2, on=:id)) ≅
DataFrame(id=1:10^6, left_row=1:10^6, right_row=1:10^6)
@test sort!(leftjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2], left_row=1:10^6+2,
right_row=[1:10^6; missing; missing])
@test sort!(rightjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^8+1:10^8+4],
left_row=[1:10^6; fill(missing, 4)],
right_row=1:10^6+4)
@test sort!(outerjoin(df1, df2, on=:id)) ≅
DataFrame(id=[1:10^6; 10^7+1:10^7+2; 10^8+1:10^8+4],
left_row=[1:10^6+2; fill(missing, 4)],
right_row=[1:10^6; missing; missing; 10^6+1:10^6+4])
@test sort!(semijoin(df1, df2, on=:id)) ≅
DataFrame(id=1:10^6, left_row=1:10^6)
@test sort!(antijoin(df1, df2, on=:id)) ≅
DataFrame(id=10^7+1:10^7+2, left_row=10^6+1:10^6+2)
end

# test correctness of column order
Expand All @@ -1683,88 +1565,23 @@ end
df2 = DataFrame(e=Int8(5), id1=[1:10^6; 10^8+1:10^8+4], f=Int8(6), g=Int8(7),
id2=-[1:10^6; 10^8+1:10^8+4], h=Int8(8))

@test try
innerjoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping innerjoin test."
true
else
rethrow(e)
end
end

@test try
leftjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping leftjoin test."
true
else
rethrow(e)
end
end

@test try
rightjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping rightjoin test."
true
else
rethrow(e)
end
end

@test try
outerjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping outerjoin test."
true
else
rethrow(e)
end
end

@test try
semijoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6, c=Int8(3), d=Int8(4))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping semijoin test."
true
else
rethrow(e)
end
end

@test try
antijoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(10^7+1:10^7+2), b=Int8(2), id1=(10^7+1:10^7+2),
c=Int8(3), d=Int8(4))
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping antijoin test."
true
else
rethrow(e)
end
end
catch e
if Int === Int32 && e isa OutOfMemoryError
@warn "OutOfMemoryError. Skipping antijoin test."
else
rethrow(e)
end
@test innerjoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
@test leftjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
@test rightjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
@test outerjoin(df1, df2, on=[:id1, :id2])[1:10^6, :] ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6,
c=Int8(3), d=Int8(4), e=Int8(5), f=Int8(6), g=Int8(7), h=Int8(8))
@test semijoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(1:10^6), b=Int8(2), id1=1:10^6, c=Int8(3), d=Int8(4))
@test antijoin(df1, df2, on=[:id1, :id2]) ≅
DataFrame(a=Int8(1), id2=-(10^7+1:10^7+2), b=Int8(2), id1=(10^7+1:10^7+2),
c=Int8(3), d=Int8(4))
end
end

Expand Down