Skip to content

Commit

Permalink
Merge pull request #73 from testercwt/master
Browse files Browse the repository at this point in the history
add support for east asia characters
  • Loading branch information
tk3369 authored Mar 19, 2021
2 parents 5c35967 + 85619f6 commit d8401c2
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 2 deletions.
7 changes: 5 additions & 2 deletions src/constants.jl
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,15 @@ const encoding_names = Dict(
65 => "WINDOWS-1255",
66 => "WINDOWS-1256",
67 => "WINDOWS-1257",
118 => "CP950",
119 => "EUC-TW",
123 => "BIG-5",
123 => "BIG5-HKSCS",
125 => "GB18030",
126 => "CP936",
134 => "EUC-JP",
138 => "CP932",
140 => "EUC-KR"
140 => "EUC-KR",
141 => "CP949"
)

const index_rowSizeIndex = 0
Expand Down
Binary file added test/data_big5/cp950.sas7bdat
Binary file not shown.
Binary file added test/data_big5/testbig5.sas7bdat
Binary file not shown.
17 changes: 17 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ using Statistics: mean
using SharedArrays: SharedArray
using Tables
import IteratorInterfaceExtensions, TableTraits
using StringEncodings: encode,decode

function getpath(dir, file)
path = joinpath(dir, file)
Expand Down Expand Up @@ -331,6 +332,22 @@ Base.convert(::Type{YearStr}, v::Float64) = YearStr(string(round(Int, v)))
# @test result[:file_encoding] == "US-ASCII"
@test rs[:Column42][3] == "dog"
end

@testset "taiwan encodings" begin
    # CP950 is the most prevalent encoding for traditional Han characters and is
    # compatible with Big5-2003. The euro sign is the CP950 character at 0xA3E1.
    # (An empty string literal here would encode to zero bytes and never match.)
    @test encode("€", "CP950") == [0xa3, 0xe1]

    # BIG5-HKSCS (2008 revision). StringEncodings is based on iconv, and old iconv
    # versions had problems with East Asian characters, especially HKSCS ones.
    # BIG5-HKSCS is compatible with Big5-2003 but not fully compatible with CP950.
    # NOTE(review): U+9FCB is expected to be the HKSCS-2008 character at 0x87DF;
    # confirm against the iconv BIG5-HKSCS mapping table.
    @test encode("\u9fcb", "big5-hkscs") == [0x87, 0xdf]

    # File read with no encoding override; the text is expected to come back
    # already decoded from CP950.
    cp950_rs = readfile("data_big5", "cp950.sas7bdat")
    @test cp950_rs[1, 1] == "我愛你"

    # Per the original author's note this file is tagged wlatin1 (written on a
    # WinXP ANSI system) although its bytes are Big5/CP950, so the encoding is
    # overridden explicitly when reading.
    override_rs = readfile("data_big5", "testbig5.sas7bdat", encoding = "cp950")
    @test override_rs[1, 1] == "我愛你"

    # Same file without the override: the text comes back decoded as CP1252, so
    # round-trip it through CP1252 bytes and decode those as CP950.
    latin_rs = readfile("data_big5", "testbig5.sas7bdat")
    @test decode(encode(latin_rs[1, 1], "cp1252"), "cp950") == "我愛你"
end

@testset "handler object" begin
handler = openfile("data_reikoch", "binary.sas7bdat")
Expand Down

0 comments on commit d8401c2

Please sign in to comment.