diff --git a/docs/make.jl b/docs/make.jl index 10af5d0..29c1238 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -3,7 +3,7 @@ using Documenter, FeatherLib makedocs( modules = [FeatherLib], sitename = "FeatherLib.jl", - analytics="UA-132838790-1", + analytics = "UA-132838790-1", pages = [ "Introduction" => "index.md" ] diff --git a/src/loadfile.jl b/src/loadfile.jl index 4192517..952ec31 100644 --- a/src/loadfile.jl +++ b/src/loadfile.jl @@ -4,16 +4,16 @@ getoutputlength(version::Int32, x::Integer) = version < FEATHER_VERSION ? x : pa function validatefile(filename::AbstractString, data::AbstractVector{UInt8}) if length(data) < MIN_FILE_LENGTH throw(ArgumentError("'$file' is not in feather format: total length of file: $(length(data))")) - end + end header = data[1:4] - footer = data[(end-3):end] + footer = data[(end - 3):end] if header ≠ FEATHER_MAGIC_BYTES || footer ≠ FEATHER_MAGIC_BYTES throw(ArgumentError(string("'$filename' is not in feather format: header = $header, ", "footer = $footer."))) end end -function loadfile(filename::AbstractString; use_mmap::Bool=true) +function loadfile(filename::AbstractString; use_mmap::Bool = true) isfile(filename) || throw(ArgumentError("'$filename' is not a valid file.")) data = use_mmap ? Mmap.mmap(filename) : read(filename) validatefile(filename, data) @@ -21,15 +21,15 @@ function loadfile(filename::AbstractString; use_mmap::Bool=true) end function metalength(data::AbstractVector{UInt8}) - read(IOBuffer(data[(length(data)-7):(length(data)-4)]), Int32) + read(IOBuffer(data[(length(data) - 7):(length(data) - 4)]), Int32) end -function metaposition(data::AbstractVector{UInt8}, metalen::Integer=metalength(data)) - length(data) - (metalen+7) +function metaposition(data::AbstractVector{UInt8}, metalen::Integer = metalength(data)) + length(data) - (metalen + 7) end -function rootposition(data::AbstractVector{UInt8}, mpos::Integer=metaposition(data)) - read(IOBuffer(data[mpos:(mpos+4)]), Int32) +function rootposition(data::AbstractVector{UInt8}, mpos::Integer = metaposition(data)) + read(IOBuffer(data[mpos:(mpos + 4)]), Int32) end function getctable(data::AbstractVector{UInt8}) diff --git a/src/metadata.jl b/src/metadata.jl index c244a61..b05e7c9 100644 --- a/src/metadata.jl +++ b/src/metadata.jl @@ -29,7 +29,7 @@ mutable struct CategoryMetadata ordered::Bool end -@DEFAULT CategoryMetadata ordered=false +@DEFAULT CategoryMetadata ordered = false mutable struct TimestampMetadata unit::TimeUnit @@ -43,7 +43,7 @@ mutable struct TimeMetadata unit::TimeUnit end -@UNION TypeMetadata (Nothing,CategoryMetadata,TimestampMetadata,DateMetadata,TimeMetadata) +@UNION TypeMetadata (Nothing, CategoryMetadata, TimestampMetadata, DateMetadata, TimeMetadata) mutable struct Column name::String @@ -53,8 +53,8 @@ mutable struct Column user_metadata::String end -function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata=nothing, - user_metadata::String="") +function Column(name::String, values::PrimitiveArray, metadata::TypeMetadata = nothing, + user_metadata::String = "") Column(name, values, FlatBuffers.typeorder(TypeMetadata, typeof(metadata)), metadata, user_metadata) end @@ -119,7 +119,7 @@ const JULIA_TIME_DICT = Dict{Metadata.TimeUnit,DataType}( Metadata.MICROSECOND => Dates.Microsecond, Metadata.NANOSECOND => Dates.Nanosecond ) -const METADATA_TIME_DICT = Dict{DataType,Metadata.TimeUnit}(v=>k for (k,v) in JULIA_TIME_DICT) +const METADATA_TIME_DICT = Dict{DataType,Metadata.TimeUnit}(v => k for (k, v) in JULIA_TIME_DICT) isprimitivetype(t::Metadata.DType) = t ∉ NON_PRIMITIVE_TYPES @@ -171,4 +171,4 @@ function getmetadata(io::IO, ::Type{T}, A::DictEncoding) where T Metadata.CategoryMetadata(vals, true) end -getmetadata(io::IO, ::Type{Union{Missing, T}}, A::DictEncoding) where T = getmetadata(io, T, A) +getmetadata(io::IO, ::Type{Union{Missing,T}}, A::DictEncoding) where T = getmetadata(io, T, A) diff --git a/src/read.jl b/src/read.jl index f173321..e9e5c67 100644 --- a/src/read.jl +++ b/src/read.jl @@ -5,8 +5,8 @@ struct ResultSet metadata::String end -function featherread(filename::AbstractString; use_mmap=true) - data = loadfile(filename, use_mmap=use_mmap) +function featherread(filename::AbstractString; use_mmap = true) + data = loadfile(filename, use_mmap = use_mmap) ctable = getctable(data) ncols = length(ctable.columns) colnames = [Symbol(col.name) for col in ctable.columns] @@ -15,12 +15,12 @@ function featherread(filename::AbstractString; use_mmap=true) return ResultSet(columns, colnames, ctable.description, ctable.metadata) end -#===================================================================================================== +#= ==================================================================================================== new column construction stuff -=====================================================================================================# +==================================================================================================== =# Base.length(p::Metadata.PrimitiveArray) = p.length -startloc(p::Metadata.PrimitiveArray) = p.offset+1 +startloc(p::Metadata.PrimitiveArray) = p.offset + 1 Arrow.nullcount(p::Metadata.PrimitiveArray) = p.null_count @@ -29,7 +29,7 @@ function bitmasklength(p::Metadata.PrimitiveArray) end function offsetslength(p::Metadata.PrimitiveArray) - isprimitivetype(p.dtype) ? 0 : padding((length(p)+1)*sizeof(Int32)) + isprimitivetype(p.dtype) ? 0 : padding((length(p) + 1) * sizeof(Int32)) end valueslength(p::Metadata.PrimitiveArray) = p.total_bytes - offsetslength(p) - bitmasklength(p) diff --git a/src/write.jl b/src/write.jl index 80c0b33..fef06d5 100644 --- a/src/write.jl +++ b/src/write.jl @@ -1,8 +1,8 @@ -function featherwrite(filename::AbstractString, columns, colnames; description::AbstractString="", metadata::AbstractString="") +function featherwrite(filename::AbstractString, columns, colnames; description::AbstractString = "", metadata::AbstractString = "") ncol = length(columns) nrows = length(columns[1]) cols = ArrowVector[arrowformat(_first_col_convert_pass(col)) for col in columns] - + open(filename, "w+") do io writepadded(io, FEATHER_MAGIC_BYTES) colmetadata = Metadata.Column[writecolumn(io, string(colnames[i]), cols[i]) for i in 1:ncol] @@ -43,7 +43,7 @@ function writecontents(::Type{Metadata.PrimitiveArray}, io::IO, A::ArrowVector) a = position(io) writecontents(io, A) b = position(io) - Metadata.PrimitiveArray(A, a, b-a) + Metadata.PrimitiveArray(A, a, b - a) end @@ -55,7 +55,7 @@ end function writemetadata(io::IO, ctable::Metadata.CTable) meta = FlatBuffers.build!(ctable) - rng = (meta.head+1):length(meta.bytes) + rng = (meta.head + 1):length(meta.bytes) writepadded(io, view(meta.bytes, rng)) Int32(length(rng)) end diff --git a/test/runtests.jl b/test/runtests.jl index b4a81c3..0731974 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -5,21 +5,21 @@ temps = [] @testset "FeatherLib" begin -include("test_readwrite.jl") -include("test_arrow.jl") + include("test_readwrite.jl") + include("test_arrow.jl") -GC.gc(); GC.gc() -for t in temps - try - rm(t) - catch - GC.gc() + GC.gc(); GC.gc() + for t in temps try rm(t) catch + GC.gc() + try + rm(t) + catch + end end end -end # issue #34 # data = DataFrame(A=Union{Missing, String}[randstring(10) for i ∈ 1:100], B=rand(100)) diff --git a/test/test_arrow.jl b/test/test_arrow.jl index f3b4663..a791cfe 100644 --- a/test/test_arrow.jl +++ b/test/test_arrow.jl @@ -13,9 +13,9 @@ randdate() = Date(rand(0:4000), rand(1:12), rand(1:27)) randtime() = Dates.Time(rand(0:23), rand(0:59), rand(0:59)) randdatetime() = randdate() + randtime() -randstrings() = String[[randstring(rand(0:20)) for i ∈ 1:(NROWS-1)]; "a"] +randstrings() = String[[randstring(rand(0:20)) for i ∈ 1:(NROWS - 1)]; "a"] function randstrings(::Missing) - Union{String,Missing}[[rand(Bool) ? missing : randstring(rand(0:20)) for i ∈ 1:(NROWS-1)]; "a"] + Union{String,Missing}[[rand(Bool) ? missing : randstring(rand(0:20)) for i ∈ 1:(NROWS - 1)]; "a"] end convstring(str::AbstractString) = String(str) @@ -23,8 +23,8 @@ convstring(::Missing) = missing @testset "ArrowTests" begin -cols = [rand(Int32,NROWS), - rand(Float64,NROWS), + cols = [rand(Int32, NROWS), + rand(Float64, NROWS), Date[randdate() for i ∈ 1:NROWS], DateTime[randdatetime() for i ∈ 1:NROWS], Dates.Time[randtime() for i ∈ 1:NROWS], @@ -34,55 +34,53 @@ cols = [rand(Int32,NROWS), CategoricalArrays.categorical(randstrings()), CategoricalArrays.categorical(randstrings(missing))] -colnames = [:ints,:floats,:dates,:datetimes,:times,:missingints,:strings, + colnames = [:ints,:floats,:dates,:datetimes,:times,:missingints,:strings, :missingstrings,:catstrings,:catstringsmissing] -featherwrite(arrow_tempname, cols, colnames) + featherwrite(arrow_tempname, cols, colnames) -ndf = featherread(arrow_tempname) + ndf = featherread(arrow_tempname) -@test ndf.names == colnames + @test ndf.names == colnames -@test typeof(ndf.columns[1]) == Arrow.Primitive{Int32} -@test typeof(ndf.columns[2]) == Arrow.Primitive{Float64} -@test typeof(ndf.columns[3]) == Arrow.Primitive{Arrow.Datestamp} -@test typeof(ndf.columns[4]) == Arrow.Primitive{Arrow.Timestamp{Dates.Millisecond}} -@test typeof(ndf.columns[5]) == Arrow.Primitive{Arrow.TimeOfDay{Dates.Nanosecond,Int64}} -@test typeof(ndf.columns[6]) == Arrow.NullablePrimitive{Int64} -@test typeof(ndf.columns[7]) == Arrow.List{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}} -@test typeof(ndf.columns[8]) == Arrow.NullableList{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}} -@test typeof(ndf.columns[9]) == Arrow.DictEncoding{String,Arrow.Primitive{Int32}, - Arrow.List{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}}} -@test typeof(ndf.columns[10]) == - Arrow.DictEncoding{Union{String,Missing},Arrow.NullablePrimitive{Int32},Arrow.List{String,Arrow.DefaultOffset, - Arrow.Primitive{UInt8}}} + @test typeof(ndf.columns[1]) == Arrow.Primitive{Int32} + @test typeof(ndf.columns[2]) == Arrow.Primitive{Float64} + @test typeof(ndf.columns[3]) == Arrow.Primitive{Arrow.Datestamp} + @test typeof(ndf.columns[4]) == Arrow.Primitive{Arrow.Timestamp{Dates.Millisecond}} + @test typeof(ndf.columns[5]) == Arrow.Primitive{Arrow.TimeOfDay{Dates.Nanosecond,Int64}} + @test typeof(ndf.columns[6]) == Arrow.NullablePrimitive{Int64} + @test typeof(ndf.columns[7]) == Arrow.List{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}} + @test typeof(ndf.columns[8]) == Arrow.NullableList{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}} + @test typeof(ndf.columns[9]) == Arrow.DictEncoding{String,Arrow.Primitive{Int32},Arrow.List{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}}} + @test typeof(ndf.columns[10]) == + Arrow.DictEncoding{Union{String,Missing},Arrow.NullablePrimitive{Int32},Arrow.List{String,Arrow.DefaultOffset,Arrow.Primitive{UInt8}}} -for j ∈ 1:N_IDX_TESTS - i = rand(1:NROWS) - @test cols[1][i] == ndf.columns[1][i] - @test cols[2][i] == ndf.columns[2][i] - @test cols[3][i] == convert(Date, ndf.columns[3][i]) - @test cols[4][i] == convert(DateTime, ndf.columns[4][i]) - @test cols[5][i] == convert(Dates.Time, ndf.columns[5][i]) - @test isequal(cols[6][i], ndf.columns[6][i]) - @test cols[7][i] == ndf.columns[7][i] - @test isequal(cols[8][i], ndf.columns[8][i]) - @test cols[9][i] == String(ndf.columns[9][i]) - @test isequal(cols[10][i], convstring(ndf.columns[10][i])) -end -for j ∈ 1:N_IDX_TESTS - a, b = extrema(rand(1:NROWS, 2)) - i = a:b - @test cols[1][i] == ndf.columns[1][i] - @test cols[2][i] == ndf.columns[2][i] - @test cols[3][i] == convert.(Date, ndf.columns[3][i]) - @test cols[4][i] == convert.(DateTime, ndf.columns[4][i]) - @test cols[5][i] == convert.(Dates.Time, ndf.columns[5][i]) - @test isequal(cols[6][i], ndf.columns[6][i]) - @test cols[7][i] == ndf.columns[7][i] - @test isequal(cols[8][i], ndf.columns[8][i]) - @test cols[9][i] == String.(ndf.columns[9][i]) - @test isequal(cols[10][i], convstring.(ndf.columns[10][i])) -end + for j ∈ 1:N_IDX_TESTS + i = rand(1:NROWS) + @test cols[1][i] == ndf.columns[1][i] + @test cols[2][i] == ndf.columns[2][i] + @test cols[3][i] == convert(Date, ndf.columns[3][i]) + @test cols[4][i] == convert(DateTime, ndf.columns[4][i]) + @test cols[5][i] == convert(Dates.Time, ndf.columns[5][i]) + @test isequal(cols[6][i], ndf.columns[6][i]) + @test cols[7][i] == ndf.columns[7][i] + @test isequal(cols[8][i], ndf.columns[8][i]) + @test cols[9][i] == String(ndf.columns[9][i]) + @test isequal(cols[10][i], convstring(ndf.columns[10][i])) + end + for j ∈ 1:N_IDX_TESTS + a, b = extrema(rand(1:NROWS, 2)) + i = a:b + @test cols[1][i] == ndf.columns[1][i] + @test cols[2][i] == ndf.columns[2][i] + @test cols[3][i] == convert.(Date, ndf.columns[3][i]) + @test cols[4][i] == convert.(DateTime, ndf.columns[4][i]) + @test cols[5][i] == convert.(Dates.Time, ndf.columns[5][i]) + @test isequal(cols[6][i], ndf.columns[6][i]) + @test cols[7][i] == ndf.columns[7][i] + @test isequal(cols[8][i], ndf.columns[8][i]) + @test cols[9][i] == String.(ndf.columns[9][i]) + @test isequal(cols[10][i], convstring.(ndf.columns[10][i])) + end end diff --git a/test/test_readwrite.jl b/test/test_readwrite.jl index 591a8d2..d57c306 100644 --- a/test/test_readwrite.jl +++ b/test/test_readwrite.jl @@ -1,11 +1,11 @@ @testset "ReadWrite" begin testdir = joinpath(@__DIR__, "data") - files = map(x -> joinpath(testdir, x), readdir(testdir)) + files = map(x->joinpath(testdir, x), readdir(testdir)) for f in files - res = featherread(f) - columns, headers = res.columns, res.names + res = featherread(f) + columns, headers = res.columns, res.names ncols = length(columns) nrows = length(columns[1]) @@ -13,25 +13,25 @@ temp = tempname() push!(temps, temp) - featherwrite(temp, columns, headers, description=res.description, metadata=res.metadata) + featherwrite(temp, columns, headers, description = res.description, metadata = res.metadata) - res2 = featherread(temp) - columns2, headers2 = res2.columns, res2.names + res2 = featherread(temp) + columns2, headers2 = res2.columns, res2.names @test length(columns2) == ncols - @test headers==headers2 + @test headers == headers2 - for (c1,c2) in zip(columns, columns2) - @test length(c1)==nrows - @test length(c2)==nrows + for (c1, c2) in zip(columns, columns2) + @test length(c1) == nrows + @test length(c2) == nrows for i = 1:nrows @test isequal(c1[i], c2[i]) end end - @test res.description == res2.description - @test res.metadata == res2.metadata + @test res.description == res2.description + @test res.metadata == res2.metadata # for (col1,col2) in zip(source.ctable.columns,sink.ctable.columns) # @test col1.name == col2.name # @test col1.metadata_type == col2.metadata_type