Skip to content

Commit

Permalink
Merge pull request #12 from Arkoniak/dev
Browse files Browse the repository at this point in the history
Compress support
  • Loading branch information
Arkoniak authored Apr 21, 2020
2 parents 81fdf40 + 4aa3b91 commit 681bfc2
Show file tree
Hide file tree
Showing 4 changed files with 241 additions and 26 deletions.
14 changes: 11 additions & 3 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "UrlDownload"
uuid = "856ac37a-3032-4c1c-9122-f86d88358c8b"
authors = ["Andrey Oskin"]
version = "0.1.1"
version = "0.1.2"

[deps]
HTTP = "cd3eb016-35fb-5094-929b-558a96fad6f3"
Expand All @@ -14,11 +14,19 @@ julia = "1.4"

[extras]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Feather = "becb17da-46f6-5d3c-ad1b-1c5fe96bc73c"
ImageMagick = "6218d12a-5da1-5696-b52f-db25d2ecc6d1"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
CodecBzip2 = "523fee87-0ab8-5b00-afb7-3ecf72e48cfd"
CodecLz4 = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
CodecZlib = "944b1d66-785c-5afd-91f1-9de20f533193"
CodecZstd = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea"

[targets]
test = ["Test", "ImageMagick", "Feather", "CSV", "JSON3", "DataFrames"]
test = ["Test", "ImageMagick", "Feather", "CSV", "JSON3", "DataFrames",
"CodecBzip2", "CodecLz4", "CodecXz", "CodecZlib", "CodecZstd",
"ZipFile"]
61 changes: 61 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,17 @@ res = urldownload(url, parser = x -> DataFrame(CSV.File(IOBuffer(x))))
# │ 2 │ 3 │ 4 │
```

Alternatively one can use `parser = identity` and process data outside of the function
```julia
using UrlDownload
using DataFrames
using CSV

url = "https://raw.githubusercontent.com/Arkoniak/UrlDownload.jl/master/data/ext.csv"
res = urldownload(url, parser = identity) |>
x -> DataFrame(CSV.File(IOBuffer(x)))
```

If keyword arguments are used in a custom parser, they will accept values from
the keyword arguments of the `urldownload` function

Expand All @@ -170,6 +181,56 @@ res = urldownload(url, parser = wrapper, delim = ';')
# │ 2 │ 3 │ 4 │
```

## Compressed files

`UrlDownload.jl` can process compressed data using autodetection. Currently the following formats are supported:
`:xz, :gzip, :bzip2, :lz4, :zstd, :zip`.
```julia
using UrlDownload
using DataFrames

url = "https://raw.githubusercontent.com/Arkoniak/UrlDownload.jl/master/data/test.gz"
res = urldownload(url) |> DataFrame
# 2×2 DataFrame
# │ Row │ x │ y │
# │ │ Int64 │ Int64 │
# ├─────┼───────┼───────┤
# │ 1 │ 1 │ 2 │
# │ 2 │ 3 │ 4 │
```

To override compression type one can use either one of formats `:xz, :gzip, :bzip2, :lz4, :zstd, :zip`
in the argument `compress`, or specify `:none`. In the second case, if a custom parser is used, it should
decompress the data itself
```julia
using UrlDownload
using DataFrames
using CodecXz
using CSV

url = "https://raw.githubusercontent.com/Arkoniak/UrlDownload.jl/master/data/test.gz"
res = urldownload(url, compress = :xz) |> DataFrame

res = urldownload(url, compress = :none, parser = x -> CSV.read(XzDecompressorStream(IOBuffer(x))))
```

For all compress types except `:zip` `urldownload` automatically applies `CSV.File`
transformation. If any other kind of data is stored in an archive, it should be processed
with custom parser.

`:zip` compressed data is processed one by one with usual rules of the auto-detection applied.
If the zip archive contains only a single file, then it will be decompressed as a single object; otherwise
only the first file is unpacked. This behavior can be overridden with `multifiles = true`, in which
case `urldownload` returns a `Vector` of processed objects.

```julia
using UrlDownload
url = "https://raw.githubusercontent.com/Arkoniak/UrlDownload.jl/master/data/test2.zip"
res = urldownload(url, multifiles = true)

length(res) # 2
```

## Undetected file types
Sometimes the file type can't be detected from the url; in this case one can supply the optional
`format` argument to force the necessary behavior
Expand Down
140 changes: 117 additions & 23 deletions src/UrlDownload.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,31 +18,39 @@ const ext2sym = Dict(
)

# Dispatch table from a detected data-format symbol to the loader that parses
# it. Every loader takes the (possibly buffer-wrapped) downloaded data and
# forwards user keyword arguments. NOTE: the scrape/diff residue previously
# left stale two-argument closures here (with a missing comma); this is the
# current single-argument form.
const sym2func = Dict(
    :FEATHER => (x; kw...) -> load_feather(x; kw...),
    :PIC => (x; kw...) -> load_pic(x; kw...),
    :CSV => (x; kw...) -> load_csv(x; kw...),
    :TSV => (x; kw...) -> load_csv(x; kw...),
    :JSON => (x; kw...) -> load_json(x; kw...)
)

function load_feather(buf, data; kw...)
# Maps a supported compression format to the codec package that must be
# imported lazily (`lib`) and the decompressor stream type it provides
# (`stream`). Consulted by `urldownload` when `compress` is one of these
# symbols; `:zip` and `:none` are handled separately.
const Compressor = Dict(
:gzip => (lib = :CodecZlib, stream = :GzipDecompressorStream),
:zstd => (lib = :CodecZstd, stream = :ZstdDecompressorStream),
:xz => (lib = :CodecXz, stream = :XzDecompressorStream),
:lz4 => (lib = :CodecLz4, stream = :LZ4FrameDecompressorStream),
:bzip2 => (lib = :CodecBzip2, stream = :Bzip2DecompressorStream)
)

# Read a Feather table from `buf` with the lazily imported Feather package.
# `invokelatest` is required because the package is loaded at runtime.
function load_feather(buf; kw...)
    feather = checked_import(:Feather)
    return Base.invokelatest(feather.read, buf)
end

# Parse CSV/TSV data from `buf` with the lazily imported CSV package;
# keyword arguments are forwarded to `CSV.File`. The stale two-argument
# signature left over from the diff residue is removed.
function load_csv(buf; kw...)
    lib = checked_import(:CSV)
    return Base.invokelatest(lib.File, buf; kw...)
end

# Decode an image from the raw bytes with the lazily imported ImageMagick
# package. The diff residue previously left a stale `return ... data` line
# referencing an undefined variable; only the `_getdata(buf)` form is valid.
function load_pic(buf; kw...)
    lib = checked_import(:ImageMagick)
    return Base.invokelatest(lib.load_, _getdata(buf))
end

# Parse JSON data with the lazily imported JSON3 package. The diff residue
# previously left a stale `return ... data` line referencing an undefined
# variable; only the `_getdata(buf)` form is valid.
function load_json(buf; kw...)
    lib = checked_import(:JSON3)
    return Base.invokelatest(lib.read, _getdata(buf))
end

# Borrowed directly from FileIO
Expand Down Expand Up @@ -80,21 +88,68 @@ function datatype(url)
for (k, v) in ext2sym
occursin(k, ext) && return v
end
end

# For now (later magic should be reintroduced)
error("$ext is unsupported.")
# Wrap raw downloaded bytes in an `IOBuffer` so parsers can read them as a
# stream; any non-byte payload (e.g. an already-open zip-entry stream) is
# passed through unchanged so it is not double-wrapped.
function createbuffer(bytes::Vector{UInt8})
    return IOBuffer(bytes)
end
createbuffer(other) = other

# Recover the underlying byte vector from an `IOBuffer`; anything else is
# already the data and is returned as-is.
function _getdata(io::IOBuffer)
    return io.data
end
_getdata(other) = other

"""
    wrapdata(url, data, format, parser, error_on_undetected_format = true; kw...)

Dispatch raw downloaded `data` to the proper parser. A non-`nothing` custom
`parser` overrides everything and receives the raw data. Otherwise the data
format is taken from `format` or auto-detected from the `url` extension and
looked up in `sym2func`. On an unsupported format either an error is logged
(`error_on_undetected_format = true`, returns `nothing`) or a warning is
logged and the raw `data` is returned.
"""
function wrapdata(url, data, format, parser, error_on_undetected_format = true; kw...)
    if isnothing(parser)
        buf = createbuffer(data)
        # Explicit `format` takes priority over url-based auto-detection.
        dtype = isnothing(format) ? datatype(url) : format

        if haskey(sym2func, dtype)
            return sym2func[dtype](buf; kw...)
        else
            if error_on_undetected_format
                # NOTE: `@error` only logs; the function then returns `nothing`.
                @error "Data format $dtype is not supported."
            else
                @warn "Data format $dtype is not supported."
                return data
            end
        end
    else
        # Custom parser overrides all internal processing.
        return parser(data; kw...)
    end
end

function wrapdata(url, data, format; kw...)
buf = IOBuffer(data)
dtype = format == nothing ? datatype(url) : format
# Check the compression format of downloaded data by its magic bytes.
# Taken from TableReader.jl. Falls back to the url extension for zip and
# returns `:none` when nothing matches. The diff residue previously left two
# stale lines of the old `wrapdata` body inside this function; they are removed.
function checkformat(data, url)
    if length(data) >= 6
        magic = data[1:6]
    else
        error("Incomplete data received")
    end
    if magic[1:6] == b"\xFD\x37\x7A\x58\x5A\x00"
        return :xz
    elseif magic[1:2] == b"\x1f\x8b"
        return :gzip
    elseif magic[1:4] == b"\x28\xb5\x2f\xfd"
        return :zstd
    elseif magic[1:3] == b"\x42\x5A\x68"
        return :bzip2
    elseif magic[1:4] == b"\x04\x22\x4D\x18"
        return :lz4
    end

    # It's too ambiguous to detect zip with magic bytes, so fall back to
    # the file extension.
    _, ext = splitext(url)
    if (ext == ".zip") | (ext == ".z")
        return :zip
    end

    # We are giving up: assume uncompressed data.
    return :none
end

"""
urldownload(url, progress = false; parser = nothing, format = nothing, headers = HTTP.Header[], httpkw = Pair[], update_period = 1, kw...)
urldownload(url, progress = false; parser = nothing, format = nothing, compress = :auto, multifiles = false, headers = HTTP.Header[], httpkw = Pair[], update_period = 1, kw...)
Download file from the corresponding url in memory and process it to the necessary data structure.
Expand All @@ -105,13 +160,22 @@ Download file from the corresponding url in memory and process it to the necessa
keyword arguments and return necessary data structure. If parser is set than it overrides all other settings, such as `format`.
If parser is not set, than internal parsers are used for data process.
* `format`: one of the fixed formats (:CSV, :PIC, :FEATHER, :JSON), if set overrides autodetection mechanism.
* `compress`: :auto by default, can be one of :none, :xz, :gzip, :bzip2, :lz4, :zstd, :zip. Determines whether file is compressed
and compression type. Decompressed data is processed either by custom `parser` or by internal parser. By default
for any compression type except `:zip` the internal parser is `CSV.File`; for `:zip` the usual rules apply. If
`compress` is `:none` then the custom parser should decompress the data on its own.
* `multifiles`: `false` by default, for `:zip` compressed data defines, whether process only
first file inside archive or return an array of decompressed and processed objects.
* `headers`: `HTTP.jl` arguments that set http header of the request.
* `httpkw`: `HTTP.jl` additional keyword arguments that is passed to the `GET` function. Should be supplied as a vector of
pairs.
* `update_period`: period of `ProgressMeter` update, by default 1 sec
* `kw...`: any keyword arguments that should be passed to the data parser.
"""
function urldownload(url, progress = false; parser = nothing, format = nothing, headers = HTTP.Header[],
function urldownload(url, progress = false;
parser = nothing, format = nothing,
compress = :auto, multifiles = false,
headers = HTTP.Header[],
update_period = 1, httpkw = Pair[], kw...)
body = UInt8[]
HTTP.open("GET", url, headers; httpkw...) do stream
Expand All @@ -130,10 +194,40 @@ function urldownload(url, progress = false; parser = nothing, format = nothing,
end
end

if parser == nothing
return wrapdata(url, body, format; kw...)
compress = compress == :auto ? checkformat(body, url) : compress

if compress == :none
# skip unzipping entirely, it's the parser's responsibility to process the data
wrapdata(url, body, format, parser; kw...)
elseif compress == :zip
zlib = checked_import(:ZipFile)
zread = Base.invokelatest(getfield(zlib, :Reader), IOBuffer(body)).files
if multifiles
return [wrapdata(z.name, z, format, parser, false; kw...) for z in zread]
else
if length(zread) > 1
@warn "More than one file in zip archive, returning first."
elseif length(zread) == 0
@error "Zip archive is empty."
end
zread = zread[1]
# This one can easily fail for non csv files, nothing I can do about it
wrapdata(zread.name, zread, format, parser; kw...)
end
elseif compress in keys(Compressor)
# it's one of the TranscodingStreams.jl streams, not much to do here,
# defaults to CSV/custom parser
if isnothing(parser)
lib = checked_import(Compressor[compress].lib)
stream = getfield(lib, Compressor[compress].stream)
csvlib = checked_import(:CSV)
csvlibfile = getfield(csvlib, :File)
return Base.invokelatest(csvlibfile, Base.invokelatest(stream, IOBuffer(body)); kw...)
else
return parser(body; kw...)
end
else
return parser(body; kw...)
error("Unknown compress format: $compress")
end
end

Expand Down
52 changes: 52 additions & 0 deletions test/test03_compress.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
module TestCompress
using Test
using UrlDownload

# Base path of the compressed test fixtures in the repository.
const DATA = "https://raw.githubusercontent.com/Arkoniak/UrlDownload.jl/master/data/"

@testset "compress" begin
    # Each fixture is downloaded twice: once with magic-byte auto-detection
    # and once with an explicit `compress` override.
    for (file, fmt) in (("test.xz", :xz), ("test.gz", :gzip), ("test.bz2", :bzip2),
                        ("test.lz4", :lz4), ("test.zst", :zstd))
        url = DATA * file
        urldownload(url)
        urldownload(url, compress = fmt)
    end
end

@testset "zip compress" begin
    # Single-file archive: decompressed as one object.
    urldownload(DATA * "ext.csv.zip")

    url = DATA * "test2.zip"
    # Multi-file archive without `multifiles` warns and takes the first entry.
    @test_logs (:warn, "More than one file in zip archive, returning first.") urldownload(url)
    # With `multifiles = true` each entry goes through format auto-detection,
    # which fails for the extensionless second entry.
    @test_logs (:warn, "Data format nothing is not supported.") urldownload(url, multifiles = true)
end

@testset "compress overrides" begin
    url = DATA * "test.zst"
    urldownload(url, parser = identity)
    urldownload(url, compress = :none, parser = identity)

    url = DATA * "test2.zip"
    @test_logs (:warn, "More than one file in zip archive, returning first.") urldownload(url, parser = identity)
    urldownload(url, compress = :none, parser = identity)
end

end # module

0 comments on commit 681bfc2

Please sign in to comment.