-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembl_taxonomy.jl
48 lines (44 loc) · 1.31 KB
/
embl_taxonomy.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
using Serialization
"""
    taxonomyfromembl(infile::AbstractString)

Scan the text file `infile` line by line and collect two things:

- the first `"; "`-separated field of the first line that starts with `"AC"`
  (returned as the accession; `""` if no such line exists), and
- every `"; "`-separated field of every line that starts with `"OC"`, in file order.

For both line kinds the first five characters (the line code and its padding) and the
final character (the trailing `;` or `.`) are dropped before splitting.

# Returns
A tuple `(accession::String, taxonomy::Vector{String})`.

# Throws
Whatever `open` throws when `infile` cannot be opened for reading.
"""
function taxonomyfromembl(infile::AbstractString)
    # All state lives inside the do-block so nothing is captured-and-reassigned
    # by the closure; `open` closes the file even if parsing throws.
    return open(infile, "r") do emblfile
        accession = ""
        taxonomy = String[]
        for line in eachline(emblfile)
            if startswith(line, "AC")
                # Only the first AC line contributes; later ones are ignored.
                if isempty(accession)
                    # `chop` counts characters rather than bytes, so a multi-byte
                    # character next to the terminator cannot raise StringIndexError
                    # the way `line[6:end - 1]` does.
                    payload = chop(line; head=5, tail=1)
                    accession = String(first(split(payload, "; ")))
                end
            elseif startswith(line, "OC")
                payload = chop(line; head=5, tail=1)
                # keepempty=false: a line with no payload must not add "" as a taxon.
                append!(taxonomy, split(payload, "; "; keepempty=false))
            end
        end
        return accession, taxonomy
    end
end
"""
    createtaxonomy(embldir::String, outfile::String)

Build an accession => taxonomy dictionary and write it to `outfile` with `serialize`.

If `embldir` is a directory, every entry in it whose name ends in `.embl` is parsed with
`taxonomyfromembl`; otherwise `embldir` itself is treated as the single file to parse.
When two files yield the same accession, the one processed later replaces the earlier.

# Returns
The `Dict{String, Vector{String}}` that was serialized.
"""
function createtaxonomy(embldir::String, outfile::String)
    emblpaths = isdir(embldir) ?
        filter(endswith(".embl"), readdir(embldir, join=true)) :
        String[embldir]
    # Each parse yields an (accession, taxonomy) tuple, inserted in path order.
    taxonomies = Dict{String, Vector{String}}(
        taxonomyfromembl(path) for path in emblpaths
    )
    serialize(outfile, taxonomies)
    return taxonomies
end
"""
    readtaxonomy(taxfile::String)

Load and return whatever object was stored in `taxfile` via `deserialize`.

NOTE(review): presumably this is a file written by `createtaxonomy`; the result type is
not checked here — confirm callers only pass such files.
"""
function readtaxonomy(taxfile::String)
    return deserialize(taxfile)
end
"""
    findtaxon(taxonomies::Dict{String, Vector{String}}, taxon::String)

Return the keys of `taxonomies` whose value vector contains `taxon` (exact string match).

The result is a `Vector{String}` in the dictionary's iteration order; it is empty when
nothing matches.
"""
function findtaxon(taxonomies::Dict{String, Vector{String}}, taxon::String)
    return String[key for (key, lineage) in taxonomies if taxon in lineage]
end