-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathblast_seq_df.jl
46 lines (38 loc) · 1.25 KB
/
blast_seq_df.jl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
using FASTX # read fasta
using CSV
using DataFrames
using BioSequences # reverse_complement;
function fasta_read(infasta)
reader = open(FASTA.Reader, infasta)
#ids = String[]; seqs = String[]
df = DataFrame(id = String[], seq = String[])
for record in reader
sequence = FASTA.sequence(record)
id = FASTA.identifier(record)
#push!(ids, id)
#push!(seqs, sequence)
push!(df, [id, sequence])
end
close(reader)
return df
end
function gene_read(ingene)
return DataFrame(CSV.File(ingene, header=false))
end
function fasta_output(infasta_file, ingene_file)
fasta_df = fasta_read(infasta_file)
gene_df = gene_read(ingene_file)
for n in 1:nrow(gene_df)
geneid = "copy$n"
start = gene_df[n, 9]
stop = gene_df[n, 10]
strand = (start < stop ) ? "+" : "-"
start > stop && ((start, stop) = (stop, start))
seqdf = filter(:id => x -> x == gene_df[n, 2], fasta_df)
(seq, seqid) = (seqdf.seq[1], seqdf.id[1])
subseq = LongDNA{4}(SubString(seq, start, stop))
strand == "-" && (reverse_complement!(subseq))
println(">$geneid\t$seqid|$start..$stop|$strand\t$(length(subseq))\n$subseq")
end
end
fasta_output(ARGS[1], ARGS[2])