-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathNCBILink2tab
executable file
·106 lines (98 loc) · 2.75 KB
/
NCBILink2tab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env perl
use Getopt::Long;
use strict;
no strict 'refs'; # because Perl sucks
## Enter as many files as you want. Each file must be either:
## 1. XML from an eutils' elinks.fcgi query, keys are query IDs (see below);
## 2. Tabular; keys are in column 1.
## All column-1 and query IDs must be the same (or from a common set).
## The output will be keyed on this common set (col 1), and all other columns will be the associated IDs found in the input files.
## Header and first record from an elinks.fcgi query. XML files must have this format:
#<?xml version="1.0"?>
#<!DOCTYPE eLinkResult PUBLIC "-//NLM//DTD eLinkResult, 23 November 2010//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/eLink_101123.dtd">
#<eLinkResult>
# <LinkSet>
# <DbFrom>nuccore</DbFrom>
# <IdList>
# <Id>2443265</Id>
# </IdList>
# <LinkSetDb>
# <DbTo>gene</DbTo>
# <LinkName>nuccore_gene</LinkName>
# <Link>
# <Id>668880</Id>
# </Link>
# </LinkSetDb>
# </LinkSet>
my (%linksets, %alldbs, $fromname);
foreach my $file (@ARGV) {
my ($xml, $start, $capture, $idcapture, $source, $dbname, $fromid);
open TEST, $file;
while (<TEST>) {
$xml = 1 if $_ =~ /^<\?xml/;
last if $. == 1;
}
close TEST;
open IN, $file or warn "Cannot read input file '$file': $!\n";
if ($xml) {
while (<IN>) {
$start = 1 if $_ =~ /^<eLinkResult>/;
next unless $start;
$_ =~ s/[\n\r]+$//;
if ($_ =~ /^\s*<LinkSet>/) {
$capture = 1;
} elsif (/^\s*<\/LinkSet>/) {
$capture = 0;
} elsif (/^\s*<DbFrom>(.*)<\/DbFrom>/) {
$fromname = $1;
$source = 'FROM';
} elsif (/^\s*<IdList>/) {
$idcapture = 1;
} elsif (/^\s*<\/IdList>/) {
$idcapture = 0;
} elsif (/^\s*<LinkSetDb>/) {
# ignore for now
} elsif (/^\s*<\/LinkSetDb>/) {
# ignore for now
} elsif (/^\s*<DbTo>(.*)<\/DbTo>/) {
$dbname = $1;
$source = 'TO';
$alldbs{$dbname} = 1;
} elsif (/^\s*<LinkName>.*<\/LinkName>/) {
# ignore for now
} elsif (/^\s*<Link>/) {
$idcapture = 1;
} elsif (/^\s*<\/Link>/) {
$idcapture = 0;
} elsif (/^\s*<Id>(.*)<\/Id>/) {
if ($source eq 'FROM') {
$fromid = $1;
} else {
$linksets{$fromid}{$dbname}{$1} = 1;
}
}
}
} else {
my @dbnames;
while (<IN>) {
$_ =~ s/[\n\r]+$//;
my @data = split /\t/, $_;
if ($. == 1) {
@dbnames = @data;
$alldbs{$_} = 1 foreach @dbnames[1..$#data];
next;
}
$linksets{$data[0]}{$dbnames[$_]}{$data[$_]} = 1 foreach (1..$#data);
}
}
close IN;
}
print join "\t", ($fromname, sort keys %alldbs), "\n";
foreach my $id (sort {$a <=> $b} keys %linksets) {
print $id;
foreach my $linkdb (sort keys %alldbs) {
print "\t$_" foreach sort keys %{ $linksets{$id}{$linkdb} };
}
print "\n";
}
exit;