Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions scripts/run_definite_error_checking_sql.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
#!/usr/bin/perl -w
use DBI;
use Getopt::Long;
use Data::Dumper;
use strict;
use warnings;

# E = ERROR : These are cases in which the data appear to break the rules outlined by the CS schema. They suggest a likely problem with the load or a fix.
# New items that show up here need to be addressed before the data can be released, potentially by filing a Jira ticket. Example: Genomes without contigs.

# Command-line options. All three are required; the script dies with the
# usage text if any is missing, if an unknown option is given, or if any
# positional argument is left over after option parsing.
my $db_name = undef;    # appended to 'DBI:mysql:' to form the DSN (see example below)
my $db_user = undef;
my $db_pwd = undef;

# Usage text. The example call names this script as it is checked in
# (run_definite_error_checking_sql.pl).
my $usage = "This command requires the db_name, db_user, db_pwd\n".
"The DB parameters need to be in single quotes. \n".
"Example Call : \n".
"perl run_definite_error_checking_sql.pl -db_name='kbase_sapling_v4:db4.chicago.kbase.us' -db_user='YOUR_DB_USER' -db_pwd='YOUR_DB_PWD'\n";

# GetOptions returns false on a parse error; @ARGV must be empty afterwards.
(GetOptions('db_name=s' => \$db_name,
            'db_user=s' => \$db_user,
            'db_pwd=s' => \$db_pwd
 )
 && @ARGV == 0) || die $usage;
die $usage if ((!defined $db_user) || (!defined $db_pwd) || (!defined $db_name));

# Connect to MySQL. RaiseError makes DBI die on any database error, and
# ShowErrorStatement appends the failing SQL text to that error message.
my $full_db_name = 'DBI:mysql:'.$db_name;
my $dbh = DBI->connect($full_db_name,$db_user, $db_pwd, { RaiseError => 1, ShowErrorStatement => 1 } );

# Only day, month and year are needed for the report file name.
# localtime returns a 0-based month and a year offset from 1900.
my (undef, undef, undef, $mday, $mon, $year) = localtime(time);
$year += 1900;
$mon = $mon + 1;

# NOTE(review): the file name embeds the full -db_name value (the usage
# example includes a ':host' part). The original code also split off the
# part before ':' but never used it -- confirm which name was intended.
my $out = "./definite_errors_".$db_name."_".$year."_".$mon."_".$mday.".txt";

# Three-argument open so the path is never interpreted as a mode or command;
# $! reports why the file could not be created.
open(OUT, '>', $out) or die "Did not create $out: $!";


# List of "definite error" checks. Every query returns a single row with four
# columns in this order: query name, current date, database name, count.
# The reporting loop reads column 0 (name) and column 3 (count).
my @sql_queries = (
# Genomes with zero Contig rows reachable through IsComposedOf.
"select 'Genomes_without_contigs_count' query_name, current_date(), database() as db, count(*)
from (select g.id, count(c.id) as cnt
from Genome g left outer join IsComposedOf i on i.from_link = g.id
left outer join Contig c on c.id = i.to_link
group by g.id
having cnt = 0) no_contigs"
,
# Genomes whose stored Genome.contigs value differs from the number of
# Contig rows linked through IsComposedOf.
"select 'Genomes_with_inconsistent_contigs_count' query_name, current_date(), database() as db, count(*)
from (select g.id, g.contigs, count(c.id) as cnt
from Genome g left outer join IsComposedOf i on i.from_link = g.id
left outer join Contig c on c.id = i.to_link
group by g.id, g.contigs) as subq
where subq.contigs != subq.cnt"
,
# Genome/Submitted/Source join rows whose genome owns no Feature with
# feature_type 'CDS'. The count is over join rows, not distinct genomes.
"select 'Genomes_without_CDS_count' query_name, current_date(), database() as db, count(*) from
(select g.id, su.from_link
from Genome g inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
where g.id not in (select distinct g1.id
from Genome g1 inner join IsOwnerOf i1 on g1.id = i1.from_link
inner join Feature f1 on f1.id = i1.to_link
where f1.feature_type = 'CDS')) subq"
,
# Genomes whose stored Genome.pegs value differs from the number of owned
# CDS features. Inner joins: a genome owning no CDS feature is not counted here.
"select 'Genomes_with_inconsistent_peg_and_cds_count' query_name, current_date(), database() as db, count(*)
from (
select g.id, g.pegs, count(*) cnt
from Genome g inner join IsOwnerOf io on io.from_link = g.id
inner join Feature f on f.id = io.to_link
where f.feature_type = 'CDS'
group by g.id, g.pegs
having g.pegs != cnt) subq"
,
# CDS features whose sequence_length differs from the sum of IsLocatedIn.len
# over all of the feature's IsLocatedIn rows.
"select 'CDS_Features_length_inconsistent_with_IsLocatedIn_length_sum_count' query_name, current_date(), database() as db, count(*)
from
(select f.id, f.sequence_length, sum.sum_length
from
(select sum(len) as sum_length, from_link from IsLocatedIn group by from_link) sum
inner join Feature f on f.id = sum.from_link
where f.sequence_length != sum.sum_length
and f.feature_type = 'CDS') subq"
,
# Pairing rows whose id never appears as IsInPair.to_link.
"select 'Pairings_without_component_features_count' query_name, current_date(), database() as db, count(*)
from (select p.* from Pairing p where id not in (select to_link from IsInPair)) subq"
,
# Distinct link values from five relationship tables that do not match any
# Source.id (the union removes duplicates across tables).
# NOTE(review): HasReactionAliasFrom is checked on to_link while the other
# four tables are checked on from_link -- confirm this is intended.
"select 'Missing_source_count' query_name, current_date(), database() as db, count(*) from (
select distinct r.from_link as ID
from HasCompoundAliasFrom r where r.from_link not in
(select id from Source)
union select distinct r.to_link
from HasReactionAliasFrom r where r.to_link not in
(select id from Source)
union select distinct r.from_link
from AssertsFunctionFor r where r.from_link not in
(select id from Source)
union select distinct r.from_link
from Aligned r where r.from_link not in
(select id from Source)
union select distinct r.from_link
from Treed r where r.from_link not in
(select id from Source)) m"
,
# Bacteria genomes whose scientific_name starts with Acholeplasma, Mesoplasma,
# Mycoplasma, Spiroplasma or Ureaplasma and whose genetic_code is not 4.
"select 'Select_plasma_genomes_with_wrong_genetic_code_count' query_name, current_date(), database() as db, count(*) from (
select scientific_name, domain, genetic_code from Genome
where (scientific_name like 'Acholeplasma%' or scientific_name like 'Mesoplasma%' or scientific_name like 'Mycoplasma%'
or scientific_name like 'Spiroplasma%' or scientific_name like 'Ureaplasma%')
and domain = 'Bacteria' and genetic_code != 4 order by scientific_name) m"
,
# Genomes whose domain starts with 'Eukaryota' and whose genetic_code is not 1.
"select 'Eukaryotic_genomes_with_wrong_genetic_code_count' query_name, current_date(), database() as db, count(*) from (
select scientific_name, genetic_code, domain from Genome where domain like 'Eukaryota%' and genetic_code != 1) m"
,
# Genomes whose domain is the empty string.
"select 'Genomes_without_domains_count' query_name, current_date(), database() as db, count(*) from Genome where domain = ''"
,
# (from_link, ordinal) combinations that occur more than once in IsLocatedIn.
"select 'Feature_with_duplicated_ordinals_count' query_name, current_date(), database() as db, count(*)
from (
select count(*) as cnt, from_link, ordinal from IsLocatedIn group by from_link, ordinal having cnt > 1) subq"
,
# Features that have no IsLocatedIn row with ordinal 0.
"select 'Features_without_location_information_count' query_name, current_date(), database() as db, count(*)
from (select id from Feature where id not in (select from_link from IsLocatedIn where ordinal = 0)) subq"
,
# IsLocatedIn from_link groups whose row count is not max(ordinal) + 1.
"select 'Features_where_ordinal_count_does_not_correspond_to_number_of_location_count' query_name, current_date(), database() as db, count(*)
from (select count(*) as cnt, max(ordinal) as maxOrd, from_link from IsLocatedIn group by from_link having cnt != (maxOrd + 1)) subq"
);

#print headers
print OUT "Query_Name\tDate\tDB\tCount\n";

# Run every check in order. Each query yields one row of
# (query name, date, database, count); progress goes to STDOUT and only
# checks with a non-zero count are written to the report file.
foreach my $sql_query (@sql_queries)
{
    my $sth = $dbh->prepare($sql_query) or die "Unable to prepare query : $sql_query :".$dbh->errstr();
    $sth->execute or die "SQL Error: for query $sql_query : $DBI::errstr\n";
    my (@results) = $sth->fetchrow_array();
    # Only the first row is read, so tell DBI no more rows will be fetched.
    $sth->finish;
    # localtime is in scalar context here (string concatenation), so this
    # prints a human-readable timestamp.
    print "Completed $results[0] ". localtime(time)."\n";
    if ($results[3] > 0)
    {
        print OUT join("\t",@results). "\n";
    }
}

# Buffered write errors only surface at close, so check it.
close(OUT) or die "Error closing $out: $!";
$dbh->disconnect;
137 changes: 137 additions & 0 deletions scripts/run_monitor_situation_checking_sql.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
#!/usr/bin/perl -w
use DBI;
use Getopt::Long;
use Data::Dumper;
use strict;
use warnings;


# M = Monitor : These are situations where an increase in the numbers probably indicates problematic data, but could theoretically be explained by rare biological occurrences.
# If the increase is larger than the expected rarity of the biological occurrence would explain, it should be investigated further.


# Command-line options. All three are required; the script dies with the
# usage text if any is missing, if an unknown option is given, or if any
# positional argument is left over after option parsing.
my $db_name = undef;    # appended to 'DBI:mysql:' to form the DSN (see example below)
my $db_user = undef;
my $db_pwd = undef;

# Usage text. The example call names this script as it is checked in
# (run_monitor_situation_checking_sql.pl).
my $usage = "This command requires the db_name, db_user, db_pwd\n".
"The DB parameters need to be in single quotes. \n".
"Example Call : \n".
"perl run_monitor_situation_checking_sql.pl -db_name='kbase_sapling_v4:db4.chicago.kbase.us' -db_user='YOUR_DB_USER' -db_pwd='YOUR_DB_PWD'\n";

# GetOptions returns false on a parse error; @ARGV must be empty afterwards.
(GetOptions('db_name=s' => \$db_name,
            'db_user=s' => \$db_user,
            'db_pwd=s' => \$db_pwd
 )
 && @ARGV == 0) || die $usage;
die $usage if ((!defined $db_user) || (!defined $db_pwd) || (!defined $db_name));



# Connect to MySQL. RaiseError makes DBI die on any database error, and
# ShowErrorStatement appends the failing SQL text to that error message.
my $full_db_name = 'DBI:mysql:'.$db_name;
my $dbh = DBI->connect($full_db_name,$db_user, $db_pwd, { RaiseError => 1, ShowErrorStatement => 1 } );

# Only day, month and year are needed for the report file name.
# localtime returns a 0-based month and a year offset from 1900.
my (undef, undef, undef, $mday, $mon, $year) = localtime(time);
$year += 1900;
$mon = $mon + 1;

# NOTE(review): the file name embeds the full -db_name value (the usage
# example includes a ':host' part). The original code also split off the
# part before ':' but never used it -- confirm which name was intended.
my $out = "./monitor_situations_".$db_name."_".$year."_".$mon."_".$mday.".txt";

# Three-argument open so the path is never interpreted as a mode or command;
# $! reports why the file could not be created.
open(OUT, '>', $out) or die "Did not create $out: $!";


# List of "monitor" checks. Every query returns a single row with four
# columns in this order: query name, current date, database name, and a
# count (count(*) for most queries, sum(cnt) for two of them).
my @sql_queries = (
# CDS features with sequence_length below 10.
"select 'CDS_length_too_small_count' query_name, current_date(), database() as db, count(*)
from (select f.id, f.sequence_length from Feature f where sequence_length < 10 and feature_type = 'CDS') subq"
,
# CDS features with sequence_length above 100000.
"select 'CDS_length_too_large_count' query_name, current_date(), database() as db, count(*)
from (select f.id, f.sequence_length from Feature f where feature_type = 'CDS' and sequence_length > 100000) subq"
,
# Genome/Source groups that own at least one CDS feature with
# sequence_length above 100000 (one row per genome and source combination).
"select 'Genomes_with_a_CDS_length_too_large_count' query_name, current_date(), database() as db, count(*)
from (
select g.id, g.scientific_name, g.domain, g.prokaryotic, s.id as source, count(*) as cnt
from Feature f inner join IsOwnerOf i on f.id = i.to_link inner join Genome g on g.id = i.from_link
inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
where feature_type = 'CDS' and sequence_length > 100000 group by g.id, g.scientific_name, g.domain, g.prokaryotic, s.id
having cnt > 0
order by count(*) desc) subq"
,
# Genomes with at least one linked protein sequence that does not start
# with 'M' but does end with 'M'.
"select 'Genomes_with_non_methionine_starting_methione_ending_proteins_count' query_name, current_date(), database() as db, count(*)
from (
select g.id, g.scientific_name, count(*) as cnt
from Genome g inner join IsOwnerOf io on g.id = io.from_link
inner join IsProteinFor ip on ip.to_link = io.to_link
inner join ProteinSequence ps on ip.from_link = ps.id
where ps.sequence not like 'M%'
and ps.sequence like '%M'
group by g.id, g.scientific_name
having cnt > 0
order by cnt ) subq"
,
# Same per-genome grouping as the previous query, but the outer select sums
# the per-genome counts (sum(cnt)) instead of counting genomes.
"select 'CDS_with_non_methionine_starting_methione_ending_proteins_count' query_name, current_date(), database() as db, sum(cnt)
from (
select g.id, g.scientific_name, count(*) as cnt
from Genome g inner join IsOwnerOf io on g.id = io.from_link
inner join IsProteinFor ip on ip.to_link = io.to_link
inner join ProteinSequence ps on ip.from_link = ps.id
where ps.sequence not like 'M%'
and ps.sequence like '%M'
group by g.id, g.scientific_name
having cnt > 0
order by cnt ) subq"
,
# CDS features where sequence_length/3 equals neither the protein length nor
# the protein length + 1. Rows are grouped by the first two '.'-separated
# parts of the feature id and the group counts are then summed.
"select 'CDS_length_inconsistent_with_protein_length_count' query_name, current_date(), database() as db, sum(cnt)
from (
select substring_index(f.id,'.',2), count(*) cnt
from Feature f
inner join IsProteinFor i on f.id = i.to_link
inner join ProteinSequence p on p.id = i.from_link
where f.feature_type = 'CDS'
and f.sequence_length/3 != (length(p.sequence) + 1)
and f.sequence_length/3 != length(p.sequence)
group by substring_index(f.id,'.',2)) subq"
,
# CDS features (joined through IsOwnerOf) whose sequence_length is not a
# multiple of 3.
"select 'CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
from (select f.id, f.sequence_length from Feature f inner join IsOwnerOf i on f.id = i.to_link
where feature_type = 'CDS' and f.sequence_length%3 != 0 order by f.id) subq"
,
# Genome/Source groups owning at least one CDS feature whose sequence_length
# is not a multiple of 3.
"select 'Genomes_with_CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
from (
select g.id genome_id, g.scientific_name, g.domain, g.prokaryotic, s.id, count(*) as cnt
from Feature f inner join IsOwnerOf i on f.id = i.to_link inner join Genome g on g.id = i.from_link
inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
where feature_type = 'CDS' and f.sequence_length%3 != 0 group by g.id, g.scientific_name, g.domain, g.prokaryotic, s.id
having cnt > 0
order by count(*) desc) subq"
,
# Genomes where the percentage of CDS features with a length not divisible
# by 3 is strictly greater than 1 (r1 = offending CDS count per genome,
# r2 = total CDS count per genome).
# NOTE(review): the query name says "at least 1%" but the filter is "> 1".
"select 'Genomes_with_at_least_1%_of_CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
from (
select r1.genome_id, ((r1.not_modulo3_count/r2.total_cds_count) * 100) percent_not_modulo3, r1.not_modulo3_count, r2.total_cds_count
from (select g1.id genome_id, count(f1.id) as not_modulo3_count
from Feature f1 inner join IsOwnerOf i1 on f1.id = i1.to_link inner join Genome g1 on g1.id = i1.from_link
where f1.feature_type = 'CDS' and f1.sequence_length%3 != 0
group by g1.id) r1 inner join
(select g2.id genome_id, count(f2.id) as total_cds_count
from Feature f2 inner join IsOwnerOf i2 on f2.id = i2.to_link inner join Genome g2 on g2.id = i2.from_link
where f2.feature_type = 'CDS'
group by g2.id) r2 on r1.genome_id = r2.genome_id
where ((r1.not_modulo3_count/r2.total_cds_count) * 100) > 1
order by ((r1.not_modulo3_count/r2.total_cds_count) * 100) desc) subq"
,
# CDS features whose function is the empty string.
"select 'CDS_Features_without_functions_count' query_name, current_date(), database() as db, count(f.id)
from Feature f
where f.feature_type = 'CDS' and f.function = ''"
);

#print headers
print OUT "Query_Name\tDate\tDB\tCount\n";

# Run every check in order. Each query yields one row of
# (query name, date, database, count); progress goes to STDOUT and every
# result row is written to the report file.
foreach my $sql_query (@sql_queries)
{
    my $sth = $dbh->prepare($sql_query) or die "Unable to prepare query : $sql_query :".$dbh->errstr();
    $sth->execute or die "SQL Error: for query $sql_query : $DBI::errstr\n";
    my (@results) = $sth->fetchrow_array();
    # Only the first row is read, so tell DBI no more rows will be fetched.
    $sth->finish;
    # Some checks use sum(cnt), which is NULL (undef here) when no rows
    # match; report that as a count of 0 instead of an empty field plus an
    # "uninitialized value" warning.
    $results[3] = 0 unless defined $results[3];
    # localtime is in scalar context here (string concatenation), so this
    # prints a human-readable timestamp.
    print "Completed $results[0] ". localtime(time)."\n";
    print OUT join("\t",@results). "\n";
}

# Buffered write errors only surface at close, so check it.
close(OUT) or die "Error closing $out: $!";
$dbh->disconnect;
Loading