diff --git a/scripts/run_definite_error_checking_sql.pl b/scripts/run_definite_error_checking_sql.pl
new file mode 100755
index 0000000..86ac53c
--- /dev/null
+++ b/scripts/run_definite_error_checking_sql.pl
@@ -0,0 +1,147 @@
+#!/usr/bin/perl -w
+use DBI;
+use Getopt::Long;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# E = ERROR : These are cases in which data appears to break the rules outlined by the CS schema. These suggest a likely problem with the load or fix.
+# New things that show up here need to be addressed before the data can be released. Potentially a Jira ticket made. Example: Genomes without contigs.
+#
+# Each query below returns one row: query_name, date, db, count.
+# Only rows whose count is greater than zero are written to the report file
+# ./definite_errors_<db_name>_<year>_<month>_<day>.txt ; progress goes to STDOUT.
+
+my $db_name = undef;
+my $db_user = undef;
+my $db_pwd = undef;
+
+my $usage = "This command requires the db_name, db_user, db_pwd\n".
+    "The DB parameters need to be in single quotes. \n".
+    "Example Call : \n".
+    "perl run_definite_error_checking_sql.pl -db_name='kbase_sapling_v4:db4.chicago.kbase.us' -db_user='YOUR_DB_USER' -db_pwd='YOUR_DB_PWD'\n";
+(GetOptions('db_name=s' => \$db_name,
+            'db_user=s' => \$db_user,
+            'db_pwd=s' => \$db_pwd
+    )
+    && @ARGV == 0) || die $usage;
+die $usage if ((!defined $db_user) || (!defined $db_pwd) || (!defined $db_name));
+
+# RaiseError => 1 makes DBI failures fatal; the explicit "or die" checks further down are a second line of defence.
+my $full_db_name = 'DBI:mysql:'.$db_name;
+my $dbh = DBI->connect($full_db_name,$db_user, $db_pwd, { RaiseError => 1, ShowErrorStatement => 1 } );
+
+# The report name embeds the db_name argument verbatim plus the local date (month and day are not zero-padded).
+my ($mday, $mon, $year) = (localtime(time))[3, 4, 5];
+$year += 1900;
+$mon = $mon + 1;
+
+my $out = "./definite_errors_".$db_name."_".$year."_".$mon."_".$mday.".txt";
+open(my $out_fh, '>', $out) || die "Did not create $out : $!";
+
+
+my @sql_queries = (
+"select 'Genomes_without_contigs_count' query_name, current_date(), database() as db, count(*)
+from (select g.id, count(c.id) as cnt
+from Genome g left outer join IsComposedOf i on i.from_link = g.id
+left outer join Contig c on c.id = i.to_link
+group by g.id
+having cnt = 0) no_contigs"
+,
+"select 'Genomes_with_inconsistent_contigs_count' query_name, current_date(), database() as db, count(*)
+from (select g.id, g.contigs, count(c.id) as cnt
+from Genome g left outer join IsComposedOf i on i.from_link = g.id
+left outer join Contig c on c.id = i.to_link
+group by g.id, g.contigs) as subq
+where subq.contigs != subq.cnt"
+,
+"select 'Genomes_without_CDS_count' query_name, current_date(), database() as db, count(*) from
+(select g.id, su.from_link
+from Genome g inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
+where g.id not in (select distinct g1.id
+from Genome g1 inner join IsOwnerOf i1 on g1.id = i1.from_link
+inner join Feature f1 on f1.id = i1.to_link
+where f1.feature_type = 'CDS')) subq"
+,
+"select 'Genomes_with_inconsistent_peg_and_cds_count' query_name, current_date(), database() as db, count(*)
+from (
+select g.id, g.pegs, count(*) cnt
+from Genome g inner join IsOwnerOf io on io.from_link = g.id
+inner join Feature f on f.id = io.to_link
+where f.feature_type = 'CDS'
+group by g.id, g.pegs
+having g.pegs != cnt) subq"
+,
+"select 'CDS_Features_length_inconsistent_with_IsLocatedIn_length_sum_count' query_name, current_date(), database() as db, count(*)
+from
+(select f.id, f.sequence_length, sum.sum_length
+from
+(select sum(len) as sum_length, from_link from IsLocatedIn group by from_link) sum
+inner join Feature f on f.id = sum.from_link
+where f.sequence_length != sum.sum_length
+and f.feature_type = 'CDS') subq"
+,
+"select 'Pairings_without_component_features_count' query_name, current_date(), database() as db, count(*)
+from (select p.* from Pairing p where id not in (select to_link from IsInPair)) subq"
+,
+# NOTE(review): HasReactionAliasFrom is tested on to_link while the other four relationships
+# are tested on from_link -- confirm which side of that relationship holds the Source id.
+"select 'Missing_source_count' query_name, current_date(), database() as db, count(*) from (
+select distinct r.from_link as ID
+ from HasCompoundAliasFrom r where r.from_link not in
+ (select id from Source)
+union select distinct r.to_link
+ from HasReactionAliasFrom r where r.to_link not in
+ (select id from Source)
+union select distinct r.from_link
+ from AssertsFunctionFor r where r.from_link not in
+ (select id from Source)
+union select distinct r.from_link
+ from Aligned r where r.from_link not in
+ (select id from Source)
+union select distinct r.from_link
+ from Treed r where r.from_link not in
+ (select id from Source)) m"
+,
+"select 'Select_plasma_genomes_with_wrong_genetic_code_count' query_name, current_date(), database() as db, count(*) from (
+select scientific_name, domain, genetic_code from Genome
+where (scientific_name like 'Acholeplasma%' or scientific_name like 'Mesoplasma%' or scientific_name like 'Mycoplasma%'
+or scientific_name like 'Spiroplasma%' or scientific_name like 'Ureaplasma%')
+and domain = 'Bacteria' and genetic_code != 4 order by scientific_name) m"
+,
+"select 'Eukaryotic_genomes_with_wrong_genetic_code_count' query_name, current_date(), database() as db, count(*) from (
+select scientific_name, genetic_code, domain from Genome where domain like 'Eukaryota%' and genetic_code != 1) m"
+,
+"select 'Genomes_without_domains_count' query_name, current_date(), database() as db, count(*) from Genome where domain = ''"
+,
+"select 'Feature_with_duplicated_ordinals_count' query_name, current_date(), database() as db, count(*)
+from (
+select count(*) as cnt, from_link, ordinal from IsLocatedIn group by from_link, ordinal having cnt > 1) subq"
+,
+"select 'Features_without_location_information_count' query_name, current_date(), database() as db, count(*)
+from (select id from Feature where id not in (select from_link from IsLocatedIn where ordinal = 0)) subq"
+,
+"select 'Features_where_ordinal_count_does_not_correspond_to_number_of_location_count' query_name, current_date(), database() as db, count(*)
+from (select count(*) as cnt, max(ordinal) as maxOrd, from_link from IsLocatedIn group by from_link having cnt != (maxOrd + 1)) subq"
+);
+
+# Print headers.
+print {$out_fh} "Query_Name\tDate\tDB\tCount\n";
+foreach my $sql_query (@sql_queries)
+{
+    my $sth = $dbh->prepare($sql_query) or die "Unable to prepare query : $sql_query :".$dbh->errstr();
+    $sth->execute or die "SQL Error: for query $sql_query : $DBI::errstr\n";
+    my (@results) = $sth->fetchrow_array();
+    $sth->finish;
+    # The count is the last column; an aggregate that comes back NULL (undef) is reported as 0.
+    $results[-1] = 0 if @results && !defined $results[-1];
+    print "Completed $results[0] ". localtime(time)."\n";
+    # Definite errors are only reported when at least one offending row exists.
+    if ($results[3] > 0)
+    {
+        print {$out_fh} join("\t",@results). "\n";
+    }
+}
+
+close($out_fh) || die "Could not finish writing $out : $!";
+$dbh->disconnect;
diff --git a/scripts/run_monitor_situation_checking_sql.pl b/scripts/run_monitor_situation_checking_sql.pl
new file mode 100755
index 0000000..c4be969
--- /dev/null
+++ b/scripts/run_monitor_situation_checking_sql.pl
@@ -0,0 +1,141 @@
+#!/usr/bin/perl -w
+use DBI;
+use Getopt::Long;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# M = Monitor : These are situations where increases in numbers are probably problematic data, but could theoretically be explained by rare biology occurrences.
+# If the number of increases are larger than expected rarity of the biological occurrence, these should be investigated further.
+#
+# Each query below returns one row: query_name, date, db, count.
+# Every row is written to ./monitor_situations_<db_name>_<year>_<month>_<day>.txt ;
+# progress goes to STDOUT.
+
+my $db_name = undef;
+my $db_user = undef;
+my $db_pwd = undef;
+
+my $usage = "This command requires the db_name, db_user, db_pwd\n".
+    "The DB parameters need to be in single quotes. \n".
+    "Example Call : \n".
+    "perl run_monitor_situation_checking_sql.pl -db_name='kbase_sapling_v4:db4.chicago.kbase.us' -db_user='YOUR_DB_USER' -db_pwd='YOUR_DB_PWD'\n";
+(GetOptions('db_name=s' => \$db_name,
+            'db_user=s' => \$db_user,
+            'db_pwd=s' => \$db_pwd
+    )
+    && @ARGV == 0) || die $usage;
+die $usage if ((!defined $db_user) || (!defined $db_pwd) || (!defined $db_name));
+
+# RaiseError => 1 makes DBI failures fatal; the explicit "or die" checks further down are a second line of defence.
+my $full_db_name = 'DBI:mysql:'.$db_name;
+my $dbh = DBI->connect($full_db_name,$db_user, $db_pwd, { RaiseError => 1, ShowErrorStatement => 1 } );
+
+# The report name embeds the db_name argument verbatim plus the local date (month and day are not zero-padded).
+my ($mday, $mon, $year) = (localtime(time))[3, 4, 5];
+$year += 1900;
+$mon = $mon + 1;
+
+my $out = "./monitor_situations_".$db_name."_".$year."_".$mon."_".$mday.".txt";
+open(my $out_fh, '>', $out) || die "Did not create $out : $!";
+
+
+my @sql_queries = (
+"select 'CDS_length_too_small_count' query_name, current_date(), database() as db, count(*)
+from (select f.id, f.sequence_length from Feature f where sequence_length < 10 and feature_type = 'CDS') subq"
+,
+"select 'CDS_length_too_large_count' query_name, current_date(), database() as db, count(*)
+from (select f.id, f.sequence_length from Feature f where feature_type = 'CDS' and sequence_length > 100000) subq"
+,
+"select 'Genomes_with_a_CDS_length_too_large_count' query_name, current_date(), database() as db, count(*)
+from (
+select g.id, g.scientific_name, g.domain, g.prokaryotic, s.id as source, count(*) as cnt
+from Feature f inner join IsOwnerOf i on f.id = i.to_link inner join Genome g on g.id = i.from_link
+inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
+where feature_type = 'CDS' and sequence_length > 100000 group by g.id, g.scientific_name, g.domain, g.prokaryotic, s.id
+having cnt > 0
+order by count(*) desc) subq"
+,
+"select 'Genomes_with_non_methionine_starting_methione_ending_proteins_count' query_name, current_date(), database() as db, count(*)
+from (
+select g.id, g.scientific_name, count(*) as cnt
+from Genome g inner join IsOwnerOf io on g.id = io.from_link
+inner join IsProteinFor ip on ip.to_link = io.to_link
+inner join ProteinSequence ps on ip.from_link = ps.id
+where ps.sequence not like 'M%'
+and ps.sequence like '%M'
+group by g.id, g.scientific_name
+having cnt > 0
+order by cnt ) subq"
+,
+"select 'CDS_with_non_methionine_starting_methione_ending_proteins_count' query_name, current_date(), database() as db, sum(cnt)
+from (
+select g.id, g.scientific_name, count(*) as cnt
+from Genome g inner join IsOwnerOf io on g.id = io.from_link
+inner join IsProteinFor ip on ip.to_link = io.to_link
+inner join ProteinSequence ps on ip.from_link = ps.id
+where ps.sequence not like 'M%'
+and ps.sequence like '%M'
+group by g.id, g.scientific_name
+having cnt > 0
+order by cnt ) subq"
+,
+"select 'CDS_length_inconsistent_with_protein_length_count' query_name, current_date(), database() as db, sum(cnt)
+from (
+select substring_index(f.id,'.',2), count(*) cnt
+from Feature f
+inner join IsProteinFor i on f.id = i.to_link
+inner join ProteinSequence p on p.id = i.from_link
+where f.feature_type = 'CDS'
+and f.sequence_length/3 != (length(p.sequence) + 1)
+and f.sequence_length/3 != length(p.sequence)
+group by substring_index(f.id,'.',2)) subq"
+,
+"select 'CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
+from (select f.id, f.sequence_length from Feature f inner join IsOwnerOf i on f.id = i.to_link
+where feature_type = 'CDS' and f.sequence_length%3 != 0 order by f.id) subq"
+,
+"select 'Genomes_with_CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
+from (
+select g.id genome_id, g.scientific_name, g.domain, g.prokaryotic, s.id, count(*) as cnt
+from Feature f inner join IsOwnerOf i on f.id = i.to_link inner join Genome g on g.id = i.from_link
+inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
+where feature_type = 'CDS' and f.sequence_length%3 != 0 group by g.id, g.scientific_name, g.domain, g.prokaryotic, s.id
+having cnt > 0
+order by count(*) desc) subq"
+,
+"select 'Genomes_with_at_least_1%_of_CDS_Features_not_modulo3_count' query_name, current_date(), database() as db, count(*)
+from (
+select r1.genome_id, ((r1.not_modulo3_count/r2.total_cds_count) * 100) percent_not_modulo3, r1.not_modulo3_count, r2.total_cds_count
+from (select g1.id genome_id, count(f1.id) as not_modulo3_count
+from Feature f1 inner join IsOwnerOf i1 on f1.id = i1.to_link inner join Genome g1 on g1.id = i1.from_link
+where f1.feature_type = 'CDS' and f1.sequence_length%3 != 0
+group by g1.id) r1 inner join
+(select g2.id genome_id, count(f2.id) as total_cds_count
+from Feature f2 inner join IsOwnerOf i2 on f2.id = i2.to_link inner join Genome g2 on g2.id = i2.from_link
+where f2.feature_type = 'CDS'
+group by g2.id) r2 on r1.genome_id = r2.genome_id
+where ((r1.not_modulo3_count/r2.total_cds_count) * 100) > 1
+order by ((r1.not_modulo3_count/r2.total_cds_count) * 100) desc) subq"
+,
+"select 'CDS_Features_without_functions_count' query_name, current_date(), database() as db, count(f.id)
+from Feature f
+where f.feature_type = 'CDS' and f.function = ''"
+);
+
+# Print headers.
+print {$out_fh} "Query_Name\tDate\tDB\tCount\n";
+foreach my $sql_query (@sql_queries)
+{
+    my $sth = $dbh->prepare($sql_query) or die "Unable to prepare query : $sql_query :".$dbh->errstr();
+    $sth->execute or die "SQL Error: for query $sql_query : $DBI::errstr\n";
+    my (@results) = $sth->fetchrow_array();
+    $sth->finish;
+    # The count is the last column; sum(cnt) over an empty subquery comes back NULL (undef), reported as 0.
+    $results[-1] = 0 if @results && !defined $results[-1];
+    print "Completed $results[0] ". localtime(time)."\n";
+    print {$out_fh} join("\t",@results). "\n";
+}
+
+close($out_fh) || die "Could not finish writing $out : $!";
+$dbh->disconnect;
diff --git a/scripts/run_probable_error_checking_sql.pl b/scripts/run_probable_error_checking_sql.pl
new file mode 100755
index 0000000..fdc9a70
--- /dev/null
+++ b/scripts/run_probable_error_checking_sql.pl
@@ -0,0 +1,169 @@
+#!/usr/bin/perl -w
+use DBI;
+use Getopt::Long;
+use Data::Dumper;
+use strict;
+use warnings;
+
+# P = Probable Error : These are situations where the data is almost definitely wrong, but we already have data of this type.
+# These are likely to be as a result of bad quality data or errors in the load scripts. These need to be investigated further.
+#
+# Each query below returns one row: query_name, date, db, comments, count.
+# Every row is written to ./probable_errors_<db_name>_<year>_<month>_<day>.txt ;
+# progress goes to STDOUT.
+
+my $db_name = undef;
+my $db_user = undef;
+my $db_pwd = undef;
+
+my $usage = "This command requires the db_name, db_user, db_pwd\n".
+    "The DB parameters need to be in single quotes. \n".
+    "Example Call : \n".
+    "perl run_probable_error_checking_sql.pl -db_name='kbase_sapling_v4:db4.chicago.kbase.us' -db_user='YOUR_DB_USER' -db_pwd='YOUR_DB_PWD'\n";
+(GetOptions('db_name=s' => \$db_name,
+            'db_user=s' => \$db_user,
+            'db_pwd=s' => \$db_pwd
+    )
+    && @ARGV == 0) || die $usage;
+die $usage if ((!defined $db_user) || (!defined $db_pwd) || (!defined $db_name));
+
+# RaiseError => 1 makes DBI failures fatal; the explicit "or die" checks further down are a second line of defence.
+my $full_db_name = 'DBI:mysql:'.$db_name;
+my $dbh = DBI->connect($full_db_name,$db_user, $db_pwd, { RaiseError => 1, ShowErrorStatement => 1 } );
+
+# The report name embeds the db_name argument verbatim plus the local date (month and day are not zero-padded).
+my ($mday, $mon, $year) = (localtime(time))[3, 4, 5];
+$year += 1900;
+$mon = $mon + 1;
+
+my $out = "./probable_errors_".$db_name."_".$year."_".$mon."_".$mday.".txt";
+open(my $out_fh, '>', $out) || die "Did not create $out : $!";
+
+
+my @sql_queries = (
+"select 'Genomes_with_a_CDS_length_too_large_and_big_for_protein_count' query_name, current_date(), database() as db, 'Way too large CDS', count(*)
+from (
+select g.id, g.scientific_name, g.domain, g.prokaryotic, s.id as source, count(*) as cnt
+from Feature f inner join IsOwnerOf i on f.id = i.to_link inner join Genome g on g.id = i.from_link
+inner join Submitted su on g.id = su.to_link inner join Source s on s.id = su.from_link
+inner join IsProteinFor ip on f.id = ip.to_link
+inner join ProteinSequence p on p.id = ip.from_link
+where feature_type = 'CDS' and sequence_length > 100000
+and sequence_length/3 > ((length(p.sequence) + 1)* 1.10)
+group by g.id, g.scientific_name, g.domain, g.prokaryotic, s.id
+having cnt > 0
+order by count(*) desc) subq"
+,
+"select 'Protein_sequences_that_look_like_DNA_count' query_name, current_date(), database() as db, 'Protein sequences that are DNA only',count(*)
+from (select p.id, p.sequence from ProteinSequence p where p.sequence RLIKE '^[AGTC]+\$') subq"
+,
+"select 'CDS_length_definitely_wrong_with_protein_length_count' query_name, current_date(), database() as db,
+'Query is geared to allow for stop codon being included or not. This would allow for up to 3 frameshifts (in same direction - deletion or insertion), which would be extremely unlikely ', sum(cnt)
+from (
+select substring_index(f.id,'.',2), count(*) cnt
+from Feature f
+inner join IsProteinFor i on f.id = i.to_link
+inner join ProteinSequence p on p.id = i.from_link
+where f.feature_type = 'CDS'
+and FLOOR(f.sequence_length/3) != (length(p.sequence) + 1)
+and CEIL(f.sequence_length/3) != (length(p.sequence) + 1)
+and FLOOR(f.sequence_length/3) != length(p.sequence)
+group by substring_index(f.id,'.',2)) subq"
+,
+"select 'Genomes_with_5percent_or_more_obviously_inconistent_proteins' query_name, current_date(), database() as db,
+'Genomes with 5% of CDS features having this case. Red flag. Query is geared to allow for stop codon being included or not. This would allow for up to 3 frameshifts (in same direction - deletion or insertion), which would be extremely unlikely',
+count(*)
+from (
+select g.id, g.scientific_name, subq1.cnt, subq2.cds_cnt, (subq1.cnt/subq2.cds_cnt)
+from Genome g inner join
+(select substring_index(f.id,'.',2) as genome_id, count(*) cnt
+from Feature f
+inner join IsProteinFor i on f.id = i.to_link
+inner join ProteinSequence p on p.id = i.from_link
+where f.feature_type = 'CDS'
+and FLOOR(f.sequence_length/3) != (length(p.sequence) + 1)
+and CEIL(f.sequence_length/3) != (length(p.sequence) + 1)
+and FLOOR(f.sequence_length/3) != length(p.sequence)
+group by substring_index(f.id,'.',2)) subq1
+on subq1.genome_id =g.id
+inner join (select substring_index(f1.id,'.',2) as genome_id, count(*) as cds_cnt from Feature f1
+where f1.feature_type = 'CDS' group by substring_index(f1.id,'.',2)) subq2
+on g.id = subq2.genome_id
+where (subq1.cnt/subq2.cds_cnt) > .05) fullsub"
+,
+"select 'CDS_Features_without_protein_sequences_count' query_name, current_date(), database() as db, 'CDS FEATURES SHOULD HAVE AN ASSOCIATED PROTEIN SEQUENCE', count(*)
+from (select f.id from
+Feature f where feature_type = 'CDS' and id not in (select to_link from IsProteinFor)) subq"
+,
+"select 'CDS_length_equal_to_protein_length_count' query_name, current_date(), database() as db, 'CDS length equal to protein length', count(*)
+from (select f.id, f.sequence_length
+from Feature f
+inner join IsProteinFor i on f.id = i.to_link
+inner join ProteinSequence p on p.id = i.from_link
+where f.feature_type = 'CDS'
+and f.sequence_length = (length(p.sequence))) subq"
+,
+"select 'Genomes_without_IsTaxonomyOf_relationship_count' query_name, current_date(), database() as db,
+'The following are false positives (g.23746, g.26015, g.26860, g.2876, g.484, g.626, g.96) - This count should not be more than 7',
+count(*)
+from Genome g
+where id not in (select to_link from IsTaxonomyOf)"
+,
+"select 'Genomes_with_suspicious_reversed_ordinals_count' query_name, current_date(), database() as db, 'Ordinals reversed', count(*)
+from (
+select substring_index(s.feature,'.',2), g.scientific_name, g.domain, u.from_link, g.pegs, count(*) as cnt
+from
+(select distinct i1.from_link as feature
+from IsLocatedIn i1 inner join IsLocatedIn i2 on i1.from_link = i2.from_link and i1.to_link = i2.to_link
+where i1.dir = '-'
+and i1.begin > i2.begin
+and i1.ordinal > i2.ordinal) s
+inner join Genome g on g.id = substring_index(s.feature,'.',2)
+inner join Submitted u on u.to_link = g.id
+group by substring_index(s.feature,'.',2), g.scientific_name, g.domain, u.from_link, g.pegs) subq"
+,
+"select 'Protein_sequences_that_may_be_reversed_that_have_corresponding_features_count' query_name, current_date(), database() as db,
+'Ends in Methinonine and does not start with Methionine',count(*)
+from (
+select distinct ps.id
+from ProteinSequence ps
+inner join IsProteinFor ip on ip.from_link = ps.id
+inner join Feature f on ip.to_link = f.id
+where sequence not like 'M%'
+and sequence like '%M') subq"
+,
+"select 'Genomes_that_may_have_reversed_Protein_sequences_count' query_name, current_date(), database() as db,
+'Genomes with reversed proteins - Ends in Methinonine and does not start with Methionine',count(*)
+from (
+select g.id, g.scientific_name, count(*) as cnt
+from Genome g inner join IsOwnerOf io on g.id = io.from_link
+inner join IsProteinFor ip on ip.to_link = io.to_link
+inner join ProteinSequence ps on ip.from_link = ps.id
+where ps.sequence not like 'M%'
+and ps.sequence like '%M'
+group by g.id, g.scientific_name
+having cnt > 0
+order by cnt) subq "
+,
+"select 'Multiple_location_features_on_both_strands_count' query_name, current_date(), database() as db, 'Genes located on both strands', count(*)
+from (select distinct i1.from_link
+from IsLocatedIn i1 inner join IsLocatedIn i2 on i1.from_link = i2.from_link and i1.to_link = i2.to_link
+where i1.dir = '-' and i2.dir = '+') subq"
+);
+
+# Print headers.
+print {$out_fh} "Query_Name\tDate\tDB\tComments\tCount\n";
+foreach my $sql_query (@sql_queries)
+{
+    my $sth = $dbh->prepare($sql_query) or die "Unable to prepare query : $sql_query :".$dbh->errstr();
+    $sth->execute or die "SQL Error: for query $sql_query : $DBI::errstr\n";
+    my (@results) = $sth->fetchrow_array();
+    $sth->finish;
+    # The count is the last column; sum(cnt) over an empty subquery comes back NULL (undef), reported as 0.
+    $results[-1] = 0 if @results && !defined $results[-1];
+    print "Completed $results[0] ". localtime(time)."\n";
+    print {$out_fh} join("\t",@results). "\n";
+}
+
+close($out_fh) || die "Could not finish writing $out : $!";
+$dbh->disconnect;