diff --git a/Model/lib/psql/webready/comparative/AlphaFoldGenes.psql b/Model/lib/psql/webready/comparative/AlphaFoldGenes.psql new file mode 100644 index 000000000..a0702a3df --- /dev/null +++ b/Model/lib/psql/webready/comparative/AlphaFoldGenes.psql @@ -0,0 +1,97 @@ + drop table if exists :SCHEMA.uniprotgenes; + + CREATE UNLOGGED TABLE :SCHEMA.uniprotGenes AS + SELECT DISTINCT ed.name + , d.* + , edr.version + , aa.source_id + , pa.gene_source_id + , CASE WHEN (ed.name like '%SWISSPROT%' AND edr.version = 'xrefuniparc') THEN 1 + WHEN (ed.name like '%SPTREMBL%' AND edr.version = 'xrefuniparc') THEN 2 + WHEN (ed.name like '%SWISSPROT%' AND edr.version = 'xref_sprot_blastp') THEN 4 + WHEN (ed.name like '%SPTREMBL%' and edr.version = 'xref_trembl_blastp') THEN 5 + ELSE 6 END as rank + , (af.last_residue_index - af.first_residue_index + 1) as hit_length + FROM sres.dbref d + LEFT JOIN apidb.AlphaFold af ON d.primary_identifier = af.uniprot_id + , sres.externaldatabase ed + , sres.externaldatabaserelease edr + , dots.dbrefaafeature db + , dots.aafeature aa + , :SCHEMA.ProteinAttributes pa + WHERE (ed.name = 'Uniprot/SWISSPROT' OR ed.name = 'Uniprot/SPTREMBL') + AND (edr.version = 'xrefuniparc' OR edr.version = 'xref_sprot_blastp' OR edr.version = 'xref_trembl_blastp') + AND edr.external_database_id = ed.external_database_id + AND d.external_database_release_id = edr.external_database_release_id + AND db.db_ref_id = d.db_ref_id + AND aa.aa_feature_id = db.aa_feature_id + AND pa.source_id = aa.source_id + UNION + SELECT DISTINCT ed.name + , d.* + , edr.version + , na.source_id + , ta.gene_source_id + , 3 as rank + , (af.last_residue_index - af.first_residue_index + 1) as hit_length + FROM sres.dbref d + LEFT JOIN apidb.AlphaFold af ON d.primary_identifier = af.uniprot_id + , sres.externaldatabase ed + , sres.externaldatabaserelease edr + , dots.dbrefnafeature db + , dots.nafeature na + , :SCHEMA.TranscriptAttributes ta + WHERE ed.name like '%_dbxref_%niprot_%RSRC' + AND edr.external_database_id = ed.external_database_id + AND d.external_database_release_id = edr.external_database_release_id + AND db.db_ref_id = d.db_ref_id + AND na.na_feature_id = db.na_feature_id + AND (ta.transcript_source_id = na.source_id OR ta.gene_source_id = na.source_id) + + ; + + drop table if exists :SCHEMA.minrank +; + CREATE UNLOGGED TABLE :SCHEMA.minRank AS ( + SELECT gene_source_id + , MIN(rank) as min_rank + FROM :SCHEMA.uniprotGenes upg + WHERE hit_length is not null + GROUP BY gene_source_id + ) + + ; + + drop table if exists :SCHEMA.alphafoldhits; + + CREATE UNLOGGED TABLE :SCHEMA.alphaFoldHits AS ( + SELECT DISTINCT gene_source_id + , last_value(primary_identifier) over (PARTITION BY gene_source_id ORDER BY hit_length ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS uniprot_id + FROM ( + SELECT upg.* + FROM :SCHEMA.uniprotGenes upg + , :SCHEMA.minRank + WHERE upg.gene_source_id = minRank.gene_source_id + AND upg.rank = minRank.min_rank + ) t + ) + ; + + CREATE TABLE :SCHEMA.AlphaFoldGenes AS ( + SELECT afh.gene_source_id + , af.uniprot_id + , af.source_id as alphafold_id + , af.alphafold_version + , af.first_residue_index + , af.last_residue_index + FROM apidb.alphafold af + , :SCHEMA.alphaFoldHits afh + WHERE afh.uniprot_id = af.uniprot_id + ) + + ; + + drop table if exists :SCHEMA.uniprotgenes; + drop table if exists :SCHEMA.minrank; + drop table if exists :SCHEMA.alphafoldhits; + diff --git a/Model/lib/psql/webready/comparative/AlphaFoldGenes_ix.psql b/Model/lib/psql/webready/comparative/AlphaFoldGenes_ix.psql new file mode 100644 index 000000000..5a938f861 --- /dev/null +++ b/Model/lib/psql/webready/comparative/AlphaFoldGenes_ix.psql @@ -0,0 +1,3 @@ + CREATE index AlphaFoldGenes_idx ON :SCHEMA.AlphaFoldGenes (gene_source_id, uniprot_id) + + ; diff --git a/Model/lib/psql/webready/comparative/GroupDomainDescriptions.psql b/Model/lib/psql/webready/comparative/GroupDomainDescriptions.psql new file mode 100644 index 000000000..919cb8a61 --- /dev/null +++ b/Model/lib/psql/webready/comparative/GroupDomainDescriptions.psql @@ -0,0 +1,21 @@ +create table :SCHEMA.GroupDomainDescriptions as +SELECT og.group_id AS group_name, ag.descriptions +FROM apidb.OrthologGroup og, + (SELECT group_name, + STRING_AGG(accession ||' (' || num_proteins|| ')', ', ') AS descriptions + FROM (SELECT group_name, accession, num_proteins, rnk + FROM (SELECT group_name, accession, num_proteins, + rank() OVER (PARTITION BY group_name ORDER BY num_proteins DESC) rnk + FROM (SELECT group_name, accession, count(distinct full_id) AS num_proteins + FROM :SCHEMA.ProteinDomainAssignment + GROUP BY group_name,accession + ) + ) + WHERE rnk <= 3 + ) + GROUP BY group_name + ORDER BY 1 + ) ag +WHERE og.group_id = ag.group_name + +; \ No newline at end of file diff --git a/Model/lib/psql/webready/comparative/GroupDomainDescriptions_ix.psql b/Model/lib/psql/webready/comparative/GroupDomainDescriptions_ix.psql new file mode 100644 index 000000000..27ed2e9f4 --- /dev/null +++ b/Model/lib/psql/webready/comparative/GroupDomainDescriptions_ix.psql @@ -0,0 +1,2 @@ +CREATE INDEX GroupDomainAttribute_idx ON :SCHEMA.GroupDomainDescriptions (group_name) +; \ No newline at end of file diff --git a/Model/lib/psql/webready/comparative/LoadOrthologTables.psql b/Model/lib/psql/webready/comparative/LoadOrthologTables.psql new file mode 100644 index 000000000..ece00e833 --- /dev/null +++ b/Model/lib/psql/webready/comparative/LoadOrthologTables.psql @@ -0,0 +1,29 @@ +Truncate table :SCHEMA.GeneOrthologGroup; +Truncate table :SCHEMA.TranscriptOrthologGroup; + +insert into :SCHEMA.GeneOrthologGroup (gene_id, group_id, project_id, org_abbrev, modification_date) +SELECT ga.gene_source_id AS gene_id + , ogas.group_id + , ga.project_id + , ga.org_abbrev + , timestamp as modification_date + FROM :SCHEMA.geneattributes ga, + apidb.orthologgroupaasequence ogas + WHERE ga.aa_sequence_id = ogas.aa_sequence_id; + + + +insert into :SCHEMA.TranscriptOrthologGroup (source_id, gene_id, group_id, project_id, org_abbrev, modification_date) +SELECT ta.source_id AS source_id + , ta.gene_source_id as gene_id + , ogas.group_id + , ta.project_id + , ta.org_abbrev + , timestamp as modification_date + FROM :SCHEMA.transcriptattributes ta + , apidb.orthologgroupaasequence ogas + WHERE ta.aa_sequence_id = ogas.aa_sequence_id; + + + + diff --git a/Model/lib/psql/webready/comparative/LoadPathwaysGeneTable.psql b/Model/lib/psql/webready/comparative/LoadPathwaysGeneTable.psql new file mode 100644 index 000000000..28ffcf3b4 --- /dev/null +++ b/Model/lib/psql/webready/comparative/LoadPathwaysGeneTable.psql @@ -0,0 +1,58 @@ +/* ATTENTION: This script is run using a custom workflow step class */ +/* This accommodates the required to retain an empty table on undo */ + +TRUNCATE TABLE :SCHEMA.PathwaysGeneTable; + +DO $$ + DECLARE org record:PLPGSQL_DELIM + BEGIN + FOR org IN (SELECT DISTINCT abbrev FROM apidb.organism) + LOOP + INSERT INTO :SCHEMA.PathwaysGeneTable ( + SELECT t2.*, current_timestamp AS modification_date FROM ( + SELECT DISTINCT + gene_source_id + , pathway_source_id + , pathway_name + , count(reaction_source_id) AS reactions + , enzyme + , expasy_url + , pathway_source + , exact_match + , project_id + , org_abbrev + FROM ( + SELECT DISTINCT + tp.gene_source_id + , tp.project_id + , tp.pathway_source_id + , tp.pathway_name + , tp.org_abbrev + , pr.reaction_source_id + , pr.enzyme + , pr.expasy_url + , tp.pathway_source + , CASE MAX(tp.exact_match) WHEN 1 THEN 'Yes' WHEN 0 THEN 'No' END AS exact_match + FROM :SCHEMA.TranscriptPathway tp + , apidbtuning.PathwayAttributes pa + , apidbtuning.PathwayCompounds pc + , apidbtuning.PathwayReactions pr + WHERE tp.pathway_id = pa.pathway_id + AND pc.pathway_id = pa.pathway_id + AND pr.reaction_id = pc.reaction_id + AND pr.ext_db_name = pc.ext_db_name + AND tp.ec_number_pathway = pr.enzyme + AND tp.wildcard_count_gene <= tp.wildcard_count_pathway + AND pr.enzyme != '-.-.-.-' + AND tp.org_abbrev = org.abbrev + GROUP BY tp.gene_source_id, tp.project_id, tp.org_abbrev, tp.pathway_name, tp.pathway_source_id, pr.reaction_source_id, pr.enzyme, pr.expasy_url, tp.pathway_source + ) t + GROUP BY gene_source_id, project_id, org_abbrev, pathway_source_id, pathway_name, enzyme, expasy_url, pathway_source, exact_match + ) t2 + ORDER BY pathway_source, lower(pathway_name) + ):PLPGSQL_DELIM + COMMIT:PLPGSQL_DELIM + END LOOP:PLPGSQL_DELIM + END:PLPGSQL_DELIM +$$ LANGUAGE PLPGSQL; + diff --git a/Model/lib/psql/webready/comparative/LoadTranscriptPathway.psql b/Model/lib/psql/webready/comparative/LoadTranscriptPathway.psql new file mode 100644 index 000000000..afd718fb4 --- /dev/null +++ b/Model/lib/psql/webready/comparative/LoadTranscriptPathway.psql @@ -0,0 +1,174 @@ +/* ATTENTION: This script is run using a custom workflow step class */ +/* This accommodates the requirement to retain empty TranscriptEC and TranscriptPathway tables on undo */ + +/* STEP 1: Make sure temp tables have been dropped before starting */ + +DROP TABLE IF EXISTS :SCHEMA.TranscriptEcUniverse; +DROP TABLE IF EXISTS :SCHEMA.PathwayEcUniverse; +DROP TABLE IF EXISTS :SCHEMA.TranscriptPathwayEcMatch; + +-- Just to be safe add these here +TRUNCATE TABLE :SCHEMA.TranscriptPathway; +TRUNCATE TABLE :SCHEMA.TranscriptEC; + + +/* STEP 2: Load into the TranscriptEC table. This must be done here so that we capture ortho-derived EC numbers*/ + +/* ATTENTION: this step loads into an empty partitioned table created in the orgSpecific graph */ + +DO $$ + DECLARE org RECORD:PLPGSQL_DELIM + BEGIN + FOR org IN (SELECT DISTINCT taxon_id, abbrev from apidb.organism) + LOOP + INSERT INTO :SCHEMA.TranscriptEc ( + SELECT DISTINCT + ta.source_id + , ta.gene_source_id + , ec.enzyme_class_id + , ec.ec_number + , ec.ec_number_1 + , ec.ec_number_2 + , ec.ec_number_3 + , ec.ec_number_4 + , regexp_count(ec.ec_number, '-') as wildcard_count + , asec.evidence_code + , ta.project_id + , org.abbrev as org_abbrev + , current_timestamp as modification_date + FROM sres.EnzymeClass ec + , dots.AaSequenceEnzymeClass asec + , :SCHEMA.transcriptattributes ta + WHERE asec.aa_sequence_id = ta.aa_sequence_id + AND asec.enzyme_class_id = ec.enzyme_class_id + AND ta.org_abbrev = org.abbrev + ):PLPGSQL_DELIM + COMMIT:PLPGSQL_DELIM + END LOOP:PLPGSQL_DELIM + END:PLPGSQL_DELIM +$$ LANGUAGE PLPGSQL; + + +/* STEP 3: Extracts the distinct EC numbers from TranscriptEC */ +/* This represents the "universe" of EC numbers associated to transcripts */ +/* Temp table, will be dropped */ + +CREATE TABLE :SCHEMA.TranscriptEcUniverse as ( + SELECT DISTINCT + enzyme_class_id + , ec_number + , ec_number_1 + , ec_number_2 + , ec_number_3 + , ec_number_4 + , wildcard_count + FROM :SCHEMA.TranscriptEc +); + +/* STEP 4: Extract the distinct EC number from PathwayEC */ +/* This represents the "universe" of EC numbers associated to pathways */ +/* Temp table, will be dropped */ + +CREATE TABLE :SCHEMA.PathwayEcUniverse as ( + SELECT DISTINCT + enzyme_class_id + , ec_number + , ec_number_1 + , ec_number_2 + , ec_number_3 + , ec_number_4 + , wildcard_count + FROM :SCHEMA.PathwayEc +); + + +/* STEP 5: Match EC numbers from the transcript universe and EC numbers from the pathway universe */ +/* Use the universe tables to avoid redundancy */ +/* Temp table, will be dropped */ + +CREATE TABLE :SCHEMA.TranscriptPathwayEcMatch as ( + SELECT DISTINCT + teu.enzyme_class_id AS transcript_enzyme_class_id + , peu.enzyme_class_id AS pathway_enzyme_class_id + , teu.wildcard_count AS wildcard_count_transcript + , peu.wildcard_count AS wildcard_count_pathway + , teu.ec_number AS ec_number_transcript + , peu.ec_number AS ec_number_pathway + FROM :SCHEMA.TranscriptEcUniverse teu + , :SCHEMA.PathwayEc peu + + -- this part does ec number expansion using the individual digits to avoid slow like syntax + WHERE (teu.ec_number_1 = peu.ec_number_1 or teu.ec_number_1 is null or peu.ec_number_1 is null) + AND (teu.ec_number_2 = peu.ec_number_2 or teu.ec_number_2 is null or peu.ec_number_2 is null) + AND (teu.ec_number_3 = peu.ec_number_3 or teu.ec_number_3 is null or peu.ec_number_3 is null) + AND (teu.ec_number_4 = peu.ec_number_4 or teu.ec_number_4 is null or peu.ec_number_4 is null) +); + + +/* STEP 6: Map the matched EC numbers to map back to both pathways and transcripts */ +/* ATTENTION: this step loads into an empty partitioned table created in the orgSpecific graph */ + +/* This is the equivalent of the old TranscriptPathway tuning table */ + +DO $$ + DECLARE org RECORD:PLPGSQL_DELIM + BEGIN + FOR org IN (SELECT DISTINCT taxon_id, abbrev FROM apidb.organism) + LOOP + INSERT INTO :SCHEMA.transcriptpathway ( + SELECT DISTINCT + + -- gene info + ta.source_id + , ta.gene_source_id + + -- pathway info + , pa.source_id AS pathway_source_id + , pa.name AS pathway_name + , pa.pathway_id + , pa.pathway_source + , pec.external_database_release_id + + -- info about match + , tpem.ec_number_transcript AS ec_number_gene + , tpem.wildcard_count_transcript AS wildcard_count_gene + , tpem.ec_number_pathway + , tpem.wildcard_count_pathway + , CASE WHEN tpem.ec_number_pathway = tpem.ec_number_transcript + THEN 1 + ELSE 0 END AS exact_match + , CASE WHEN tpem.wildcard_count_pathway + tpem.wildcard_count_transcript = 0 + THEN 1 + ELSE 0 END AS complete_ec + + -- for partitioning + , ta.project_id + , org.abbrev AS org_abbrev + , current_timestamp AS modification_date + FROM :SCHEMA.TranscriptPathwayEcMatch tpem + , :SCHEMA.PathwayEc pec + , :SCHEMA.PathwayAttributes pa + , :SCHEMA.TranscriptAttributes ta + , :SCHEMA.TranscriptEc tec + WHERE tpem.ec_number_transcript = tec.ec_number + AND tpem.ec_number_pathway = pec.ec_number + AND pa.pathway_id = pec.pathway_id + AND ta.source_id = tec.source_id + AND tec.org_abbrev = org.abbrev + AND ta.org_abbrev = org.abbrev + -- JB: It is unclear what the intention here was. This will not remove any rows and shouldn't + -- AND ( + -- (ta.orthomcl_name IS NULL AND tec.evidence_code != 'OrthoMCLDerived') + -- OR ta.orthomcl_name IS NOT NULL + -- ) + ):PLPGSQL_DELIM + COMMIT:PLPGSQL_DELIM + END LOOP:PLPGSQL_DELIM + END:PLPGSQL_DELIM +$$ LANGUAGE PLPGSQL; + + +/* STEP 7: Delete temp tables */ +DROP TABLE :SCHEMA.TranscriptEcUniverse; +DROP TABLE :SCHEMA.PathwayEcUniverse; +DROP TABLE :SCHEMA.TranscriptPathwayEcMatch; diff --git a/Model/lib/psql/webready/comparative/OrthologousTranscripts.psql b/Model/lib/psql/webready/comparative/OrthologousTranscripts.psql new file mode 100644 index 000000000..983e5239c --- /dev/null +++ b/Model/lib/psql/webready/comparative/OrthologousTranscripts.psql @@ -0,0 +1,48 @@ + drop table if exists :SCHEMA.SyntenicPairs; + + create UNLOGGED table :SCHEMA.SyntenicPairs as + select distinct ga.na_feature_id, sg.syn_na_feature_id + from apidb.SyntenicGene sg, :SCHEMA.GeneAttributes ga + where sg.na_sequence_id = ga.na_sequence_id + and sg.end_max >= ga.start_min + and sg.start_min <= ga.end_max + + ; + + create index SynPair_idx + on :SCHEMA.SyntenicPairs (na_feature_id, syn_na_feature_id) + ; + + create table :SCHEMA.OrthologousTranscripts as + with all_pairs + as (select ga.source_id + , ga.project_id + , ga.na_feature_id + , ota.source_id as ortho_source_id + , ota.gene_source_id as ortho_gene_source_id + , ota.project_id as ortho_project_id + , ota.gene_na_feature_id as ortho_na_feature_id + , ota.transcript_product as ortho_product + , ota.protein_length + , ga.name as ortho_name + , ota.organism as ortho_organism + , ota.taxon_id as ortho_taxon_id + , o.is_reference_strain + from :SCHEMA.Geneattributes ga + , :SCHEMA.TranscriptAttributes ota + , apidb.Organism o + where ga.ORTHOMCL_NAME = ota.ORTHOMCL_NAME + and ota.taxon_id = o.taxon_id + ), + syn_pairs + as (select na_feature_id, syn_na_feature_id, 1 as is_syntenic from :SCHEMA.SyntenicPairs + ) + select all_pairs.* + , coalesce(syn_pairs.is_syntenic, 0) as is_syntenic + from all_pairs + left join syn_pairs + on all_pairs.na_feature_id = syn_pairs.na_feature_id + and all_pairs.ortho_na_feature_id = syn_pairs.syn_na_feature_id + ; + + drop table if exists :SCHEMA.SyntenicPairs; diff --git a/Model/lib/psql/webready/comparative/OrthologousTranscripts_ix.psql b/Model/lib/psql/webready/comparative/OrthologousTranscripts_ix.psql new file mode 100644 index 000000000..677049f72 --- /dev/null +++ b/Model/lib/psql/webready/comparative/OrthologousTranscripts_ix.psql @@ -0,0 +1,12 @@ + + create index ot_idx + on :SCHEMA.OrthologousTranscripts (source_id, project_id, is_syntenic desc, ortho_source_id, + ortho_project_id, ortho_gene_source_id, ortho_product, + ortho_name, ortho_organism, ortho_taxon_id, is_reference_strain) + ; + + create index ot_smol_idx + on :SCHEMA.OrthologousTranscripts (is_syntenic, ortho_taxon_id, source_id, ortho_source_id, + ortho_project_id, ortho_gene_source_id) + ; + diff --git a/Model/lib/psql/webready/comparative/PhyleticPattern.psql b/Model/lib/psql/webready/comparative/PhyleticPattern.psql new file mode 100644 index 000000000..1fad7add7 --- /dev/null +++ b/Model/lib/psql/webready/comparative/PhyleticPattern.psql @@ -0,0 +1,395 @@ + + +CREATE TABLE :SCHEMA.PhyleticPattern AS + (SELECT actual.group_name, + actual.alveolata as alveolata_actual, + total.alveolata as alveolata_total, + round(100*actual.alveolata/total.alveolata,0) AS alveolata_percent, + actual.archaea as archaea_actual, + total.archaea as archaea_total, + round(100*actual.archaea/total.archaea,0) AS archaea_percent, + actual.amoeba as amoeba_actual, + total.amoeba as amoeba_total, + round(100*actual.amoeba/total.amoeba,0) AS amoeba_percent, + actual.bacteria as bacteria_actual, + total.bacteria as bacteria_total, + round(100*actual.bacteria/total.bacteria,0) AS bacteria_percent, + actual.fungi as fungi_actual, + total.fungi as fungi_total, + round(100*actual.fungi/total.fungi,0) AS fungi_percent, + actual.euglenozoa as euglenozoa_actual, + total.euglenozoa as euglenozoa_total, + round(100*actual.euglenozoa/total.euglenozoa,0) AS euglenozoa_percent, + actual.metazoa as metazoa_actual, + total.metazoa as metazoa_total, + round(100*actual.metazoa/total.metazoa,0) AS metazoa_percent, + actual.viridiplantae as viridiplantae_actual, + total.viridiplantae as viridiplantae_total, + round(100*actual.viridiplantae/total.viridiplantae,0) AS viridiplantae_percent, + actual.other_eukaryotes as other_eukaryotes_actual, + total.other_eukaryotes as other_eukaryotes_total, + round(100*actual.other_eukaryotes/total.other_eukaryotes,0) AS other_eukaryotes_percent, + actual.bacteria_firm as bacteria_firm_actual, + total.bacteria_firm as bacteria_firm_total, + actual.bacteria_proa as bacteria_proa_actual, + total.bacteria_proa as bacteria_proa_total, + actual.bacteria_prob as bacteria_prob_actual, + total.bacteria_prob as bacteria_prob_total, + actual.bacteria_prod as bacteria_prod_actual, + total.bacteria_prod as bacteria_prod_total, + actual.bacteria_proe as bacteria_proe_actual, + total.bacteria_proe as bacteria_proe_total, + actual.bacteria_prog as bacteria_prog_actual, + total.bacteria_prog as bacteria_prog_total, + actual.bacteria_obac as bacteria_obac_actual, + total.bacteria_obac as bacteria_obac_total, + actual.archaea_arch as archaea_arch_actual, + total.archaea_arch as archaea_arch_total, + actual.archaea_eury as archaea_eury_actual, + total.archaea_eury as archaea_eury_total, + actual.archaea_cren as archaea_cren_actual, + total.archaea_cren as archaea_cren_total, + actual.archaea_nano as archaea_nano_actual, + total.archaea_nano as archaea_nano_total, + actual.archaea_kora as archaea_kora_actual, + total.archaea_kora as archaea_kora_total, + --oeuk_genera.html as oeuk_genera_html, + --eugl_genera.html as eugl_genera_html, + --amoe_genera.html as amoe_genera_html, + actual.alveolata_alve as alveolata_alve_actual, + total.alveolata_alve as alveolata_alve_total, + actual.alveolata_cili as alveolata_cili_actual, + total.alveolata_cili as alveolata_cili_total, + actual.alveolata_apic as alveolata_apic_actual, + total.alveolata_apic as alveolata_apic_total, + actual.alveolata_cocc as alveolata_cocc_actual, + total.alveolata_cocc as alveolata_cocc_total, + actual.alveolata_haem as alveolata_haem_actual, + total.alveolata_haem as alveolata_haem_total, + actual.alveolata_piro as alveolata_piro_actual, + total.alveolata_piro as alveolata_piro_total, + actual.viridiplantae_stre as viridiplantae_stre_actual, + total.viridiplantae_stre as viridiplantae_stre_total, + actual.viridiplantae_chlo as viridiplantae_chlo_actual, + total.viridiplantae_chlo as viridiplantae_chlo_total, + actual.viridiplantae_rhod as viridiplantae_rhod_actual, + total.viridiplantae_rhod as viridiplantae_rhod_total, + actual.viridiplantae_cryp as viridiplantae_cryp_actual, + total.viridiplantae_cryp as viridiplantae_cryp_total, + actual.fungi_fung as fungi_fung_actual, + total.fungi_fung as fungi_fung_total, + actual.fungi_micr as fungi_micr_actual, + total.fungi_micr as fungi_micr_total, + actual.fungi_basi as fungi_basi_actual, + total.fungi_basi as fungi_basi_total, + actual.fungi_asco as fungi_asco_actual, + total.fungi_asco as fungi_asco_total, + actual.fungi_muco as fungi_muco_actual, + total.fungi_muco as fungi_muco_total, + actual.fungi_chyt as fungi_chyt_actual, + total.fungi_chyt as fungi_chyt_total, + actual.metazoa_omet as metazoa_omet_actual, + total.metazoa_omet as metazoa_omet_total, + actual.metazoa_nema as metazoa_nema_actual, + total.metazoa_nema as metazoa_nema_total, + actual.metazoa_arth as metazoa_arth_actual, + total.metazoa_arth as metazoa_arth_total, + actual.metazoa_chor as metazoa_chor_actual, + total.metazoa_chor as metazoa_chor_total, + actual.metazoa_acti as metazoa_acti_actual, + total.metazoa_acti as metazoa_acti_total, + actual.metazoa_aves as metazoa_aves_actual, + total.metazoa_aves as metazoa_aves_total, + actual.metazoa_mamm as metazoa_mamm_actual, + total.metazoa_mamm as metazoa_mamm_total, + actual.metazoa_tuni as metazoa_tuni_actual, + total.metazoa_tuni as metazoa_tuni_total + FROM + (SELECT SUM(CASE clade WHEN 'alveolata' THEN num ELSE 0 END) as alveolata, + SUM(CASE clade WHEN 'archaea' THEN num ELSE 0 END) as archaea, + SUM(CASE clade WHEN 'amoeba' THEN num ELSE 0 END) as amoeba, + SUM(CASE clade WHEN 'bacteria' THEN num ELSE 0 END) as bacteria, + SUM(CASE clade WHEN 'fungi' THEN num ELSE 0 END) as fungi, + SUM(CASE clade WHEN 'euglenozoa' THEN num ELSE 0 END) as euglenozoa, + SUM(CASE clade WHEN 'metazoa' THEN num ELSE 0 END) as metazoa, + SUM(CASE clade WHEN 'viridiplantae' THEN num ELSE 0 END) as viridiplantae, + SUM(CASE clade WHEN 'other_eukaryotes' THEN num ELSE 0 END) as other_eukaryotes, + SUM(CASE clade WHEN 'FIRM' THEN num ELSE 0 END) as bacteria_firm, + SUM(CASE clade WHEN 'PROA' THEN num ELSE 0 END) as bacteria_proa, + SUM(CASE clade WHEN 'PROB' THEN num ELSE 0 END) as bacteria_prob, + SUM(CASE clade WHEN 'PROD' THEN num ELSE 0 END) as bacteria_prod, + SUM(CASE clade WHEN 'PROE' THEN num ELSE 0 END) as bacteria_proe, + SUM(CASE clade WHEN 'PROG' THEN num ELSE 0 END) as bacteria_prog, + SUM(CASE clade WHEN 'OBAC' THEN num ELSE 0 END) as bacteria_obac, + SUM(CASE clade WHEN 'ARCH' THEN num ELSE 0 END) as archaea_arch, + SUM(CASE clade WHEN 'EURY' THEN num ELSE 0 END) as archaea_eury, + SUM(CASE clade WHEN 'CREN' THEN num ELSE 0 END) as archaea_cren, + SUM(CASE clade WHEN 'NANO' THEN num ELSE 0 END) as archaea_nano, + SUM(CASE clade WHEN 'KORA' THEN num ELSE 0 END) as archaea_kora, + SUM(CASE clade WHEN 'ALVE' THEN num ELSE 0 END) as alveolata_alve, + SUM(CASE clade WHEN 'CILI' THEN num ELSE 0 END) as alveolata_cili, + SUM(CASE clade WHEN 'APIC' THEN num ELSE 0 END) as alveolata_apic, + SUM(CASE clade WHEN 'COCC' THEN num ELSE 0 END) as alveolata_cocc, + SUM(CASE clade WHEN 'HAEM' THEN num ELSE 0 END) as alveolata_haem, + SUM(CASE clade WHEN 'PIRO' THEN num ELSE 0 END) as alveolata_piro, + SUM(CASE clade WHEN 'STRE' THEN num ELSE 0 END) as viridiplantae_stre, + SUM(CASE clade WHEN 'CHLO' THEN num ELSE 0 END) as viridiplantae_chlo, + SUM(CASE clade WHEN 'RHOD' THEN num ELSE 0 END) as viridiplantae_rhod, + SUM(CASE clade WHEN 'CRYP' THEN num ELSE 0 END) as viridiplantae_cryp, + SUM(CASE clade WHEN 'FUNG' THEN num ELSE 0 END) as fungi_fung, + SUM(CASE clade WHEN 'MICR' THEN num ELSE 0 END) as fungi_micr, + SUM(CASE clade WHEN 'BASI' THEN num ELSE 0 END) as fungi_basi, + SUM(CASE clade WHEN 'ASCO' THEN num ELSE 0 END) as fungi_asco, + SUM(CASE clade WHEN 'MUCO' THEN num ELSE 0 END) as fungi_muco, + SUM(CASE clade WHEN 'CHYT' THEN num ELSE 0 END) as fungi_chyt, + SUM(CASE clade WHEN 'OMET' THEN num ELSE 0 END) as metazoa_omet, + SUM(CASE clade WHEN 'NEMA' THEN num ELSE 0 END) as metazoa_nema, + SUM(CASE clade WHEN 'ARTH' THEN num ELSE 0 END) as metazoa_arth, + SUM(CASE clade WHEN 'CHOR' THEN num ELSE 0 END) as metazoa_chor, + SUM(CASE clade WHEN 'ACTI' THEN num ELSE 0 END) as metazoa_acti, + SUM(CASE clade WHEN 'AVES' THEN num ELSE 0 END) as metazoa_aves, + SUM(CASE clade WHEN 'MAMM' THEN num ELSE 0 END) as metazoa_mamm, + SUM(CASE clade WHEN 'TUNI' THEN num ELSE 0 END) as metazoa_tuni + FROM ( + (WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'BACT' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'bacteria' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'ARCH' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'archaea' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'ALVE' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'alveolata' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'AMOE' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'amoeba' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'EUGL' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'euglenozoa' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'VIRI' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'viridiplantae' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'FUNG' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'fungi' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'META' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'metazoa' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + WITH RECURSIVE TaxonHierarchy AS ( + SELECT orthomcl_clade_id, parent_id + FROM apidb.OrthomclClade + WHERE orthomcl_clade_id IN ( + SELECT orthomcl_clade_id + FROM apidb.OrthomclClade + WHERE three_letter_abbrev = 'OEUK' + ) UNION ALL + SELECT child.orthomcl_clade_id,child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT 'other_eukaryotes' AS clade, COUNT(*) AS num + FROM TaxonHierarchy th + JOIN apidb.OrthomclClade c ON th.orthomcl_clade_id = c.orthomcl_clade_id + WHERE c.core_peripheral IN ('C', 'P') + + ) UNION ( + SELECT b.three_letter_abbrev AS clade,a.num + FROM (SELECT parent_id, COUNT(orthomcl_clade_id) AS num + FROM apidb.OrthomclClade + WHERE core_peripheral in ('C','P') + GROUP BY parent_id) a, + apidb.OrthomclClade b + WHERE a.parent_id = b.orthomcl_clade_id + ) + )) total, + (SELECT name as group_name, + SUM(CASE three_letter_abbrev WHEN 'ALVE' THEN number_of_taxa ELSE 0 END) as alveolata, + SUM(CASE three_letter_abbrev WHEN 'ARCH' THEN number_of_taxa ELSE 0 END) as archaea, + SUM(CASE three_letter_abbrev WHEN 'AMOE' THEN number_of_taxa ELSE 0 END) as amoeba, + SUM(CASE three_letter_abbrev WHEN 'BACT' THEN number_of_taxa ELSE 0 END) as bacteria, + SUM(CASE three_letter_abbrev WHEN 'FUNG' THEN number_of_taxa ELSE 0 END) as fungi, + SUM(CASE three_letter_abbrev WHEN 'EUGL' THEN number_of_taxa ELSE 0 END) as euglenozoa, + SUM(CASE three_letter_abbrev WHEN 'META' THEN number_of_taxa ELSE 0 END) as metazoa, + SUM(CASE three_letter_abbrev WHEN 'VIRI' THEN number_of_taxa ELSE 0 END) as viridiplantae, + SUM(CASE three_letter_abbrev WHEN 'OEUK' THEN number_of_taxa ELSE 0 END) as other_eukaryotes, + SUM(CASE three_letter_abbrev WHEN 'FIRM' THEN number_of_taxa ELSE 0 END) as bacteria_firm, + SUM(CASE three_letter_abbrev WHEN 'PROA' THEN number_of_taxa ELSE 0 END) as bacteria_proa, + SUM(CASE three_letter_abbrev WHEN 'PROB' THEN number_of_taxa ELSE 0 END) as bacteria_prob, + SUM(CASE three_letter_abbrev WHEN 'PROD' THEN number_of_taxa ELSE 0 END) as bacteria_prod, + SUM(CASE three_letter_abbrev WHEN 'PROE' THEN number_of_taxa ELSE 0 END) as bacteria_proe, + SUM(CASE three_letter_abbrev WHEN 'PROG' THEN number_of_taxa ELSE 0 END) as bacteria_prog, + SUM(CASE three_letter_abbrev WHEN 'OBAC' THEN number_of_taxa ELSE 0 END) as bacteria_obac, + SUM(CASE three_letter_abbrev WHEN 'ARCH' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'EURY' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'CREN' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'NANO' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'KORA' THEN number_of_taxa ELSE 0 END) as archaea_arch, + SUM(CASE three_letter_abbrev WHEN 'EURY' THEN number_of_taxa ELSE 0 END) as archaea_eury, + SUM(CASE three_letter_abbrev WHEN 'CREN' THEN number_of_taxa ELSE 0 END) as archaea_cren, + SUM(CASE three_letter_abbrev WHEN 'NANO' THEN number_of_taxa ELSE 0 END) as archaea_nano, + SUM(CASE three_letter_abbrev WHEN 'KORA' THEN number_of_taxa ELSE 0 END) as archaea_kora, + SUM(CASE three_letter_abbrev WHEN 'ALVE' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'CILI' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'APIC' THEN number_of_taxa ELSE 0 END) as alveolata_alve, + SUM(CASE three_letter_abbrev WHEN 'CILI' THEN number_of_taxa ELSE 0 END) as alveolata_cili, + SUM(CASE three_letter_abbrev WHEN 'APIC' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'COCC' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'ACON' THEN number_of_taxa ELSE 0 END) as alveolata_apic, + SUM(CASE three_letter_abbrev WHEN 'COCC' THEN number_of_taxa ELSE 0 END) as alveolata_cocc, + SUM(CASE three_letter_abbrev WHEN 'HAEM' THEN number_of_taxa ELSE 0 END) as alveolata_haem, + SUM(CASE three_letter_abbrev WHEN 'PIRO' THEN number_of_taxa ELSE 0 END) as alveolata_piro, + SUM(CASE three_letter_abbrev WHEN 'STRE' THEN number_of_taxa ELSE 0 END) as viridiplantae_stre, + SUM(CASE three_letter_abbrev WHEN 'CHLO' THEN number_of_taxa ELSE 0 END) as viridiplantae_chlo, + SUM(CASE three_letter_abbrev WHEN 'RHOD' THEN number_of_taxa ELSE 0 END) as viridiplantae_rhod, + SUM(CASE three_letter_abbrev WHEN 'CRYP' THEN number_of_taxa ELSE 0 END) as viridiplantae_cryp, + SUM(CASE three_letter_abbrev WHEN 'FUNG' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'MICR' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'BASI' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'ASCO' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'MUCO' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'CHYT' THEN number_of_taxa ELSE 0 END) as fungi_fung, + SUM(CASE three_letter_abbrev WHEN 'MICR' THEN number_of_taxa ELSE 0 END) as fungi_micr, + SUM(CASE three_letter_abbrev WHEN 'BASI' THEN number_of_taxa ELSE 0 END) as fungi_basi, + SUM(CASE three_letter_abbrev WHEN 'ASCO' THEN number_of_taxa ELSE 0 END) as fungi_asco, + SUM(CASE three_letter_abbrev WHEN 'MUCO' THEN number_of_taxa ELSE 0 END) as fungi_muco, + SUM(CASE three_letter_abbrev WHEN 'CHYT' THEN number_of_taxa ELSE 0 END) as fungi_chyt, + SUM(CASE three_letter_abbrev WHEN 'OMET' THEN number_of_taxa ELSE 0 END) as metazoa_omet, + SUM(CASE three_letter_abbrev WHEN 'NEMA' THEN number_of_taxa ELSE 0 END) as metazoa_nema, + SUM(CASE three_letter_abbrev WHEN 'ARTH' THEN number_of_taxa ELSE 0 END) as metazoa_arth, + SUM(CASE three_letter_abbrev WHEN 'CHOR' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'ACTI' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'AVES' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'MAMM' THEN number_of_taxa ELSE 0 END) - + SUM(CASE three_letter_abbrev WHEN 'TUNI' THEN number_of_taxa ELSE 0 END) as metazoa_chor, + SUM(CASE three_letter_abbrev WHEN 'ACTI' THEN number_of_taxa ELSE 0 END) as metazoa_acti, + SUM(CASE three_letter_abbrev WHEN 'AVES' THEN number_of_taxa ELSE 0 END) as metazoa_aves, + SUM(CASE three_letter_abbrev WHEN 'MAMM' THEN number_of_taxa ELSE 0 END) as metazoa_mamm, + SUM(CASE three_letter_abbrev WHEN 'TUNI' THEN number_of_taxa ELSE 0 END) as metazoa_tuni + FROM (SELECT og.group_id AS name,agt.three_letter_abbrev,agt.number_of_taxa::numeric + FROM ApiDB.OrthologGroupTaxon agt, apidb.orthologgroup og + WHERE agt.group_id = og.group_id + AND agt.three_letter_abbrev = UPPER(agt.three_letter_abbrev)) + GROUP BY name) actual +) + + ; + diff --git a/Model/lib/psql/webready/comparative/ProteinDomainAssignment.psql b/Model/lib/psql/webready/comparative/ProteinDomainAssignment.psql new file mode 100644 index 000000000..5ba8d0129 --- /dev/null +++ b/Model/lib/psql/webready/comparative/ProteinDomainAssignment.psql @@ -0,0 +1,38 @@ + create table :SCHEMA.ProteinDomainAssignment as + select sa.full_id, sa.group_name, + r.interpro_primary_id as accession, + r.interpro_desc as description, + CAST (NULL as NUMERIC) as domain_index, + sa.aa_sequence_id, + r.interpro_start_min as start_min, + r.interpro_end_min as end_max + from :SCHEMA.ProteinSequenceGroup sa, apidb.interproresults r + where sa.full_id = r.protein_source_id + and upper(r.interpro_db_name) = 'PFAM' +; + + create index domain_accession_ix + on :SCHEMA.ProteinDomainAssignment (accession, full_id, group_name) + ; + + drop table if exists :SCHEMA.domainIndex_tmp +; + + create table :SCHEMA.domainIndex_tmp as + select row_number() OVER () as domain_index, accession + from (select distinct accession + from :SCHEMA.ProteinDomainAssignment + order by accession) +; + + create index domainIdxIdx on :SCHEMA.DomainIndex_tmp(accession, domain_index) +; + + update :SCHEMA.ProteinDomainAssignment da + set domain_index = (select domain_index + from :SCHEMA.DomainIndex_tmp + where accession = da.accession) +; + + drop table :SCHEMA.domainIndex_tmp + ; \ No newline at end of file diff --git a/Model/lib/psql/webready/comparative/ProteinDomainAssignment_ix.psql b/Model/lib/psql/webready/comparative/ProteinDomainAssignment_ix.psql new file mode 100644 index 000000000..7fe3cdc3c --- /dev/null +++ b/Model/lib/psql/webready/comparative/ProteinDomainAssignment_ix.psql @@ -0,0 +1,11 @@ + create index domain_ix_ix + on :SCHEMA.ProteinDomainAssignment (domain_index, accession, full_id) +; + + create index domain_group_ix + on :SCHEMA.ProteinDomainAssignment (group_name, accession, full_id) +; + + create index domain_seq_ix + on :SCHEMA.ProteinDomainAssignment (aa_sequence_id, accession, full_id, group_name) +; \ No newline at end of file diff --git a/Model/lib/psql/webready/comparative/ProteinSequenceGroup.psql b/Model/lib/psql/webready/comparative/ProteinSequenceGroup.psql new file mode 100644 index 000000000..de2df5a82 --- /dev/null +++ b/Model/lib/psql/webready/comparative/ProteinSequenceGroup.psql @@ -0,0 +1,92 @@ + create table :SCHEMA.ProteinSequenceGroup as + SELECT + distinct(aas.source_id) AS full_id, + aas.source_id, + aas.aa_sequence_id, + length(aas.sequence) as length, + aas.description AS product, + aas.taxon_id, + + taxon.orthomcl_taxon_id, + taxon.taxon_group, + taxon.orthomcl_abbrev AS taxon_abbreviation, + taxon.name AS organism_name, + taxon.core_peripheral, + + o.group_id AS group_name, + o.ortholog_group_id, + o.number_of_members AS group_size, + o.number_of_core_members, + o.number_of_peripheral_members, + CASE is_residual WHEN 1 THEN 'Residual' + ELSE 'Core' END AS group_type, + urls.source_url, + urls.source_text + FROM + dots.AASequence aas, + apidb.orthologGroup o, + apidb.orthologGroupAASequence ogseq, + ( + SELECT o.orthomcl_abbrev, + o.taxon_id as orthomcl_taxon_id, + t.name, + t.core_peripheral, + t.taxon_group + FROM apidb.organism o, + (WITH RECURSIVE TaxonHierarchy AS ( + SELECT + three_letter_abbrev, + orthomcl_clade_id, + name, + core_peripheral, + name AS taxon_group, + parent_id + FROM apidb.OrthomclClade + WHERE name IN ('Archaea', 'Bacteria', 'Alveolates', 'Amoebozoa', 'Euglenozoa', + 'Fungi', 'Metazoa', 'Other Eukaryota', 'Viridiplantae') + UNION ALL + SELECT + child.three_letter_abbrev, + child.orthomcl_clade_id, + child.name, + child.core_peripheral, + parent.taxon_group, + child.parent_id + FROM apidb.OrthomclClade child + JOIN TaxonHierarchy parent ON child.parent_id = parent.orthomcl_clade_id + ) + SELECT three_letter_abbrev, taxon_group, name, core_peripheral + FROM TaxonHierarchy + WHERE core_peripheral IN ('C', 'P') + ) t + WHERE t.three_letter_abbrev = o.orthomcl_abbrev + ) taxon, + + ( + SELECT aas.aa_sequence_id, + CASE + WHEN ores.resource_name IN ('AmoebaDB','CryptoDB','FungiDB','GiardiaDB','HostDB','MicrosporidiaDB', + 'PlasmoDB','PiroplasmaDB','ToxoDB','TrichDB','TriTrypDB','VectorBase') + THEN SUBSTR(ores.resource_url, 0, strpos(ores.resource_url, '/downloads')) || 'record/gene/' + || aas.source_id + WHEN ores.resource_name = 'Uniprot' + THEN SUBSTR(ores.resource_url, 0, strpos(ores.resource_url, '/proteomes') ) || 'uniprot/' + || aas.source_id + ELSE '' END AS source_url, + CASE WHEN ores.resource_name IS NULL THEN '' + ELSE aas.source_id || ' (' || ores.resource_name || ')' END AS source_text + FROM dots.AaSequence aas, + apidb.organism ot, + apidb.orthomclresource ores + WHERE ot.taxon_id = ores.orthomcl_taxon_id + AND ot.taxon_id = aas.taxon_id) urls + WHERE aas.aa_sequence_id = ogseq.aa_sequence_id + AND ogseq.group_id = o.group_id + AND aas.aa_sequence_id = urls.aa_sequence_id + AND taxon.orthomcl_taxon_id = aas.taxon_id + AND aas.taxon_id in (select distinct(eas.taxon_id) from apidb.organism og, dots.aasequence eas where eas.taxon_id = og.taxon_id) +; + + alter table :SCHEMA.ProteinSequenceGroup + add constraint SeqAttrs_pk primary key (full_id) +; \ No newline at end of file diff --git a/Model/lib/psql/webready/comparative/ProteinSequenceGroup_ix.psql b/Model/lib/psql/webready/comparative/ProteinSequenceGroup_ix.psql new file mode 100644 index 000000000..4a136d10f --- /dev/null +++ b/Model/lib/psql/webready/comparative/ProteinSequenceGroup_ix.psql @@ -0,0 +1,15 @@ + create unique index PSG_idx ON :SCHEMA.ProteinSequenceGroup (full_id, group_name, taxon_id, source_id) +; + + create unique index PSG_gusIdx ON :SCHEMA.ProteinSequenceGroup (ortholog_group_id, aa_sequence_id) +; + + create unique index PSG_idx2 ON :SCHEMA.ProteinSequenceGroup (group_name, length desc, full_id, taxon_id) +; + + create unique index PSG_idx3 + on :SCHEMA.ProteinSequenceGroup (aa_sequence_id, group_name, ortholog_group_id, orthomcl_taxon_id, taxon_id) + ; + + create unique index PSG_idx4 ON :SCHEMA.ProteinSequenceGroup (source_id, full_id, group_name, taxon_id) +; diff --git a/Model/lib/psql/webready/comparative/Undo_LoadOrthologTables.psql b/Model/lib/psql/webready/comparative/Undo_LoadOrthologTables.psql new file mode 100644 index 000000000..9a50c36b6 --- /dev/null +++ b/Model/lib/psql/webready/comparative/Undo_LoadOrthologTables.psql @@ -0,0 +1,4 @@ +/* This script truncates the tables loaded by LoadTranscriptPathway */ +/* The table is not dropped and the partitions are retained */ +Truncate table :SCHEMA.GeneOrthologGroup; +Truncate table :SCHEMA.TranscriptOrthologGroup; diff --git a/Model/lib/psql/webready/comparative/Undo_LoadPathwaysGeneTable.psql b/Model/lib/psql/webready/comparative/Undo_LoadPathwaysGeneTable.psql new file mode 100644 index 000000000..6c838ced5 --- /dev/null +++ b/Model/lib/psql/webready/comparative/Undo_LoadPathwaysGeneTable.psql @@ -0,0 +1,3 @@ +/* This script truncates the tables loaded by LoadPathwaysGeneTable */ + +TRUNCATE TABLE :SCHEMA.PathwaysGeneTable; diff --git a/Model/lib/psql/webready/comparative/Undo_LoadTranscriptPathway.psql b/Model/lib/psql/webready/comparative/Undo_LoadTranscriptPathway.psql new file mode 100644 index 000000000..329c22d30 --- /dev/null +++ b/Model/lib/psql/webready/comparative/Undo_LoadTranscriptPathway.psql @@ -0,0 +1,5 @@ +/* This script truncates the tables loaded by LoadTranscriptPathway */ +/* The table is not dropped and the partitions are retained */ + +TRUNCATE TABLE :SCHEMA.TranscriptPathway; +TRUNCATE TABLE :SCHEMA.TranscriptEC; diff --git a/Model/lib/psql/webready/global/CompoundAttributes.psql b/Model/lib/psql/webready/global/CompoundAttributes.psql new file mode 100644 index 000000000..aa622e1f3 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundAttributes.psql @@ -0,0 +1,19 @@ + CREATE TABLE :SCHEMA.CompoundAttributes AS + SELECT p.ID + , p.source_id + , p.compound_name + , string_agg(childc.other_names, ';' ORDER BY childc.other_names) AS other_names + , string_agg(childc.iupac_name, ';' ORDER BY childc.iupac_name) AS iupac_name + , string_agg(childc.syn, ';' ORDER BY childc.syn) AS syn + , p.definition + , p.secondary_ids + , string_agg(childc.formula, ';' ORDER BY childc.formula) AS formula + , avg(childc.mass::numeric) AS mass + FROM :SCHEMA.CompoundProperties p + , (SELECT id, parent_id, other_names, iupac_name, syn, mass, formula FROM :SCHEMA.CompoundProperties ) childc + WHERE p.parent_id IS NULL + AND ( p.ID = childc.parent_id OR p.ID = childc.ID ) + GROUP BY p.ID, p.source_id, p.compound_name, p.definition, p.secondary_ids + ; + + diff --git a/Model/lib/psql/webready/global/CompoundAttributes_ix.psql b/Model/lib/psql/webready/global/CompoundAttributes_ix.psql new file mode 100644 index 000000000..2cdd87d86 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundAttributes_ix.psql @@ -0,0 +1,2 @@ + CREATE INDEX CompoundAttributes_idx ON :SCHEMA.CompoundAttributes (source_id) + ; diff --git a/Model/lib/psql/webready/global/CompoundId.psql b/Model/lib/psql/webready/global/CompoundId.psql new file mode 100644 index 000000000..9cd80c05a --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundId.psql @@ -0,0 +1,30 @@ + CREATE TABLE :SCHEMA.CompoundId AS + SELECT source_id AS id, source_id AS compound, 'same ID' AS type, '' as source + FROM :SCHEMA.CompoundAttributes + UNION + SELECT p.source_id AS id, ca.source_id AS compound, 'child ID' AS type, '' as source + FROM :SCHEMA.CompoundAttributes ca, :SCHEMA.CompoundProperties p + WHERE ca.id = p.parent_id + UNION + SELECT da.accession_number AS id, p.source_id AS compound, 'KEGG' AS type, '' as source + FROM chebi.database_accession da, :SCHEMA.CompoundAttributes p + WHERE da.type='KEGG COMPOUND accession' + AND da.compound_id = p.id + UNION + SELECT distinct da.accession_number AS id, p.chebi_accession AS compound, 'KEGG' as type, '' as source + FROM chebi.database_accession da, chebi.compounds c, chebi.compounds p + WHERE NOT p.status in ('D', 'F') AND da.type='KEGG COMPOUND accession' + AND da.compound_id = c.id AND c.parent_id=p.id + UNION + SELECT n.name as id, ca.source_id as compound, 'name' as type, n.source + FROM :SCHEMA.CompoundAttributes ca, chebi.names n + WHERE ca.id = n.compound_id + AND n.type = 'NAME' + UNION + SELECT n.name as id, ca.source_id as compound, 'synonym' as type, n.source + FROM :SCHEMA.CompoundAttributes ca, chebi.names n + WHERE ca.id = n.compound_id + AND n.type = 'SYNONYM' + ; + + diff --git a/Model/lib/psql/webready/global/CompoundId_ix.psql b/Model/lib/psql/webready/global/CompoundId_ix.psql new file mode 100644 index 000000000..20a155ef6 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundId_ix.psql @@ -0,0 +1,2 @@ + CREATE INDEX CompoundId_idx ON :SCHEMA.CompoundId (id, compound) + ; diff --git a/Model/lib/psql/webready/global/CompoundProperties.psql b/Model/lib/psql/webready/global/CompoundProperties.psql new file mode 100644 index 000000000..5faddf835 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundProperties.psql @@ -0,0 +1,26 @@ + CREATE TABLE :SCHEMA.CompoundProperties AS + SELECT c.ID, c.chebi_accession AS source_id, c.parent_id, + c.name AS compound_name, + substr(string_agg(cn.name, ';'), 1, 1000) AS other_names, + substr(string_agg(ciup.iupac_name, ';'), 1, 1000) AS iupac_name, + substr(string_agg(csyn.syn, ';'), 1, 1000) AS syn, + c.definition, m.mass, + string_agg(formu.formula, ';' order by formu.formula) AS formula, + string_agg(sec.chebi_accession, ';' order by sec.chebi_accession) AS secondary_ids + FROM chebi.compounds c + LEFT JOIN ( SELECT compound_id, NAME FROM chebi.names WHERE TYPE='NAME') + cn ON c.ID = cn.compound_id + LEFT JOIN ( SELECT compound_id, MIN(NAME) AS iupac_name FROM chebi.names WHERE TYPE='IUPAC NAME' GROUP BY compound_id) + ciup ON c.ID = ciup.compound_id + LEFT JOIN ( SELECT compound_id, MIN(NAME) AS syn FROM chebi.names WHERE type='SYNONYM' GROUP BY compound_id) + csyn ON c.ID = csyn.compound_id + LEFT JOIN ( SELECT compound_id, chemical_data AS formula FROM chebi.chemical_data WHERE TYPE='FORMULA') + formu ON c.ID = formu.compound_id + LEFT JOIN ( SELECT compound_id, chemical_data AS mass FROM chebi.chemical_data WHERE TYPE='MASS' and chemical_data != 'NaN') + m ON c.ID = m.compound_id + LEFT JOIN ( SELECT parent_id, chebi_accession FROM chebi.compounds) + sec ON c.ID = sec.parent_id + WHERE NOT c.status in ('D', 'F') + GROUP BY c.ID, c.chebi_accession, c.parent_id, c.name, c.definition, m.mass + ; + diff --git a/Model/lib/psql/webready/global/CompoundProperties_ix.psql b/Model/lib/psql/webready/global/CompoundProperties_ix.psql new file mode 100644 index 000000000..5943ec226 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundProperties_ix.psql @@ -0,0 +1,3 @@ +CREATE INDEX CompoundProperties_idx1 ON :SCHEMA.CompoundProperties (id); + + CREATE INDEX CompoundProperties_idx2 ON :SCHEMA.CompoundProperties (parent_id); \ No newline at end of file diff --git a/Model/lib/psql/webready/global/CompoundTypeAheads.psql b/Model/lib/psql/webready/global/CompoundTypeAheads.psql new file mode 100644 index 000000000..dfefb18f9 --- /dev/null +++ b/Model/lib/psql/webready/global/CompoundTypeAheads.psql @@ -0,0 +1,12 @@ + CREATE TABLE :SCHEMA.CompoundTypeAheads AS + SELECT ca.source_id AS compound_id, + ca.source_id || ' (' || ca.compound_name || ')' AS display + FROM :SCHEMA.CompoundAttributes ca, :SCHEMA.PathwayCompounds pc + WHERE pc.chebi_accession = ca.source_id + UNION + SELECT ca.source_id AS compound_id, + pc.compound_source_id || ' (' || ca.compound_name || ')' AS display + FROM :SCHEMA.CompoundAttributes ca, :SCHEMA.PathwayCompounds pc + WHERE pc.chebi_accession = ca.source_id + ; + diff --git a/Model/lib/psql/webready/global/OntologyLevels.psql b/Model/lib/psql/webready/global/OntologyLevels.psql new file mode 100644 index 000000000..2a80ec0fb --- /dev/null +++ b/Model/lib/psql/webready/global/OntologyLevels.psql @@ -0,0 +1,38 @@ + DROP TABLE IF EXISTS :SCHEMA.Is_a_links; + + CREATE UNLOGGED TABLE :SCHEMA.Is_a_links AS + SELECT subject_term_id, object_term_id + FROM sres.OntologyRelationship rel, sres.OntologyTerm pred + WHERE rel.predicate_term_id = pred.ontology_term_id + AND pred.name = 'is_a' + ; + + DROP TABLE IF EXISTS :SCHEMA.Roots; + + CREATE UNLOGGED TABLE :SCHEMA.Roots AS + SELECT object_term_id FROM :SCHEMA.is_a_links + EXCEPT + SELECT subject_term_id FROM :SCHEMA.is_a_links + ; + + CREATE TABLE :SCHEMA.OntologyLevels as + WITH RECURSIVE levels(ontology_term_id, depth) AS ( + SELECT object_term_id, 1 as depth FROM :SCHEMA.Roots + UNION + SELECT :SCHEMA.is_a_links.subject_term_id, levels.depth + 1 as depth + FROM :SCHEMA.Is_a_links, levels + WHERE :SCHEMA.is_a_links.object_term_id = levels.ontology_term_id + ) + SELECT ontology_term_id, min(depth) as min_depth, max(depth) as max_depth + FROM ( + SELECT ontology_term_id, depth + FROM levels + WHERE ontology_term_id NOT IN (SELECT object_term_id FROM :SCHEMA.Roots) + UNION + SELECT object_term_id, 0 FROM :SCHEMA.Roots + ) t + GROUP BY ontology_term_id + ; + +drop table :SCHEMA.Is_a_links; +drop table :SCHEMA.Roots; diff --git a/Model/lib/psql/webready/global/OntologyLevels_ix.psql b/Model/lib/psql/webready/global/OntologyLevels_ix.psql new file mode 100644 index 000000000..5f95d3889 --- /dev/null +++ b/Model/lib/psql/webready/global/OntologyLevels_ix.psql @@ -0,0 +1,2 @@ + create index olev_termix on :SCHEMA.OntologyLevels (ontology_term_id, min_depth, max_depth) + ; diff --git a/Model/lib/psql/webready/global/PathwayAttributes.psql b/Model/lib/psql/webready/global/PathwayAttributes.psql new file mode 100644 index 000000000..5c8567fff --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayAttributes.psql @@ -0,0 +1,52 @@ + drop table if exists :SCHEMA.PathwayAttributes; + + CREATE TABLE :SCHEMA.PathwayAttributes as + SELECT + p.source_id + , p.pathway_id + , p.name + , enz.total_enzyme_count + , cpd.total_compound_count + , p.url + , replace(replace(ed.name, 'Pathways_', ''), '_RSRC', '') as pathway_source + , ed.name as external_db_name + , edr.version as external_db_version + FROM + sres.pathway p + , sres.externalDatabase ed + , sres.externalDatabaseRelease edr + ,(SELECT + COUNT( *) AS total_compound_count + , pathway_id + FROM + sres.pathwayNode pn + , SRES.ontologyterm ot + WHERE + pn.pathway_node_type_id = ot.ontology_term_id + AND ot.name = 'molecular entity' + GROUP BY + pathway_id + ) cpd + ,(SELECT + COUNT( *) AS total_enzyme_count + , pathway_id + FROM + sres.pathwayNode pn + , SRES.ontologyterm ot + WHERE + pn.pathway_node_type_id = ot.ontology_term_id + AND ot.name = 'enzyme' + GROUP BY + pathway_id + ) enz + WHERE + ed.external_database_id = edr.external_database_id + AND edr.external_database_release_id = p.external_database_release_id + AND cpd.pathway_id = p.pathway_id + AND enz.pathway_id = p.pathway_id + AND source_id NOT IN('ec01100', 'ec01110', 'ec01120') + -- temporarily remove MPMP from release 46 + AND ed.name NOT LIKE '%MPMP%' + ; + + diff --git a/Model/lib/psql/webready/global/PathwayAttributes_ix.psql b/Model/lib/psql/webready/global/PathwayAttributes_ix.psql new file mode 100644 index 000000000..492ee6882 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayAttributes_ix.psql @@ -0,0 +1,7 @@ + CREATE UNIQUE index PathAttr_sourceId_pwaySrc + ON :SCHEMA.PathwayAttributes (source_id, pathway_source) + ; + + create index PathAttr_ix + on :SCHEMA.PathwayAttributes (pathway_id, source_id, name, pathway_source, total_enzyme_count, total_compound_count) + ; diff --git a/Model/lib/psql/webready/global/PathwayCompounds.psql b/Model/lib/psql/webready/global/PathwayCompounds.psql new file mode 100644 index 000000000..68fcc8290 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayCompounds.psql @@ -0,0 +1,81 @@ + CREATE TABLE :SCHEMA.PathwayCompounds AS + SELECT + pathway_id + , reaction_id + , ext_db_name + , ext_db_version + , compound_node_id + , compound_source_id + , c.chebi_accession + , case when c.chebi_accession is not null then 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=' || c.chebi_accession else null end as chebi_url + , type + FROM ( + SELECT + p.PATHWAY_ID + , prx.PATHWAY_REACTION_ID as reaction_id + , ed.NAME as ext_db_name + , edr.version as ext_db_version + , pn.pathway_node_id as compound_node_id + , pn.DISPLAY_LABEL as compound_source_id + , 'substrate' as type + , pn.row_id + FROM + APIDB.PATHWAYREACTION prx + , SRES.PATHWAY p + , APIDB.PATHWAYREACTIONREL prr + , SRES.PATHWAYNODE pn + , SRES.PATHWAYRELATIONSHIP prel + , SRES.ONTOLOGYTERM ot + , SRES.EXTERNALDATABASE ed + , SRES.EXTERNALDATABASERELEASE edr + WHERE p.PATHWAY_ID = prr.PATHWAY_ID + AND prx.PATHWAY_REACTION_ID = prr.PATHWAY_REACTION_ID + AND prr.PATHWAY_RELATIONSHIP_ID = prel.PATHWAY_RELATIONSHIP_ID + AND prel.NODE_ID = pn.PATHWAY_NODE_ID + AND ot.NAME = 'molecular entity' + AND ot.ONTOLOGY_TERM_ID = pn.PATHWAY_NODE_TYPE_ID + AND p.EXTERNAL_DATABASE_RELEASE_ID = edr.EXTERNAL_DATABASE_RELEASE_ID + AND edr.EXTERNAL_DATABASE_ID = ed.EXTERNAL_DATABASE_ID + ) t LEFT OUTER JOIN CHEBI.COMPOUNDS c on t.row_id = c.ID + UNION + SELECT + pathway_id + , reaction_id + , ext_db_name + , ext_db_version + , compound_node_id + , compound_source_id + , c.chebi_accession + , case when c.chebi_accession is not null then 'https://www.ebi.ac.uk/chebi/searchId.do?chebiId=' || c.chebi_accession else null end as chebi_url + , type + FROM ( + SELECT + p.PATHWAY_ID + , prx.PATHWAY_REACTION_ID as reaction_id + , ed.NAME as ext_db_name + , edr.version as ext_db_version + , pn.pathway_node_id as compound_node_id + , pn.DISPLAY_LABEL as compound_source_id + , 'product' as type + , pn.row_id + FROM + APIDB.PATHWAYREACTION prx + , SRES.PATHWAY p + , APIDB.PATHWAYREACTIONREL prr + , SRES.PATHWAYNODE pn + , SRES.PATHWAYRELATIONSHIP prel + , SRES.ONTOLOGYTERM ot + , SRES.EXTERNALDATABASE ed + , SRES.EXTERNALDATABASERELEASE edr + WHERE p.PATHWAY_ID = prr.PATHWAY_ID + AND prx.PATHWAY_REACTION_ID = prr.PATHWAY_REACTION_ID + AND prr.PATHWAY_RELATIONSHIP_ID = prel.PATHWAY_RELATIONSHIP_ID + AND prel.ASSOCIATED_NODE_ID = pn.PATHWAY_NODE_ID + AND ot.NAME = 'molecular entity' + AND ot.ONTOLOGY_TERM_ID = pn.PATHWAY_NODE_TYPE_ID + AND p.EXTERNAL_DATABASE_RELEASE_ID = edr.EXTERNAL_DATABASE_RELEASE_ID + AND edr.EXTERNAL_DATABASE_ID = ed.EXTERNAL_DATABASE_ID + ) t2 LEFT OUTER JOIN CHEBI.COMPOUNDS c on t2.row_id = c.ID + ; + + diff --git a/Model/lib/psql/webready/global/PathwayCompounds_ix.psql b/Model/lib/psql/webready/global/PathwayCompounds_ix.psql new file mode 100644 index 000000000..3aae4dbb5 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayCompounds_ix.psql @@ -0,0 +1,3 @@ + create index PthCmpd_id_ix + on :SCHEMA.PathwayCompounds (pathway_id, reaction_id, ext_db_name) + ; diff --git a/Model/lib/psql/webready/global/PathwayEC.psql b/Model/lib/psql/webready/global/PathwayEC.psql new file mode 100644 index 000000000..6a688da18 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayEC.psql @@ -0,0 +1,31 @@ + drop table if exists :SCHEMA.PathwayEC; + + /* this table represents the universe of EC numbers that are associated with enzyme nodes in metabolic pathways*/ + + CREATE TABLE :SCHEMA.PathwayEC as + + SELECT DISTINCT ec.enzyme_class_id -- use this for joining back to pathways later + , ec.ec_number -- useful for quick exact matches + , ec.ec_number_1 --have the 4 EC number components separately avoids lots of like statements later + , ec.ec_number_2 + , ec.ec_number_3 + , ec.ec_number_4 + , regexp_count(ec.ec_number, '-') as wildcard_count -- how many of the enzyme number positions are unknown + , pn.pathway_id + , p.external_database_release_id + + FROM sres.EnzymeClass ec + , sres.OntologyTerm ot + , sres.PathwayNode pn + , sres.Pathway p + -- find all pathway nodes representing enzymes + + WHERE pn.pathway_node_type_id = ot.ontology_term_id + AND ot.name = 'enzyme' + AND p.pathway_id = pn.pathway_id + -- we don't want the root or this gets matched to everything! + AND pn.display_label != '-.-.-.-' + -- now we can get the enzyme details for the enzyme pathway nodes + AND ec.enzyme_class_id = pn.row_id + ; + diff --git a/Model/lib/psql/webready/global/PathwayEC_ix.psql b/Model/lib/psql/webready/global/PathwayEC_ix.psql new file mode 100644 index 000000000..0c652289a --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayEC_ix.psql @@ -0,0 +1,6 @@ +CREATE INDEX PathwayEC_1_idx ON :SCHEMA.PathwayEC (pathway_id, ec_number, external_database_release_id) + ; + + +CREATE INDEX PathwayEC_2_idx ON :SCHEMA.PathwayEC (ec_number_1, ec_number_2, ec_number_3, ec_number_4, enzyme_class_id, ec_number, wildcard_count) + ; diff --git a/Model/lib/psql/webready/global/PathwayNodes.psql b/Model/lib/psql/webready/global/PathwayNodes.psql new file mode 100644 index 000000000..bcd097928 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayNodes.psql @@ -0,0 +1,304 @@ + DROP TABLE IF EXISTS :SCHEMA.NodesWithTypes; + + CREATE UNLOGGED TABLE :SCHEMA.NodesWithTypes AS + SELECT pn.pathway_id + , CASE WHEN pa.name IS NOT NULL THEN pa.name ELSE pn.display_label END AS display_label + , pa.url + , CASE WHEN pa.name IS NOT NULL THEN pa.name ELSE pn.display_label END AS name + , pa.source_id AS node_identifier + , pn.pathway_node_id AS pathway_node_id + , pn.x + , pn.y + , pn.width + , pn.height + , pn.cellular_location + , ot.name AS type + , NULL AS default_structure + FROM sres.pathwaynode pn + INNER JOIN sres.ontologyterm ot ON pn.pathway_node_type_id = ot.ontology_term_id + LEFT JOIN :SCHEMA.PathwayAttributes pa ON pn.display_label = pa.source_id + WHERE ot.name = 'metabolic process' + UNION ALL + SELECT pn.pathway_id + , pn.display_label + , NULL AS url + , coalesce(ec.description, pn.display_label) AS name + , ec.ec_number AS node_identifier + , pn.pathway_node_id AS pathway_node_id + , pn.x + , pn.y + , pn.width + , pn.height + , pn.cellular_location + , ot.name AS type + , NULL AS default_structure + FROM sres.pathwaynode pn + INNER JOIN sres.ontologyterm ot ON pn.pathway_node_type_id = ot.ontology_term_id + LEFT JOIN sres.enzymeclass ec ON pn.row_id = ec.enzyme_class_id + WHERE ot.name = 'enzyme' + GROUP BY pn.pathway_id + , pn.display_label + , ec.description + , ec.ec_number + , pn.pathway_node_id + , pn.x + , pn.y + , pn.width + , pn.height + , pn.cellular_location + , ot.name + UNION ALL + SELECT pn.pathway_id + , pn.display_label + , NULL AS url + , coalesce(c.name, pn.display_label) AS name + , c.chebi_accession AS node_identifier + , pn.pathway_node_id AS pathway_node_id + , pn.x + , pn.y + , pn.width + , pn.height + , pn.cellular_location + , ot.name AS type + , st.default_structure + FROM sres.pathwaynode pn + INNER JOIN sres.ontologyterm ot ON pn.pathway_node_type_id = ot.ontology_term_id + LEFT JOIN chebi.compounds c ON pn.row_id = c.id + LEFT JOIN ( + SELECT n.compound + , s.structure AS default_structure + FROM chebi.structures s + , (SELECT id + , compound + FROM :SCHEMA.CompoundId + WHERE type IN ('same ID', 'child ID') + ) n + WHERE n.id = 'CHEBI:' || s.compound_id + AND s.type = 'mol' + AND s.dimension = '2D' + AND s.default_structure = 'Y' + ) st ON c.chebi_accession = st.compound + WHERE ot.name = 'molecular entity' + ; + + DROP TABLE IF EXISTS :SCHEMA.ReactionsWithReversibility; + + CREATE UNLOGGED TABLE :SCHEMA.ReactionsWithReversibility AS + SELECT DISTINCT spr.pathway_relationship_id + , tpr.is_reversible + , tpr.reaction_source_id + FROM sres.pathwayrelationship spr + , apidb.pathwayreactionrel prr + , :SCHEMA.PathwayReactions tpr + WHERE prr.pathway_relationship_id = spr.pathway_relationship_id + AND tpr.reaction_id = prr.pathway_reaction_id + ; + + DROP TABLE IF EXISTS :SCHEMA.EnzymeEdges; + + CREATE UNLOGGED TABLE :SCHEMA.EnzymeEdges AS + SELECT DISTINCT nwt.pathway_id AS pathway_id + , nwt.pathway_node_id AS e_id + , nwt.type + , i.associated_node_id AS m1_id + , rri.is_reversible AS ir1 + , o.node_id AS m2_id + , rro.is_reversible AS ir2 + , i.associated_node_id || '_' || o.node_id || '_' || rri.is_reversible || '_' || rro.is_reversible as io + FROM sres.pathwayrelationship i + , sres.pathwayrelationship o + , :SCHEMA.NodesWithTypes nwt + , :SCHEMA.ReactionsWithReversibility rri + , :SCHEMA.ReactionsWithReversibility rro + WHERE i.node_id = nwt.pathway_node_id + AND o.associated_node_id = nwt.pathway_node_id + AND i.pathway_relationship_id = rri.pathway_relationship_id + AND o.pathway_relationship_id = rro.pathway_relationship_id + AND nwt.type = 'enzyme' + AND rri.is_reversible = rro.is_reversible + AND rri.reaction_source_id = rro.reaction_source_id + ; + + DROP TABLE IF EXISTS :SCHEMA.ParentNodes; + + CREATE UNLOGGED TABLE :SCHEMA.ParentNodes AS + WITH AllEnzymeEdges AS ( + SELECT string_agg(io, ',' ORDER BY io) AS all_edges + , e_id + , pathway_id + FROM :SCHEMA.EnzymeEdges + GROUP BY pathway_id + , e_id + ) + , pn as ( + SELECT pathway_id + , all_edges + , string_agg(e_id::varchar, '_' ORDER BY e_id) AS parent + FROM AllEnzymeEdges + GROUP BY pathway_id + , all_edges + HAVING COUNT (*) > 1 + ) + SELECT aee.e_id, pn.* + FROM pn + , AllEnzymeEdges aee + WHERE aee.all_edges = pn.all_edges + ; + + DROP TABLE IF EXISTS :SCHEMA.NodesWithParents; + + CREATE UNLOGGED TABLE :SCHEMA.NodesWithParents AS + SELECT DISTINCT ee.e_id AS pathway_node_id + , pn.parent + , ee.type AS node_type + , ee.pathway_id + FROM :SCHEMA.EnzymeEdges ee + , :SCHEMA.ParentNodes pn + WHERE pn.pathway_id = ee.pathway_id + AND ee.e_id = pn.e_id + ; + + DROP TABLE IF EXISTS :SCHEMA.EnzymeReactions; + + CREATE UNLOGGED TABLE :SCHEMA.EnzymeReactions AS + SELECT DISTINCT pn.PATHWAY_NODE_ID node_id + , pr.SOURCE_ID AS reaction_source_id + FROM sres.pathwaynode pn + , apidb.pathwayreaction pr + , APIDB.PATHWAYREACTIONREL prr + , SRES.PATHWAYRELATIONSHIP prel + , sres.ontologyterm ot + WHERE (prel.NODE_ID = pn.PATHWAY_NODE_ID OR prel.ASSOCIATED_NODE_ID = pn.PATHWAY_NODE_ID) + AND prr.PATHWAY_RELATIONSHIP_ID = prel.PATHWAY_RELATIONSHIP_ID + AND pr.PATHWAY_REACTION_ID = prr.PATHWAY_REACTION_ID + AND ot.name = 'enzyme' + AND pn.PATHWAY_NODE_TYPE_ID = ot.ONTOLOGY_TERM_ID + ; + + DROP TABLE IF EXISTS :SCHEMA.ParentsForEdges; + + CREATE UNLOGGED TABLE :SCHEMA.ParentsForEdges AS + SELECT ee.e_id + , ee.m1_id + , ee.ir1 + , ee.m2_id + , ee.ir2 + , np.parent + FROM :SCHEMA.EnzymeEdges ee + , :SCHEMA.NodesWithParents np + WHERE ee.e_id = np.pathway_node_id + ; + + DROP TABLE IF EXISTS :SCHEMA.PathwayEdges; + + CREATE UNLOGGED TABLE :SCHEMA.PathwayEdges AS + SELECT pa.source_id + , pa.pathway_source + , rel.* + FROM ( + SELECT DISTINCT ee.pathway_id + , coalesce(pe.parent, ee.e_id::varchar) AS source + , ee.m1_id::varchar AS target + , max(ee.ir1) AS is_reversible + FROM :SCHEMA.EnzymeEdges ee + LEFT JOIN :SCHEMA.ParentsForEdges pe ON ee.e_id = pe.e_id + GROUP BY ee.pathway_id, ee.m1_id, coalesce(pe.parent, ee.e_id::varchar) + UNION + SELECT DISTINCT ee.pathway_id + , ee.m2_id::varchar AS source + , coalesce(pe.parent, ee.e_id::varchar) AS target + , max(ee.ir2) AS is_reversible + FROM :SCHEMA.EnzymeEdges ee + LEFT JOIN :SCHEMA.ParentsForEdges pe ON ee.e_id = pe.e_id + GROUP BY ee.pathway_id, ee.m2_id, coalesce(pe.parent, ee.e_id::varchar) + UNION + SELECT pn1.pathway_id + , pr.node_id::varchar AS source + , pr.associated_node_id::varchar AS target + , pr.is_reversible + FROM sres.pathwayrelationship pr + , sres.pathwaynode pn1 + , sres.pathwaynode pn2 + , sres.ontologyterm ot1 + , sres.ontologyterm ot2 + WHERE pr.node_id = pn1.pathway_node_id + AND pr.associated_node_id = pn2.pathway_node_id + AND pn1.pathway_node_type_id = ot1.ontology_term_id + AND pn2.pathway_node_type_id = ot2.ontology_term_id + AND ot1.name != 'enzyme' + AND ot2.name != 'enzyme' + ) rel + , :SCHEMA.PathwayAttributes pa + WHERE pa.pathway_id = rel.pathway_id + ; + + CREATE TABLE :SCHEMA.PathwayNodes AS + SELECT pa.source_id + , pa.pathway_source + , pn.display_label + , pn.x + , pn.y + , pn.width + , pn.height + , pn.cellular_location + , coalesce(pn.url, + CASE WHEN coalesce(type, nodes_with_parents.node_type) = 'enzyme' THEN + CASE + -- CHECK AND FIX + --WHEN REGEXP_LIKE (display_label, '^\d+\.(\d|-)+\.(\d|-)+\.(\d|-)+$') THEN 'https://enzyme.expasy.org/EC/' || display_label + WHEN pa.pathway_source = 'KEGG' THEN 'https://www.genome.jp/dbget-bin/www_bget?rn:' || reaction_source_id + WHEN pa.pathway_source = 'MetaCyc' THEN 'https://metacyc.org/META/new-image?type=REACTION' || chr(38) || 'object=' || reaction_source_id + WHEN pa.pathway_source = 'TrypanoCyc' THEN 'http://vm-trypanocyc.toulouse.inra.fr/TRYPANO/new-image?type=REACTION' || chr(38) || 'object=' || reaction_source_id + WHEN pa.pathway_source = 'LeishCyc' THEN 'http://vm-trypanocyc.toulouse.inra.fr/LEISH/new-image?tyrp=REACTION' || chr(38) || 'object=' || reaction_source_id + ELSE NULL END + ELSE NULL END) AS url + , pn.name + , pn.node_identifier + , nodes_with_parents.pathway_node_id AS id + , nodes_with_parents.parent + , reaction_source_id + , coalesce(type, nodes_with_parents.node_type) AS node_type + , pn.default_structure + FROM + ( SELECT e_id::varchar AS pathway_node_id + , nwp.parent + , type AS node_type + , ee.pathway_id + , er.reaction_source_id + FROM :SCHEMA.EnzymeEdges ee + INNER JOIN :SCHEMA.EnzymeReactions er ON er.node_id = ee.e_id + LEFT JOIN :SCHEMA.NodesWithParents nwp ON ee.e_id = nwp.pathway_node_id + UNION + SELECT nwp.parent + , NULL + , 'nodeOfNodes' + , pathway_id + , NULL + FROM :SCHEMA.NodesWithParents nwp + UNION + SELECT nwt.pathway_node_id::varchar AS pathway_node_id + , NULL AS parent + , nwt.type + , nwt.pathway_id + , NULL + FROM :SCHEMA.NodesWithTypes nwt + WHERE nwt.type != 'enzyme' + ) nodes_with_parents + INNER JOIN :SCHEMA.PathwayAttributes pa ON nodes_with_parents.pathway_id = pa.pathway_id + LEFT JOIN :SCHEMA.NodesWithTypes pn ON nodes_with_parents.pathway_node_id = pn.pathway_node_id::varchar + ; + +drop table if exists :SCHEMA.NodesWithTypes; +drop table if exists :SCHEMA.ReactionsWithReversibility; +drop table if exists :SCHEMA.EnzymeEdges; +drop table if exists :SCHEMA.ParentNodes; +drop table if exists :SCHEMA.NodesWithParents; +drop table if exists :SCHEMA.EnzymeReactions; +drop table if exists :SCHEMA.ParentsForEdges; + + + + + + + diff --git a/Model/lib/psql/webready/global/PathwayReactions.psql b/Model/lib/psql/webready/global/PathwayReactions.psql new file mode 100644 index 000000000..93e6e626c --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayReactions.psql @@ -0,0 +1,142 @@ + drop table if exists :SCHEMA.PR_rep; + + create unlogged table :SCHEMA.PR_rep as + SELECT DISTINCT + pr.PATHWAY_REACTION_ID as reaction_id + , pr.SOURCE_ID as reaction_source_id + , pn.DISPLAY_LABEL as enzyme + , coalesce(ca.compound_name, pc.compound_source_id) as compound + , prel.is_reversible as is_reversible_og + , last_value(prel.is_reversible) OVER (partition by pr.pathway_reaction_id ORDER BY prel.is_reversible ASC ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING ) as is_reversible + , first_value(pc.type) over (partition by pr.pathway_reaction_id, pr.SOURCE_ID, pn.DISPLAY_LABEL, prel.IS_REVERSIBLE, coalesce(pc.chebi_accession, pc.compound_source_id), coalesce(ca.compound_name, pc.compound_source_id) ORDER BY pc.pathway_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as type + FROM + sres.pathway p + , apidb.pathwayreaction pr + , APIDB.PATHWAYREACTIONREL prr + , SRES.PATHWAYNODE pn + , SRES.PATHWAYRELATIONSHIP prel + , SRES.ONTOLOGYTERM ot + , :SCHEMA.PathwayCompounds pc + LEFT JOIN :SCHEMA.CompoundAttributes ca ON pc.chebi_accession = ca.source_id + WHERE p.PATHWAY_ID = prr.PATHWAY_ID + AND pr.PATHWAY_REACTION_ID = prr.PATHWAY_REACTION_ID + AND prr.PATHWAY_RELATIONSHIP_ID = prel.PATHWAY_RELATIONSHIP_ID + AND prel.NODE_ID = pn.PATHWAY_NODE_ID + AND ot.name = 'enzyme' + AND ot.ONTOLOGY_TERM_ID = pn.PATHWAY_NODE_TYPE_ID + AND pc.PATHWAY_ID = p.PATHWAY_ID + AND pc.REACTION_id = pr.PATHWAY_REACTION_ID + ; + + + drop table if exists :SCHEMA.PR_t1; + + create unlogged table :SCHEMA.PR_t1 as + SELECT DISTINCT + pr.PATHWAY_REACTION_ID as reaction_id + , pr.SOURCE_ID as reaction_source_id + , ed.NAME as ext_db_name + , edr.VERSION as ext_db_version + , cast(pn.DISPLAY_LABEL as varchar(20)) as enzyme + , min(PR_rep.is_reversible) as is_reversible + , min(PR_rep.type) as type + , string_agg (pc.type, ',' order by p.pathway_id) as type_list + , coalesce(ca.compound_name, pc.compound_source_id) as compound + , CASE + WHEN coalesce(pc.CHEBI_ACCESSION, pc.compound_source_id) LIKE 'CHEBI%' + THEN '' || coalesce(ca.compound_name, pc.compound_source_id) || '' + ELSE coalesce(pc.chebi_accession, pc.compound_source_id) + END as compound_url + , CASE (replace (replace (ed.name, 'Pathways_', ''), '_RSRC', '')) + WHEN 'KEGG' THEN 'https://www.genome.jp/dbget-bin/www_bget?rn:' || pr.source_id + WHEN 'MetaCyc' THEN 'https://metacyc.org/META/new-image?type=REACTION' || chr(38) || 'object=' || pr.source_id + WHEN 'TrypanoCyc' THEN 'http://vm-trypanocyc.toulouse.inra.fr/TRYPANO/new-image?type=REACTION' || chr(38) || 'object=' || pr.source_id + WHEN 'LeishCyc' THEN 'http://vm-trypanocyc.toulouse.inra.fr/LEISH/new-image?tyrp=REACTION' || chr(38) || 'object=' || pr.source_id + WHEN 'FungiCyc' THEN NULL + END as reaction_url + FROM + sres.pathway p + , apidb.pathwayreaction pr + , APIDB.PATHWAYREACTIONREL prr + , SRES.PATHWAYNODE pn + , SRES.PATHWAYRELATIONSHIP prel + , SRES.EXTERNALDATABASE ed + , SRES.EXTERNALDATABASERELEASE edr + , SRES.ONTOLOGYTERM ot + , :SCHEMA.pr_rep pr_rep + , :SCHEMA.PathwayCompounds pc + LEFT JOIN :SCHEMA.CompoundAttributes ca ON pc.chebi_accession = ca.source_id + WHERE p.PATHWAY_ID = prr.PATHWAY_ID + AND pr.PATHWAY_REACTION_ID = prr.PATHWAY_REACTION_ID + AND prr.PATHWAY_RELATIONSHIP_ID = prel.PATHWAY_RELATIONSHIP_ID + AND prel.NODE_ID = pn.PATHWAY_NODE_ID + AND ot.name = 'enzyme' + AND ot.ONTOLOGY_TERM_ID = pn.PATHWAY_NODE_TYPE_ID + AND pc.EXT_DB_NAME = ed.NAME + AND pc.EXT_DB_VERSION = edr.VERSION + AND ed.EXTERNAL_DATABASE_ID = edr.EXTERNAL_DATABASE_ID + AND pc.PATHWAY_ID = p.PATHWAY_ID + AND pc.REACTION_id = pr.PATHWAY_REACTION_ID + AND PR_rep.reaction_id = pr.pathway_reaction_id + AND PR_rep.reaction_source_id = pr.source_id + AND PR_rep.compound = coalesce(ca.compound_name, pc.compound_source_id) + AND PR_rep.enzyme = pn.DISPLAY_LABEL + AND PR_rep.is_reversible_og = prel.is_reversible + GROUP BY pr.pathway_reaction_id, pr.SOURCE_ID, ed.NAME, edr.VERSION, pn.DISPLAY_LABEL, prel.IS_REVERSIBLE + , coalesce(pc.chebi_accession, pc.compound_source_id) + , coalesce(ca.compound_name, pc.compound_source_id) + ; + + + CREATE TABLE :SCHEMA.PathwayReactions AS + SELECT o.* + , CASE WHEN o.expasy_url IS NOT NULL THEN '' || o.enzyme || '' ELSE o.enzyme END as expasy_html + FROM ( + SELECT i.* + , CASE WHEN i.enzyme like '%.%.%.%' and i.enzyme != '-.-.-.-' + THEN + 'http://enzyme.expasy.org/cgi-bin/enzyme/enzyme-search-ec?field1=' + || ec.ec_number_1 + || CASE ec.ec_number_2 WHEN null THEN null ELSE chr(38) || 'field2=' || ec.ec_number_2 END + || CASE ec.ec_number_3 WHEN null THEN null ELSE chr(38) || 'field3=' || ec.ec_number_3 END + || CASE ec.ec_number_4 WHEN null THEN null ELSE chr(38) || 'field4=' || ec.ec_number_4 END + ELSE reaction_url END as expasy_url + , ec.description as enzyme_description + FROM ( + SELECT + reaction_id + , reaction_source_id + , reaction_url + , ext_db_name + , ext_db_version + , enzyme + , substrates_html || ' ' || sign || ' ' || products_html as equation_html + , substrates_text || ' ' || sign || ' ' || products_text as equation_text + , case when sign = '<=>' then 1 else 0 end as is_reversible + , substrates_text + , products_text + FROM ( + SELECT + reaction_id + , reaction_source_id + , reaction_url + , ext_db_name + , ext_db_version + , enzyme + , (case when (string_agg (case when type_list like '%substrate%' then compound end, ',' order by compound)) = (string_agg (case when type_list like '%product%' then compound end, ',' order by compound)) or is_reversible = 1 then '<=>' else '=>' end) as sign + , string_agg(case when type like '%substrate%' then compound_url end, ' + ' order by compound_url) as substrates_html + , string_agg(case when type like '%substrate%' then compound end, ' + ' order by compound) as substrates_text + , string_agg(case when type like '%product%' then compound_url end, ' + ' order by compound_url) as products_html + , string_agg(case when type like '%product%' then compound end, ' + ' order by compound) as products_text + FROM :SCHEMA.PR_t1 as t1 + GROUP BY reaction_id, reaction_source_id, reaction_url, ext_db_name, ext_db_version, enzyme, is_reversible + ) t2 + ) i + LEFT OUTER JOIN sres.enzymeclass ec ON i.enzyme = ec.ec_number + ) o + ; + + + drop table if exists :SCHEMA.PR_rep; + drop table if exists :SCHEMA.PR_t1; diff --git a/Model/lib/psql/webready/global/PathwayReactions_ix.psql b/Model/lib/psql/webready/global/PathwayReactions_ix.psql new file mode 100644 index 000000000..50dd09d63 --- /dev/null +++ b/Model/lib/psql/webready/global/PathwayReactions_ix.psql @@ -0,0 +1,3 @@ + create index PathRcts_id_ix + on :SCHEMA.PathwayReactions (reaction_id, reaction_source_id, enzyme, expasy_url, ext_db_name) + ; diff --git a/Model/lib/psql/webready/keep/DatasetExampleSourceId.psql b/Model/lib/psql/webready/keep/DatasetExampleSourceId.psql new file mode 100644 index 000000000..4e0137173 --- /dev/null +++ b/Model/lib/psql/webready/keep/DatasetExampleSourceId.psql @@ -0,0 +1,22 @@ +:CREATE_AND_POPULATE + WITH profiles AS ( + SELECT p.source_id, + ga.sequence_id, + d.name, + row_number() over(partition by d.name + order by ga.chromosome_order_num, p.profile_as_string desc) as rn + FROM :SCHEMA.Profile p + INNER JOIN sres.ExternalDatabase d ON p.dataset_name = d.name + LEFT JOIN :SCHEMA.GeneAttributes ga ON p.source_id = ga.source_id + WHERE p.profile_as_string is not null + and ga.org_abbrev = ':ORG_ABBREV' + and p.org_abbrev = ':ORG_ABBREV' + ) + SELECT + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + p.source_id as example_source_id, p.sequence_id, p.name as dataset + FROM profiles p + WHERE p.rn = 1 + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/keep/DatasetExampleSourceId_ix.psql b/Model/lib/psql/webready/keep/DatasetExampleSourceId_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/keep/OrganismAbbreviationBlast.psql b/Model/lib/psql/webready/keep/OrganismAbbreviationBlast.psql new file mode 100644 index 000000000..faf228674 --- /dev/null +++ b/Model/lib/psql/webready/keep/OrganismAbbreviationBlast.psql @@ -0,0 +1,40 @@ +:CREATE_AND_POPULATE + select sub.*, ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, CURRENT_TIMESTAMP as modification_date + from ( + SELECT organism, parent, abbreviation + FROM OrganismAbbreviationWS + UNION + -- all familes for popsets + SELECT DISTINCT family_name_for_files || ' Popset/Genbank Isolates' as organism, '' as parent, + family_name_for_files as abbreviation + FROM apidb.Organism + WHERE family_name_for_files is not null + AND abbrev || '_isolates_genbank_RSRC' IN (SELECT external_db_name as db_name FROM PopsetAttributes) + AND family_name_for_files NOT IN ('Culicosporidae', 'Dubosqiidae', 'Ordosporidae') + UNION + SELECT special.organism, special.parent, special.abbreviation + FROM OrganismTree ot, + ( -- all species and speciesAbbreviations from apidb.Organism where we have ests + SELECT DISTINCT + sp.name as organism, ot.parentTerm as parent, + regexp_replace(org.name_for_filenames, replace(org.strain_abbrev, '/','_'),'') as abbreviation + FROM sres.TaxonName sp, TaxonSpecies ts, apidb.Organism org, OrganismTree ot + WHERE org.taxon_id = ts.taxon_id + AND ts.species_taxon_id = sp.taxon_id + AND sp.name_class = 'scientific name' + AND ot.term = sp.name + AND org.strain_abbrev is not null + AND org.name_for_filenames is not null + AND sp.taxon_id + in (SELECT etn.taxon_id + FROM sres.TaxonName etn + WHERE etn.name in (SELECT organism FROM EstAttributes)) + UNION + SELECT 'Cryptosporidiidae SSU_18srRNA Reference Isolates' as organism, + 'Cryptosporidium' as parent, 'CryptosporidiidaeReference' as abbreviation + ) special + WHERE special.parent = ot.term + ) sub + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/keep/OrganismAbbreviationBlast_ix.psql b/Model/lib/psql/webready/keep/OrganismAbbreviationBlast_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/keep/OrganismAttributes.psql b/Model/lib/psql/webready/keep/OrganismAttributes.psql new file mode 100644 index 000000000..bda5e8a2b --- /dev/null +++ b/Model/lib/psql/webready/keep/OrganismAttributes.psql @@ -0,0 +1,340 @@ + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVDataSourceCount AS + SELECT + taxon_id, + max(CASE WHEN stype = 'organellar' THEN num ELSE null END) as organellar_has, + max(CASE WHEN stype = 'Epitope' THEN num ELSE null END) as Epitope_has, + max(CASE WHEN stype = 'Array' THEN num ELSE null END) as Array_has, + max(CASE WHEN stype = 'HTSIsolate' THEN num ELSE null END) as HTSIsolate_has, + max(CASE WHEN stype = 'Popset' THEN num ELSE null END) as Popset_has + FROM ( + SELECT DISTINCT enas.taxon_id, 'organellar' AS stype, 1 AS num + FROM dots.externalNAsequence enas, SRES.ontologyterm ot + WHERE enas.sequence_ontology_id = ot.ontology_term_id + AND ot.name in( 'mitochondrial_chromosome','apicoplast_chromosome') + and enas.taxon_id = :TAXON_ID + GROUP BY enas.taxon_id + UNION + SELECT distinct ds.taxon_id, 'HTSIsolate' AS stype, 1 AS num + FROM apidb.DataSource ds + WHERE ds.type = 'isolates' AND ds.subtype = 'HTS_SNP' + AND ds.taxon_id = :TAXON_ID + GROUP BY taxon_id + UNION + SELECT distinct ds.taxon_id, 'Popset' AS stype, 1 AS num + FROM apidb.DataSource ds + WHERE ds.subtype = 'sequenceing_types' + AND ds.taxon_id = :TAXON_ID + GROUP BY taxon_id + UNION + SELECT distinct ds.taxon_id, 'Epitope' AS stype, 1 AS num + FROM apidb.DataSource ds + WHERE ds.type = 'epitope' + AND ds.taxon_id = :TAXON_ID + GROUP BY taxon_id + UNION + SELECT distinct ds.taxon_id, 'Array' AS stype, 1 AS num + FROM apidb.DataSource ds + WHERE ds.type = 'transcript_expression' + AND ds.subtype = 'array' + AND ds.taxon_id = :TAXON_ID + GROUP BY taxon_id + ) t + GROUP BY taxon_id + + ; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVOrganismCentromere AS + SELECT distinct s.taxon_id, + case when count(*) > 0 then 1 else 0 end as hasCentromere + FROM DOTS.MISCELLANEOUS f + , sres.ontologyTerm ot + , dots.nasequence s + WHERE ot.ontology_term_id = f.sequence_ontology_id + AND ot.name='centromere' + AND f.na_sequence_id = s.na_sequence_id + AND s.taxon_id = :TAXON_ID + GROUP BY s.taxon_id + + ; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVSequenceCount AS + SELECT + taxon_id, + max(CASE WHEN sequence_type = 'contig' THEN num ELSE null END) as contig_num, + max(CASE WHEN sequence_type = 'supercontig' THEN num ELSE null END) as supercont_num, + max(CASE WHEN sequence_type = 'chromosome' THEN num ELSE null END) as chrom_num + FROM ( + SELECT count(*) as num, sequence_type, taxon_id + FROM :SCHEMA.GenomicSeqAttributes + WHERE is_top_level =1 + and org_abbrev = :ORG_ABBREV + GROUP BY taxon_id, sequence_type + ) t + GROUP BY taxon_id + + ; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVCommunityCount AS + -- SELECT taxon_id, count(*) as communityCount + -- TODO: addd this back + select taxon_id, 0 as communityCount + FROM :SCHEMA.GeneAttributes + where org_abbrev = ':ORG_ABBREV' + --WHERE + --(source_id, project_id) IN ( + --SELECT distinct stable_id, project_name + --userlogins5.mappedComment@prodn.login_comment + --FROM userlogins5.mappedComment + --WHERE is_visible = 1 + --AND comment_target_id = 'gene' + --) + GROUP BY taxon_id + + ; + + + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVProfileCount AS + SELECT ga.taxon_id, + count(distinct(case when p.dataset_type = 'transcript_expression' + and p.dataset_subtype like '%rt_pcr%' + and ga.is_deprecated = 0 + then p.source_id + else '' + end)) as rtPCRCount, + count(distinct(case when p.dataset_type = 'transcript_expression' + and p.dataset_subtype = 'rnaseq' + and ga.is_deprecated = 0 + then p.source_id + else '' + end)) as rnaSeqCount, + count(distinct(case when p.dataset_type = 'transcript_expression' + and p.dataset_subtype = 'array' + and ga.is_deprecated = 0 + then p.source_id + else '' + end)) as geneArrayCount + FROM :SCHEMA.Profile p + RIGHT OUTER JOIN :SCHEMA.GeneAttributes ga ON ga.source_id = p.source_id + WHERE ga.org_abbrev = ':ORG_ABBREV' + and p.org_abbrev = ':ORG_ABBREV' + GROUP BY ga.taxon_id + + ; + + + + -- CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVPopsetCount AS + -- SELECT count(distinct gene.source_id) as popsetCount, sim.taxon_id + -- FROM ( + -- (SELECT i.source_id, nas.taxon_id, nas.source_id as sequence_source_id + -- FROM dots.similarity s, PopsetAttributes i, + -- core.tableinfo t, dots.nasequence nas + -- WHERE s.query_id = i.na_sequence_id + -- AND nas.na_sequence_id = s.subject_id + -- AND t.table_id = s.subject_table_id + -- AND t.table_id = s.query_table_id + -- AND t.name = 'ExternalNASequence' + -- AND s.pvalue_exp <= -10 + -- and nas.taxon_id = :TAXON_ID + -- ) sim LEFT JOIN + -- (SELECT i.source_id, seq.source_id as sequence_id + -- FROM dots.similarity s, PopsetAttributes i, GeneAttributes g, + -- core.tableinfo t, dots.nasequence seq + -- WHERE s.query_id = i.na_sequence_id + -- AND s.subject_id = g.na_sequence_id + -- AND t.table_id = s.subject_table_id + -- AND t.table_id = s.query_table_id + -- AND s.min_subject_start <= g.end_max + -- AND s.max_subject_end >= g.start_min + -- AND g.na_sequence_id = seq.na_sequence_id + -- AND t.name = 'ExternalNASequence' + -- and seq.taxon_id = :TAXON_ID + -- ) gene + -- ON gene.source_id = sim.source_id AND gene.sequence_id = sim.sequence_source_id) + -- GROUP BY sim.taxon_id + + -- ; + + + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVGeneCount AS + SELECT genomestat.taxon_id, + genomestat.project_id, + genomestat.database_version, + genomestat.ncbi_tax_id, + genomestat.Megabps, + -- coalesce(snpCount.ct,0) as snpCount, + coalesce(count(distinct ga.source_id),0) as geneCount, + coalesce(count(distinct case when ga.is_pseudo =1 then ga.source_id else '' end),0) as pseudoGeneCount, + coalesce(count(distinct case when (ga.gene_type ='protein coding' or ga.gene_type ='protein coding gene') then ga.source_id else '' end),0) as codingGeneCount, + coalesce(count(distinct case when (ga.gene_type ='protein coding' or ga.gene_type ='protein coding gene') then '' else ga.source_id end),0) as otherGeneCount, + coalesce(count (distinct (case when ga.is_deprecated = 0 + then cct.gene_source_id + else NULL + end)),0) ChipChipGeneCount , + coalesce(count (distinct (case when ga.is_deprecated = 0 + then pp.source_id + else NULL + end)),0) orthologCount, + coalesce(count (distinct (case when ga.is_deprecated = 0 + then gts.gene_source_id + else NULL + end)),0) goCount, + coalesce(count (distinct (case when ga.is_deprecated = 0 + then tfbs.gene_source_id + else NULL + end)),0) tfbsCount, + coalesce(count (distinct (case when ga.is_deprecated = 0 + then mss.aa_sequence_id + else NULL + end)),0) proteomicsCount, + -- coalesce(count (distinct (case when ga.is_deprecated = 0 + -- then est.source_id + -- else NULL + -- end)),0) estCount, + coalesce(count (distinct (case when (ga.is_deprecated = 0 and ta.ec_numbers is not null) + then ga.source_id + else NULL + end)),0) ecNumberCount + FROM :SCHEMA.GeneAttributes ga + LEFT OUTER JOIN apidb.phylogeneticprofile pp on ga.source_id = pp.source_id and ga.org_abbrev = ':ORG_ABBREV' + LEFT OUTER JOIN :SCHEMA.gotermsummary gts on ga.source_id = gts.gene_source_id and gts.org_abbrev = ':ORG_ABBREV' + LEFT OUTER JOIN :SCHEMA.TFBSGene tfbs on ga.source_id = tfbs.gene_source_id and tfbs.org_abbrev = ':ORG_ABBREV' + LEFT OUTER JOIN :SCHEMA.TranscriptAttributes ta on ta.gene_source_id = ga.source_id and ta.org_abbrev = ':ORG_ABBREV' + LEFT OUTER JOIN apidb.MassSpecSummary mss on ta.aa_sequence_id = mss.aa_sequence_id + LEFT OUTER JOIN :SCHEMA.chipchipTranscript cct on ga.source_id = cct.gene_source_id and cct.org_abbrev = ':ORG_ABBREV' + -- LEFT OUTER JOIN ( + -- SELECT distinct s.gene as source_id + -- FROM EstAlignmentGeneSummary s, EstAttributes e + -- WHERE s.est_gene_overlap_length >= 100 + -- AND s.is_best_alignment in (1) + -- AND s.percent_est_bases_aligned >= 20 + -- AND s.percent_identity >= 90 + -- AND e.best_alignment_count <= 1 + -- AND e.source_id = s.accession + -- GROUP by s.gene HAVING count(*) >= 1 + -- ) est ON ga.source_id = est.source_id + RIGHT OUTER JOIN ( + SELECT project_id, taxon_id, + max(database_version) as database_version, + CASE WHEN ncbi_tax_id > 9000000000 THEN NULL + ELSE ncbi_tax_id + END ncbi_tax_id, + to_char(sum(length)/1000000,'9999.99') as megabps + FROM :SCHEMA.GenomicSeqAttributes + WHERE is_top_level = 1 + AND org_abbrev = ':ORG_ABBREV' + GROUP BY project_ID, taxon_id, ncbi_tax_id + ) genomestat ON genomestat.taxon_id = ga.taxon_id + -- LEFT OUTER JOIN ( + -- SELECT count(distinct ga.source_id) as ct, ga.taxon_id + -- FROM GeneAttributes ga, SnpAttributes sf + -- WHERE sf.gene_source_id = ga.source_id + -- AND ga.is_deprecated = 0 + -- GROUP BY ga.taxon_id + -- ) snpCount ON ga.taxon_id = snpCount.taxon_id + GROUP BY genomestat.taxon_id, + genomestat.project_id, + genomestat.database_version, + genomestat.ncbi_tax_id, + genomestat.Megabps + -- snpCount.ct + + ; + +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + oa.*, tn2.name as species, t.ncbi_tax_id as species_ncbi_tax_id + , CASE WHEN ltrim(replace(oa.organism_name, tn2.name, ''))= oa.organism_name + THEN strain_abbrev + ELSE ltrim(replace(oa.organism_name, tn2.name, '')) END AS strain + FROM ( + SELECT o.project_name as project_id, + case when t.ncbi_tax_id > 10000000 + -- then 'TMPTX_' || round(t.ncbi_tax_id / 10000000) || '_' || + -- mod(t.ncbi_tax_id, 10000000) -- e.g. "TMPTX_930_1" + -- then 'TMPTX_' || t.ncbi_tax_id -- all the many digits + then 'TMPTX_' || o.public_abbrev + else 'NCBITAXON_' || t.ncbi_tax_id + end as source_id, + o.abbrev as internal_abbrev, + o.public_abbrev, + o.orthomcl_abbrev, + o.family_name_for_files, + tn.name as organism_name, + o.genome_source, + o.strain_abbrev, + o.is_annotated_genome, + o.is_reference_strain, + o.is_family_representative, + o.name_for_filenames, + o.taxon_id as component_taxon_id, + gc.database_version, + gc.megabps as megabps, + gc.ncbi_tax_id as ncbi_tax_id, + gc.snpCount as snpCount, + gc.geneCount as geneCount, + gc.pseudoGeneCount as pseudoGeneCount, + gc.codingGeneCount as codingGeneCount, + gc.otherGeneCount as otherGeneCount, + gc.ChipChipGeneCount as ChipChipGeneCount, + gc.orthologCount as orthologCount, + gc.goCount as goCount, + gc.tfbsCount as tfbsCount, + gc.proteomicsCount as proteomicsCount, + gc.estCount as estCount, + gc.ecNumberCount as ecNumberCount, + cast(coalesce(dsc.Organellar_Has, 0) as NUMERIC(1)) as isOrganellar, + cast(coalesce(dsc.HTSIsolate_Has, 0) as NUMERIC(1)) as hasHTSIsolate, + cast(coalesce(dsc.Popset_Has, 0) as NUMERIC(1)) as hasPopset, + cast(coalesce(dsc.Epitope_Has, 0) as NUMERIC(1)) as hasEpitope, + cast(coalesce(dsc.Array_Has, 0) as NUMERIC(1)) as hasArray, + coalesce(oc.hasCentromere, 0) as hasCentromere, + coalesce(sc.contig_num, 0) as contigCount, + coalesce(sc.supercont_num, 0) as supercontigCount, + coalesce(sc.chrom_num, 0) as chromosomeCount, + coalesce(cc.communityCount, 0) as communityCount, + --coalesce(psc.popsetCount, 0) as popsetCount, + coalesce(pc.geneArrayCount, 0) as arrayGeneCount, + coalesce(pc.rnaSeqCount, 0) as rnaSeqCount, + coalesce(pc.rtPCRCount, 0) as rtPCRCount, + coalesce(ta.avg_transcript_length, 0) as avg_transcript_length + FROM apidb.Organism o + INNER JOIN sres.TaxonName tn ON tn.taxon_id = o.taxon_id and o.taxon_id = :TAXON_ID + INNER JOIN sres.Taxon t ON t.taxon_id = tn.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVDataSourceCount dsc ON o.taxon_id = dsc.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVOrganismCentromere oc ON o.taxon_id = oc.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVSequenceCount sc ON o.taxon_id = sc.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVCommunityCount cc ON o.taxon_id = cc.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVGeneCount gc ON o.taxon_id = gc.taxon_id + --LEFT JOIN :SCHEMA.:ORG_ABBREVpopsetCount psc ON o.taxon_id = psc.taxon_id + LEFT JOIN :SCHEMA.:ORG_ABBREVprofileCount pc ON o.taxon_id = pc.taxon_id + LEFT JOIN ( + SELECT taxon_id, round(avg(length),1) as avg_transcript_length + FROM :SCHEMA.TranscriptAttributes + where org_abbrev = ':ORG_ABBREV' + GROUP by taxon_id + ) ta ON o.taxon_id = ta.taxon_id + WHERE tn.name_class = 'scientific name' + ) oa, + :SCHEMA.TaxonSpecies ts, + sres.taxon t, + sres.taxonname tn2 + WHERE oa.component_taxon_id = ts.taxon_id + AND ts.species_taxon_id = t.taxon_id + AND ts.species_taxon_id = tn2.taxon_id + AND tn2.name_class = 'scientific name' + AND o.taxon_id = :TAXON_ID + AND ts.org_abbrev = ':ORG_ABBREV' + + +:DECLARE_PARTITION; + +drop table :SCHEMA.:ORG_ABBREVDataSourceCount; +drop table :SCHEMA.:ORG_ABBREVOrganismCentromere; +drop table :SCHEMA.:ORG_ABBREVProfileCount; +--drop table :SCHEMA.:ORG_ABBREVPopsetCount; +drop table :SCHEMA.:ORG_ABBREVGeneCount; +drop table :SCHEMA.:ORG_ABBREVSequenceCount; +drop table :SCHEMA.:ORG_ABBREVCommunityCount; diff --git a/Model/lib/psql/webready/keep/OrganismAttributes_ix.psql b/Model/lib/psql/webready/keep/OrganismAttributes_ix.psql new file mode 100644 index 000000000..d1f420c0d --- /dev/null +++ b/Model/lib/psql/webready/keep/OrganismAttributes_ix.psql @@ -0,0 +1,3 @@ +create unique Organism_sourceId_idx ON :SCHEMA.OrganismAttributes (source_id) + ; + diff --git a/Model/lib/psql/webready/keep/Profile.psql b/Model/lib/psql/webready/keep/Profile.psql new file mode 100644 index 000000000..7db9b02f9 --- /dev/null +++ b/Model/lib/psql/webready/keep/Profile.psql @@ -0,0 +1,800 @@ +:CREATE_AND_POPULATE + + + CREATE TABLE Profile ( + DATASET_NAME VARCHAR(200), + DATASET_TYPE VARCHAR(50), + DATASET_SUBTYPE VARCHAR(50), + PROFILE_TYPE VARCHAR(30), + NODE_TYPE VARCHAR(100), + SOURCE_ID VARCHAR(500), + PROFILE_STUDY_ID NUMERIC(7), + PROFILE_SET_NAME VARCHAR(400), + PROFILE_SET_SUFFIX VARCHAR(50), + PROFILE_AS_STRING VARCHAR(4000), + MAX_VALUE NUMERIC, + MIN_VALUE NUMERIC, + MAX_TIMEPOINT VARCHAR(200), + MIN_TIMEPOINT VARCHAR(200) + ) + + +:DECLARE_PARTITION; + + + + DO $$ + DECLARE + ctrows NUMERIC := 0; + commit_after NUMERIC := 10000; + pf_rows record; + BEGIN + FOR pf_rows IN ( + SELECT + ds.name as dataset_name, ds.type as dataset_type, + ds.subtype as dataset_subtype, profile.profile_type, profile.node_type, + profile.source_id, profile.node_set_id as profile_study_id, + ps.name as profile_set_name, + cast(case + when regexp_replace(ps.name, '\[.+\]', '') like '% - %' + then regexp_replace(regexp_replace(ps.name, ' *\[.+\]', ''), '.+ - ', '') + -- special cases for legacy datasets + when regexp_replace(ps.name, '\[.+\]', '') like 'DeRisi%' + then regexp_replace(regexp_replace(ps.name, '\[.+\]', ''), 'DeRisi ', '') + when regexp_replace(ps.name, '\[.+\]', '') like 'winzeler_cc_%' + then regexp_replace(regexp_replace(ps.name, '\[.+\]', ''), 'winzeler_cc_', '') + when regexp_replace(ps.name, '\[.+\]', '') like 'Llinas RT transcription and decay %' + then regexp_replace(regexp_replace(ps.name, '\[.+\]', ''), 'Llinas RT transcription and decay ', '') + when regexp_replace(ps.name, '\[.+\]', '') like 'T.brucei paired end RNA-Seqdata from Horn%' + then regexp_replace( + regexp_replace( + regexp_replace( + regexp_replace( + regexp_replace(ps.name, '\[.+ nonunique\]', ''), + '\[.+ unique\]', ' - unique'), + '\[.+\]', ''), + 'aligned with cds coordinates ', 'cds coordinates'), + 'T.brucei paired end RNA-Seqdata from Horn ', '') + else null + end as varchar(50) + ) as profile_set_suffix, + CASE WHEN replace(profile.profile_as_string, 'NA' || CHR(9), '') = 'NA' THEN null ELSE profile.profile_as_string END as profile_as_string, + profile.max_value, profile.min_value, + profile.max_timepoint, profile.min_timepoint + FROM apidb.DataSource ds, sres.ExternalDatabase d, + sres.ExternalDatabaseRelease r, study.NodeSet ps, + ( + -- gene profiles + SELECT gene_profile.node_set_id, ga.source_id, gene_profile.profile_type, gene_profile.node_type, + gene_profile.profile_as_string, gene_profile.max_value, gene_profile.min_value, + gene_profile.max_timepoint, gene_profile.min_timepoint + FROM GeneAttributes ga, + ( + SELECT * FROM ( + WITH result AS ( + SELECT na_feature_id, protocol_app_node_id, max(value) as value + FROM results.NAFEATUREHOSTRESPONSE + GROUP BY na_feature_id, protocol_app_node_id + ), min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.value) over w1 as max_value, + first_value(result.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + from study.protocolappnode pan, study.NodeNodeSet sl, study.NodeSet s, result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.PROTOCOLAPPNODE pan, study.NodeNodeSet sl, study.NodeSet s, min_max, result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t1 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.mean_phenotype) over w1 as max_value, + first_value(result.mean_phenotype) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.PROTOCOLAPPNODE pan, study.NodeNodeSet sl, study.NodeSet s, apidb.CrisprPhenotype result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.mean_phenotype ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.mean_phenotype::numeric, 2)::varchar,'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.CrisprPhenotype result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND result.na_feature_id = min_max.na_feature_id + AND sl.node_set_id = min_max.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t2 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.score) over w1 as max_value, + first_value(result.score) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.PROTOCOLAPPNODE pan, study.NodeNodeSet sl, study.NodeSet s, apidb.PhenotypeScore result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.score ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.score::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.PhenotypeScore result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t3 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.relative_growth_rate) over w1 as max_value, + first_value(result.relative_growth_rate) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.PhenotypeGrowthRate result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.relative_growth_rate ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.relative_growth_rate::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.PhenotypeGrowthRate result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t4 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.value) over w1 as max_value, + first_value(result.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t5 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.percentile_channel1) over w1 as max_value, + first_value(result.percentile_channel1) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.percentile_channel1 ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'channel1_percentiles' as profile_type, s.node_type, + string_agg(coalesce(round(result.percentile_channel1::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t6 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.percentile_channel2) over w1 as max_value, + first_value(result.percentile_channel2) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.percentile_channel2 ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'channel2_percentiles' as profile_type, s.node_type, + string_agg(coalesce(round(result.percentile_channel2::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t7 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.standard_error) over w1 as max_value, + first_value(result.standard_error) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.standard_error ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'standard_error' as profile_type, s.node_type, + string_agg(coalesce(round(result.standard_error::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, results.NaFeatureExpression result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t8 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.pvalue) over w1 as max_value, + first_value(result.pvalue) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.pvalue ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'pvalue' as profile_type, s.node_type, + string_agg(coalesce(round(result.pvalue::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t9 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.amplitude) over w1 as max_value, + first_value(result.amplitude) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.PROTOCOLAPPNODE pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.amplitude ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'amplitude' as profile_type, s.node_type, + string_agg(coalesce(round(result.amplitude::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t10 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.period) over w1 as max_value, + first_value(result.period) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.period ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'period' as profile_type, s.node_type, + string_agg(coalesce(round(result.period::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.NaFeatureMetacycle result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t11 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.probability_mean) over w1 as max_value, + first_value(result.probability_mean) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.probability_mean ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'probability_mean' as profile_type, s.node_type, + string_agg(coalesce(round(result.probability_mean::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t12 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.sd) over w1 as max_value, + first_value(result.sd) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.sd ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'sd' as profile_type, s.node_type, + string_agg(coalesce(round(result.sd::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t13 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.lower_CI) over w1 as max_value, + first_value(result.lower_CI) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.lower_CI ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'lower_CI' as profile_type, s.node_type, + string_agg(coalesce(round(result.lower_CI::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t14 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.na_feature_id, + last_value(result.upper_CI) over w1 as max_value, + first_value(result.upper_CI) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.na_feature_id + ORDER BY result.upper_CI ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.na_feature_id, 'upper_CI' as profile_type, s.node_type, + string_agg(coalesce(round(result.upper_CI::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.LopitResults result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.na_feature_id = result.na_feature_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t15 +-- TO FIX +-- UNION ALL +-- SELECT * FROM ( +-- WITH min_max AS ( +-- SELECT DISTINCT sl.node_set_id, result.na_feature_id, +-- null as max_value, +-- null as min_value, +-- last_value(pan.name) over w1 as max_timepoint, +-- first_value(pan.name) over w1 as min_timepoint +-- FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.HaplotypeResult result +-- WHERE result.protocol_app_node_id = sl.protocol_app_node_id +-- AND result.protocol_app_node_id = pan.protocol_app_node_id +-- AND sl.node_set_id = s.node_set_id +-- WINDOW w1 AS ( +-- PARTITION BY sl.node_set_id, result.na_feature_id +-- ORDER BY result.value ASC +-- ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING +-- ) +-- ) +-- SELECT sl.node_set_id, result.na_feature_id, 'values' as profile_type, s.node_type, +-- string_agg(coalesce(result.value, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, +-- min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint +-- FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.HaplotypeResult result, min_max +-- WHERE result.protocol_app_node_id = sl.protocol_app_node_id +-- AND result.protocol_app_node_id = pan.protocol_app_node_id +-- AND sl.node_set_id = s.node_set_id +-- AND min_max.na_feature_id = result.na_feature_id +-- AND min_max.node_set_id = sl.node_set_id +-- GROUP BY sl.node_set_id, result.na_feature_id, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type +-- ) t16 + ) gene_profile + WHERE ga.na_feature_id = gene_profile.na_feature_id + UNION ALL + -- compound profiles + SELECT compound_profile.node_set_id, + case + WHEN compound_profile.isotopomer IS NOT NULL + THEN ca.source_id || '|' || compound_profile.isotopomer + WHEN compound_profile.mass IS NOT NULL + THEN ca.source_id || '|' || compound_profile.mass || '|' || compound_profile.retention_time + ELSE ca.source_id + END as source_id, + compound_profile.profile_type, compound_profile.node_type, + compound_profile.profile_as_string, compound_profile.max_value, compound_profile.min_value, + compound_profile.max_timepoint, compound_profile.min_timepoint + FROM CompoundAttributes ca, + ( + SELECT * FROM ( + WITH result AS ( + SELECT max(value) as value, compound_id, protocol_app_node_id, isotopomer + FROM results.CompoundMassSpec + GROUP BY compound_id, protocol_app_node_id, isotopomer + ) , min_max AS ( + SELECT DISTINCT sl.node_set_id, result.compound_id, result.isotopomer, + last_value(result.value) over w1 as max_value, + first_value(result.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.compound_id, result.isotopomer + ORDER BY result.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.compound_id, result.isotopomer, 'values' as profile_type, s.node_type, + NULL::numeric as compound_peaks_id, NULL::numeric as mass, NULL::numeric as retention_time, + string_agg(coalesce(round(result.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, result, min_max + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.compound_id = result.compound_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.compound_id, result.isotopomer, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t1 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, cpc.compound_id, + last_value(cmsr.value) over w1 as max_value, + first_value(cmsr.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.CompoundMassSpecResult cmsr, apidb.Compoundpeaks cp, + apidb.CompoundPeaksChebi cpc + WHERE cmsr.protocol_app_node_id = sl.protocol_app_node_id + AND cmsr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND cp.compound_peaks_id = cmsr.compound_peaks_id + AND cpc.compound_peaks_id = cp.compound_peaks_id + AND pan.name like '%mean%' + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, cpc.compound_id, cpc.isotopomer, cpc.compound_peaks_id, cp.mass, cp.retention_time + ORDER BY cmsr.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, cpc.compound_id, cpc.isotopomer, 'values' as profile_type, s.node_type, + cpc.compound_peaks_id, cp.mass, cp.retention_time, + string_agg(coalesce(round(cmsr.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.CompoundMassSpecResult cmsr, apidb.Compoundpeaks cp, + apidb.CompoundPeaksChebi cpc, min_max + WHERE cmsr.protocol_app_node_id = sl.protocol_app_node_id + AND cmsr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND cp.compound_peaks_id = cmsr.compound_peaks_id + AND cpc.compound_peaks_id = cp.compound_peaks_id + AND pan.name like '%mean%' + AND min_max.compound_id = cpc.compound_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, cpc.compound_id, cpc.isotopomer, cpc.compound_peaks_id, cp.mass, cp.retention_time, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t2 + UNION ALL + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, cpc.compound_id, + last_value(cmsr.percentile) over w1 as max_value, + first_value(cmsr.percentile) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.CompoundMassSpecResult cmsr, apidb.Compoundpeaks cp, + apidb.CompoundPeaksChebi cpc + WHERE cmsr.protocol_app_node_id = sl.protocol_app_node_id + AND cmsr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND cp.compound_peaks_id = cmsr.compound_peaks_id + AND cpc.compound_peaks_id = cp.compound_peaks_id + AND pan.name like '%mean%' + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, cpc.compound_id, cpc.isotopomer, cpc.compound_peaks_id, cp.mass, cp.retention_time + ORDER BY cmsr.percentile ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, cpc.compound_id, cpc.isotopomer, 'percentiles' as profile_type, s.node_type, + cpc.compound_peaks_id, cp.mass, cp.retention_time, + string_agg(coalesce(round(cmsr.percentile::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.CompoundMassSpecResult cmsr, apidb.Compoundpeaks cp, + apidb.CompoundPeaksChebi cpc, min_max + WHERE cmsr.protocol_app_node_id = sl.protocol_app_node_id + AND cmsr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND cp.compound_peaks_id = cmsr.compound_peaks_id + AND cpc.compound_peaks_id = cp.compound_peaks_id + AND pan.name like '%mean%' + AND min_max.compound_id = cpc.compound_id + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, cpc.compound_id, cpc.isotopomer, cpc.compound_peaks_id, cp.mass, cp.retention_time, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t3 + ) compound_profile + WHERE ca.id = compound_profile.compound_id + UNION ALL + -- OntologyTermResult + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, ot.name, + last_value(otr.value) over w1 as max_value, + first_value(otr.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.OntologyTermResult otr, sres.OntologyTerm ot + WHERE ot.ontology_term_id = otr.ontology_term_id + AND otr.protocol_app_node_id = sl.protocol_app_node_id + AND otr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, ot.name + ORDER BY otr.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, ot.name as source_id, 'value' as profile_type, s.node_type, + string_agg(coalesce(round(otr.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, + apidb.OntologyTermResult otr, sres.OntologyTerm ot, min_max + WHERE ot.ontology_term_id = otr.ontology_term_id + AND otr.protocol_app_node_id = sl.protocol_app_node_id + AND otr.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.name = ot.name + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, ot.name, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t1 + UNION ALL + -- SubjectResult + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.subject, + last_value(result.value) over w1 as max_value, + first_value(result.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.SubjectResult result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.subject + ORDER BY result.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.subject as source_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, min_max, apidb.SubjectResult result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.subject = result.subject + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.subject, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t2 + UNION ALL + -- wgcna eigengene results + SELECT * FROM ( + WITH min_max AS ( + SELECT DISTINCT sl.node_set_id, result.module_name, + last_value(result.value) over w1 as max_value, + first_value(result.value) over w1 as min_value, + last_value(pan.name) over w1 as max_timepoint, + first_value(pan.name) over w1 as min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, apidb.EigenGeneWgcnaResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + WINDOW w1 AS ( + PARTITION BY sl.node_set_id, result.module_name + ORDER BY result.value ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + ) + SELECT sl.node_set_id, result.module_name as source_id, 'values' as profile_type, s.node_type, + string_agg(coalesce(round(result.value::numeric, 2)::varchar, 'NA'), chr(9) order by pan.node_order_num) as profile_as_string, + min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint + FROM study.ProtocolAppNode pan, study.NodeNodeSet sl, study.NodeSet s, min_max, apidb.EigenGeneWgcnaResults result + WHERE result.protocol_app_node_id = sl.protocol_app_node_id + AND result.protocol_app_node_id = pan.protocol_app_node_id + AND sl.node_set_id = s.node_set_id + AND min_max.module_name = result.module_name + AND min_max.node_set_id = sl.node_set_id + GROUP BY sl.node_set_id, result.module_name, min_max.max_value, min_max.min_value, min_max.max_timepoint, min_max.min_timepoint, s.node_type + ) t3 + ) profile + WHERE ds.name = d.name + AND ds.version = r.version + AND d.external_database_id = r.external_database_id + AND profile.node_set_id = ps.node_set_id + AND ps.external_database_release_id = r.external_database_release_id + ) + LOOP + ctrows := ctrows + 1; + INSERT INTO Profile + (DATASET_NAME, DATASET_TYPE, DATASET_SUBTYPE, PROFILE_TYPE, NODE_TYPE, SOURCE_ID, PROFILE_STUDY_ID, PROFILE_SET_NAME, + PROFILE_SET_SUFFIX, PROFILE_AS_STRING, MAX_VALUE, MIN_VALUE, MAX_TIMEPOINT, MIN_TIMEPOINT) + VALUES + (pf_rows.DATASET_NAME, pf_rows.DATASET_TYPE, pf_rows.DATASET_SUBTYPE, pf_rows.PROFILE_TYPE, pf_rows.NODE_TYPE, pf_rows.SOURCE_ID, pf_rows.PROFILE_STUDY_ID, pf_rows.PROFILE_SET_NAME, + pf_rows.PROFILE_SET_SUFFIX, pf_rows.PROFILE_AS_STRING, pf_rows.MAX_VALUE, pf_rows.MIN_VALUE, pf_rows.MAX_TIMEPOINT, pf_rows.MIN_TIMEPOINT); + IF ctrows >= commit_after THEN + COMMIT; + ctrows := 0; + END IF; + END LOOP; + commit; + END; + $$ LANGUAGE PLPGSQL; + + ; + + + + UPDATE Profile + SET dataset_name = 'tbruTREU927_Rijo_Circadian_Regulation_rnaSeq_RSRC' + WHERE dataset_name= 'tbruTREU927_RNASeq_Rijo_Circadian_Regulation_RSRC' + + ; + diff --git a/Model/lib/psql/webready/keep/ProfileSamples.psql b/Model/lib/psql/webready/keep/ProfileSamples.psql new file mode 100644 index 000000000..e2ed3d0b2 --- /dev/null +++ b/Model/lib/psql/webready/keep/ProfileSamples.psql @@ -0,0 +1,167 @@ +:CREATE_AND_POPULATE + + + CREATE TABLE ProfileSamples AS + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id as study_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, results.nafeatureexpression r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + AND pt.profile_type not in ('pvalue', 'period', 'amplitude', 'probability_mean','sd','lower_CI','upper_CI','correlation_coefficient') + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.NAFeatureMetacycle r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + AND pt.profile_type not in ('values', 'channel1_percentiles', 'channel2_percentiles', 'standard_error') + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.LopitResults r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + AND pt.profile_type not in ('values', 'channel1_percentiles', 'channel2_percentiles', 'standard_error') + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, results.compoundMassSpec r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, APIDB.compoundmassspecresult r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + and pan.name like '%mean%' + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.ontologytermresult r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, results.nafeaturehostresponse r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.crisprphenotype r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.phenotypescore r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.phenotypegrowthrate r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.subjectresult r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + UNION + SELECT DISTINCT s.name AS study_name, pt.node_type, s.node_set_id, + REGEXP_REPLACE(REGEXP_REPLACE (pan.name, ' \[.+\] \(.+\)', ''), + ' \(.+\)', '')AS protocol_app_node_name, + pan.protocol_app_node_id, pan.node_order_num, pt.profile_type, + pt.dataset_name, pt.dataset_type, pt.dataset_subtype, + pt.profile_set_suffix + FROM profileType pt, study.nodeSet s, study.nodeNodeSet sl, + study.protocolAppNode pan, apidb.EigenGeneWgcnaResults r + WHERE pt.profile_study_id = s.node_set_id + AND sl.node_set_id = s.node_set_id + AND sl.protocol_app_node_id = pan.protocol_app_node_id + AND pan.protocol_app_node_id =r.protocol_app_node_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/keep/ProfileSamples_ix.psql b/Model/lib/psql/webready/keep/ProfileSamples_ix.psql new file mode 100644 index 000000000..d59cf5e21 --- /dev/null +++ b/Model/lib/psql/webready/keep/ProfileSamples_ix.psql @@ -0,0 +1,14 @@ + create index psamp_ix + on :SCHEMA.ProfileSamples + (dataset_name, profile_type, study_id, node_order_num, + protocol_app_node_id, profile_set_suffix, study_name, + node_type, protocol_app_node_name) + ; + + create index psampstdy_ix + on :SCHEMA.ProfileSamples + (study_name, node_type, profile_type, node_order_num, + protocol_app_node_id, profile_set_suffix, study_id, + protocol_app_node_name, dataset_name) + ; + diff --git a/Model/lib/psql/webready/keep/ProfileType.psql b/Model/lib/psql/webready/keep/ProfileType.psql new file mode 100644 index 000000000..0f0f83616 --- /dev/null +++ b/Model/lib/psql/webready/keep/ProfileType.psql @@ -0,0 +1,13 @@ +:CREATE_AND_POPULATE + + + CREATE TABLE ProfileType AS + SELECT DISTINCT dataset_name, profile_study_id, profile_set_name, profile_set_suffix, node_type, profile_type, + dataset_type, dataset_subtype + FROM profile + WHERE profile_as_string IS NOT NULL + ORDER BY dataset_name, profile_set_name, profile_type + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/keep/ProfileType_ix.psql b/Model/lib/psql/webready/keep/ProfileType_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/keep/Profile_ix.psql b/Model/lib/psql/webready/keep/Profile_ix.psql new file mode 100644 index 000000000..46b746ade --- /dev/null +++ b/Model/lib/psql/webready/keep/Profile_ix.psql @@ -0,0 +1,12 @@ + create exprof_idx + on :SCHEMA.Profile (source_id, profile_type, profile_set_name) + ; + + create profset_idx + on :SCHEMA.Profile (profile_set_name, profile_type) + ; + + create srcdset_idx + on :SCHEMA.Profile (source_id, dataset_subtype, dataset_type) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript.psql b/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript.psql new file mode 100644 index 000000000..a29541ffe --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript.psql @@ -0,0 +1,41 @@ + :CREATE_AND_POPULATE + SELECT DISTINCT ta.source_id, ta.gene_source_id, sr.protocol_app_node_id, + ta.project_id, ta.org_abbrev, current_timestamp as modification_date, + CASE + WHEN ta.is_reversed = 0 + THEN round(abs(ta.start_min - (((sr.segment_end - sr.segment_start) / 2) + sr.segment_start)),0) + ELSE round(abs(ta.end_max - (((sr.segment_end - sr.segment_start) / 2) + sr.segment_start)),0) + END as distance, + CASE + WHEN /* distance > 0 */ + CASE WHEN ta.is_reversed = 0 + THEN ta.start_min - (((sr.segment_end - sr.segment_start) / 2) + sr.segment_start) + ELSE ta.end_max - (((sr.segment_end - sr.segment_start) / 2) + sr.segment_start) + END > 0 + THEN + CASE + WHEN ta.is_reversed = 0 + THEN '-' + ELSE '+' + END + ELSE + CASE + WHEN ta.is_reversed = 1 + THEN '-' + ELSE '+' + END + END as direction, + sr.score1 as score + FROM :SCHEMA.TranscriptAttributes ta, + Results.segmentresult sr, + Study.nodenodeset sl, + Study.nodeset s + WHERE sr.na_sequence_id = ta.na_sequence_id + AND s.node_set_id = sl.node_set_id + AND sl.protocol_app_node_id = sr.protocol_app_node_id + AND lower(s.name) like '%chip%peaks' + AND ( (ta.is_reversed = 0 and abs((((sr.segment_end - sr.segment_start) / 2) + sr.segment_start) - ta.start_min) <= 3000) + or (ta.is_reversed = 1 and abs((((sr.segment_end - sr.segment_start) / 2) + sr.segment_start) - ta.end_max) <= 3000) ) + AND ta.org_abbrev = ':ORG_ABBREV' +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript_ix.psql b/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript_ix.psql new file mode 100644 index 000000000..11d4fa447 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ChIPchipTranscript_ix.psql @@ -0,0 +1,3 @@ + create index chpgene_geneid_idx ON :SCHEMA.ChIPchipTranscript (protocol_app_node_id, source_id, gene_source_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers.psql b/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers.psql new file mode 100644 index 000000000..eafe7652b --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers.psql @@ -0,0 +1,22 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT + ta.project_id + , ta.org_abbrev + , current_timestamp as modification_date + , ta.na_sequence_id + , ta.chromosome + , ccn.chr_copy_number AS ploidy + , io.input_pan_id + , io.output_pan_id + FROM apidb.ChrCopyNumber ccn + , :SCHEMA.TranscriptAttributes ta + , :SCHEMA.PANIo io + WHERE ta.na_sequence_id = ccn.na_sequence_id + AND ta.chromosome IS NOT NULL + AND ccn.protocol_app_node_id = io.output_pan_id + and ta.org_abbrev = ':ORG_ABBREV' + and io.org_abbrev = ':ORG_ABBREV' + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers_ix.psql b/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers_ix.psql new file mode 100644 index 000000000..c5a7eff54 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ChrCopyNumbers_ix.psql @@ -0,0 +1,9 @@ + CREATE index ChrCN_ix + ON :SCHEMA.ChrCopyNumbers (input_pan_id, na_sequence_id) + ; + + + CREATE index ChrCN_output + ON :SCHEMA.ChrCopyNumbers (output_pan_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/CodingSequence.psql b/Model/lib/psql/webready/orgSpecific/CodingSequence.psql new file mode 100644 index 000000000..a1682717e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/CodingSequence.psql @@ -0,0 +1,11 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + ta.source_id, + SUBSTR(sns.sequence, tf.translation_start::INTEGER, + tf.translation_stop::INTEGER - tf.translation_start::INTEGER + 1) as sequence + FROM :SCHEMA.TranscriptAttributes ta, dots.SplicedNaSequence sns, dots.TranslatedAaFeature tf + WHERE ta.source_id = sns.source_id + AND ta.na_feature_id = tf.na_feature_id + AND ta.org_abbrev = ':ORG_ABBREV' +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/CodingSequence_ix.psql b/Model/lib/psql/webready/orgSpecific/CodingSequence_ix.psql new file mode 100644 index 000000000..e580f236f --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/CodingSequence_ix.psql @@ -0,0 +1,3 @@ + create index CodSeq_ix on :SCHEMA.CodingSequence (source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/EqtlSpan.psql b/Model/lib/psql/webready/orgSpecific/EqtlSpan.psql new file mode 100644 index 000000000..fdad100be --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EqtlSpan.psql @@ -0,0 +1,24 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + gene_source_id, haplotype_block_name as hapblock_id, sequence_id, + start_min, end_max, start_max, end_min, + max(score) as lod_score, organism + FROM (SELECT ga.gene_source_id, ga.project_id, gls.haplotype_block_name, + ens.source_id as sequence_id, nl.start_min, nl.end_max, nl.start_max, nl.end_min, + gls.lod_score_mant * power(10::double precision, gls.lod_score_exp) as score, + replace (ga.organism, ' ', '+') as organism + FROM dots.ChromosomeElementFeature cef, apidb.NAFeatureHaploblock gls, + dots.ExternalNaSequence ens, dots.NaLocation nl, :SCHEMA.TranscriptAttributes ga + WHERE gls.na_feature_id = ga.gene_na_feature_id + AND cef.name = gls.haplotype_block_name + AND nl.na_feature_id = cef.na_feature_id + AND cef.na_sequence_id = ens.na_sequence_id + AND ga.org_abbrev = ':ORG_ABBREV' + AND (gls.lod_score_mant * power(10::double precision, gls.lod_score_exp)) >= 1.5 + ) t + GROUP BY gene_source_id, project_id, sequence_id, haplotype_block_name, + start_min, end_max, start_max, end_min, organism + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/EqtlSpan_ix.psql b/Model/lib/psql/webready/orgSpecific/EqtlSpan_ix.psql new file mode 100644 index 000000000..9238b045e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EqtlSpan_ix.psql @@ -0,0 +1,4 @@ + create index eqtlSpan_ix + on :SCHEMA.eqtlSpan (gene_source_id, project_id, hapblock_id, sequence_id, start_min, end_max, start_max, end_min, organism, lod_score) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary.psql b/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary.psql new file mode 100644 index 000000000..c4546c4fe --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary.psql @@ -0,0 +1,74 @@ + drop table if exists :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp AS + SELECT ba.blat_alignment_id, ba.query_na_sequence_id, e.accession, + e.library_id, ba.query_taxon_id, ba.target_na_sequence_id, + ba.target_taxon_id, ba.percent_identity, ba.is_consistent, + ba.is_best_alignment, ba.is_reversed, ba.target_start, ba.target_end, + ga.sequence_id AS target_sequence_source_id, + least(ba.target_end, ga.gene_end_max) + - greatest(ba.target_start, ga.gene_start_min) + 1 + AS est_gene_overlap_length, + ba.query_bases_aligned / (query_sequence.length) + * 100 AS percent_est_bases_aligned, + ga.gene_source_id AS gene,':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM dots.BlatAlignment ba, dots.Est e, + :SCHEMA.TranscriptAttributes ga, + dots.NaSequence query_sequence + WHERE e.na_sequence_id = ba.query_na_sequence_id + AND ga.na_sequence_id = ba.target_na_sequence_id + AND least(ba.target_end, ga.gene_end_max) - greatest(ba.target_start, ga.gene_start_min) >= 0 + AND query_sequence.na_sequence_id = ba.query_na_sequence_id + AND ga.org_abbrev = ':ORG_ABBREV' + + ; + + drop table if exists :SCHEMA.:ORG_ABBREVEstAlignmentNoGeneTmp; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVEstAlignmentNoGeneTmp AS + SELECT * from :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp WHERE 1=0 UNION /* define datatype for null column */ + SELECT ba.blat_alignment_id, ba.query_na_sequence_id, e.accession, + e.library_id, ba.query_taxon_id, ba.target_na_sequence_id, + ba.target_taxon_id, ba.percent_identity, ba.is_consistent, + ba.is_best_alignment, ba.is_reversed, ba.target_start, ba.target_end, + sequence.source_id AS target_sequence_source_id, + NULL AS est_gene_overlap_length, + ba.query_bases_aligned / (query_sequence.length) + * 100 AS percent_est_bases_aligned, + NULL AS gene,':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM dots.BlatAlignment ba, dots.Est e, dots.AssemblySequence aseq, + dots.NaSequence sequence, dots.NaSequence query_sequence + WHERE e.na_sequence_id = ba.query_na_sequence_id + AND e.na_sequence_id = query_sequence.na_sequence_id + AND aseq.na_sequence_id = ba.query_na_sequence_id + AND ba.target_na_sequence_id = sequence.na_sequence_id + AND sequence.taxon_id = :TAXON_ID + AND ba.blat_alignment_id IN + ( /* set of blat_alignment_ids not in in first leg of UNION */ + /* (because they overlap no genes) */ + SELECT ba.blat_alignment_id + FROM dots.BlatAlignment ba, dots.NaSequence query_sequence, + sres.OntologyTerm so, dots.NaSequence target_sequence + WHERE query_sequence.na_sequence_id = ba.query_na_sequence_id + AND query_sequence.sequence_ontology_id = so.ontology_term_id + AND ba.target_na_sequence_id = target_sequence.na_sequence_id + AND target_sequence.taxon_id = :TAXON_ID + AND so.name = 'EST' + EXCEPT + SELECT blat_alignment_id FROM :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp) + + ; + +:CREATE_AND_POPULATE + + SELECT * FROM :SCHEMA.:ORG_ABBREVEstAlignmentNoGeneTmp + UNION + SELECT * FROM :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp + + +:DECLARE_PARTITION; + +drop table :SCHEMA.:ORG_ABBREVEstAlignmentGeneTmp; +drop table :SCHEMA.:ORG_ABBREVEstAlignmentNoGeneTmp; diff --git a/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary_ix.psql b/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary_ix.psql new file mode 100644 index 000000000..adac5eceb --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstAlignmentGeneSummary_ix.psql @@ -0,0 +1,20 @@ + + + create index EstSumm_libOverlap_ix + ON :SCHEMA.EstAlignmentGeneSummary + (library_id, percent_identity, is_consistent, + est_gene_overlap_length, percent_est_bases_aligned) + + + ; + + + + create index EstSumm_estSite_ix + ON :SCHEMA.EstAlignmentGeneSummary + (target_sequence_source_id, target_start, target_end, + library_id) + + + ; + diff --git a/Model/lib/psql/webready/orgSpecific/EstAttributes.psql b/Model/lib/psql/webready/orgSpecific/EstAttributes.psql new file mode 100644 index 000000000..4564bd263 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstAttributes.psql @@ -0,0 +1,59 @@ +:CREATE_AND_POPULATE + SELECT + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + ens.source_id, + e.seq_primer AS primer, + ens.a_count, + ens.c_count, + ens.g_count, + ens.t_count, + (ens.length - (ens.a_count + ens.c_count + ens.g_count + ens.t_count)) + AS other_count, + ens.length, + replace(l.dbest_name, '''', '-') as dbest_name, + coalesce(regexp_replace(l.vector, '^\s+$', null), 'unknown') AS vector, + coalesce(regexp_replace(l.stage, '^\s+$', null), 'unknown') AS stage, + SUBSTR(CASE + WHEN tn.name = 'Giardia lamblia' THEN 'Giardia Assemblage A isolate WB' + ELSE tn.name + END, 1, 100) AS organism, + taxon.ncbi_tax_id, + ed.name AS external_db_name, + coalesce(best.best_alignment_count, 0) AS best_alignment_count, + l.library_id, replace(l.dbest_name, '''', '-') as library_dbest_name + FROM dots.Est e, dots.Library l, sres.Taxon, sres.OntologyTerm oterm, + sres.TaxonName tn, sres.ExternalDatabase ed, + apidb.datasource ds, apidb.organism o, + sres.ExternalDatabaseRelease edr, dots.ExternalNaSequence ens + LEFT JOIN + (select query_na_sequence_id,max(ct) as best_alignment_count + from ( + SELECT query_na_sequence_id, COUNT(*) AS ct + FROM dots.BlatAlignment ba, apidb.datasource ds, apidb.organism o, + sres.externaldatabase d, sres.externaldatabaserelease r + WHERE is_best_alignment = 1 + AND ba.query_external_db_release_id = r.external_database_release_id + AND r.external_database_id = d.external_database_id + AND d.name = ds.name + AND ds.taxon_id = o.taxon_id + AND o.is_reference_strain = 1 + AND o.taxon_id = :TAXON_ID + GROUP BY target_external_db_release_id,query_na_sequence_id) t + group by query_na_sequence_id + ) best ON ens.na_sequence_id = best.query_na_sequence_id + WHERE e.na_sequence_id = ens.na_sequence_id + AND e.library_id = l.library_id + AND ens.taxon_id = tn.taxon_id + AND ens.taxon_id = taxon.taxon_id + AND tn.name_class='scientific name' + AND ens.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ens.sequence_ontology_id = oterm.ontology_term_id + AND ed.name = ds.name + and ds.taxon_id = o.taxon_id + and o.is_reference_strain = 1 + and o.taxon_id = :TAXON_ID + AND oterm.name = 'EST' +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/EstAttributes_ix.psql b/Model/lib/psql/webready/orgSpecific/EstAttributes_ix.psql new file mode 100644 index 000000000..3cad24bdc --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstAttributes_ix.psql @@ -0,0 +1,7 @@ + + + create unique index EstAttr_source_id ON :SCHEMA.EstAttributes (org_abbrev, source_id) + + + ; + diff --git a/Model/lib/psql/webready/orgSpecific/EstSequence.psql b/Model/lib/psql/webready/orgSpecific/EstSequence.psql new file mode 100644 index 000000000..58bc1dfd9 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstSequence.psql @@ -0,0 +1,21 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , ens.source_id + , ens.sequence + FROM dots.ExternalNaSequence ens + , sres.OntologyTerm oterm + , apidb.organism o + , sres.externaldatabase d + , sres.externaldatabaserelease r + , apidb.datasource ds + WHERE oterm.name = 'EST' + AND oterm.ontology_term_id = ens.sequence_ontology_id + AND ens.external_database_release_id = r.external_database_release_id + AND r.external_database_id = d.external_database_id + AND d.name = ds.name + AND ds.taxon_id = o.taxon_id + AND ds.taxon_id = :TAXON_ID + AND o.is_reference_strain = 1 +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/EstSequence_ix.psql b/Model/lib/psql/webready/orgSpecific/EstSequence_ix.psql new file mode 100644 index 000000000..d124b417b --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/EstSequence_ix.psql @@ -0,0 +1,3 @@ + create index EstSeq_ix on :SCHEMA.EstSequence (source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneAttributes.psql b/Model/lib/psql/webready/orgSpecific/GeneAttributes.psql new file mode 100644 index 000000000..7ee49f52c --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneAttributes.psql @@ -0,0 +1,119 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , ta.gene_source_id AS source_id + , gene_na_feature_id AS na_feature_id + , na_sequence_id + , is_reversed + , gene_start_min AS start_min + , gene_end_max AS end_max + , CASE strand WHEN 'forward' THEN '+' WHEN 'reverse' THEN '-' ELSE null END as strand_plus_minus + , sequence_id + , gene_name AS name + , COALESCE(aggregates.product, aggregates.transcript_product) as old_product + , COALESCE(gp.product, 'unspecified product') as product + , 'FIX ME' as product + , gene_type + , gene_ebi_biotype + , gene_id + , aggregates.is_pseudo + , organism + , species + , taxon_id + , species as genus_species + , strain + , ncbi_tax_id + , so_id + , so_term_name + , so_term_definition + , so_version + , anticodon + , external_db_name + , external_db_version + , external_db_rls_id + , chromosome + , chromosome_order_num + , sequence_type + , gene_transcript_count AS transcript_count + , gene_exon_count as exon_count + , gene_previous_ids as previous_ids + , is_deprecated + , gene_paralog_number as paralog_number + , gene_ortholog_number as ortholog_number + , gene_context_start as context_start + , gene_context_end as context_end + , orthomcl_name + -- , gene_total_hts_snps as total_hts_snps + -- , gene_hts_nonsynonymous_snps as hts_nonsynonymous_snps + -- , gene_hts_stop_codon_snps as hts_stop_codon_snps + -- , gene_hts_noncoding_snps as hts_noncoding_snps + -- , gene_hts_synonymous_snps as hts_synonymous_snps + -- , gene_hts_nonsyn_syn_ratio as hts_nonsyn_syn_ratio + , comment_string + , uniprot.uniprot_id + , uniprot.uniprot_id_internal + , gene_entrez_id as entrez_id + , representative_transcript + , gene_zoom_context_start as zoom_context_start + , gene_zoom_context_end as zoom_context_end + , cast (null as numeric) as strain_count + , ta.gene_locations as locations + FROM :SCHEMA.TranscriptAttributes ta + INNER JOIN ( + SELECT gene_source_id, MIN(is_pseudo) AS is_pseudo, + --MIN(gene_product) AS product, + 'FIX ME' AS product, + substr(STRING_AGG(transcript_product, ',' order by transcript_product), 1, 240) as transcript_product + FROM :SCHEMA.TranscriptAttributes + WHERE org_abbrev = ':ORG_ABBREV' + GROUP BY gene_source_id + ) aggregates ON ta.gene_source_id = aggregates.gene_source_id + LEFT JOIN ( + SELECT na_feature_id, + substr(string_agg(uniprot_id, ',' order by uniprot_id), 1, 240) as uniprot_id, + substr(string_agg(uniprot_id, '+or+' order by uniprot_id), 1, 240) as uniprot_id_internal + FROM ( + SELECT distinct t.gene_na_feature_id as na_feature_id, dr.primary_identifier as uniprot_id + FROM sres.DbRef dr, dots.DbRefNaFeature x, :SCHEMA.Transcriptattributes t, + sres.ExternalDatabase d, sres.ExternalDatabaseRelease r + WHERE dr.db_ref_id = x.DB_REF_ID + AND t.org_abbrev = ':ORG_ABBREV' + AND (x.na_feature_id = t.na_feature_id OR x.na_feature_id = t.gene_na_feature_id) + AND dr.external_database_release_id = r.external_database_release_id + AND r.external_database_id = d.external_database_id + AND (d.name like '%uniprot_dbxref_RSRC' + OR d.name like '%dbxref_gene2Uniprot_RSRC' + OR d.name = 'Links to Uniprot Genes' + OR d.name like '%_dbxref_uniprot_from_annotation_RSRC') + ) t + GROUP BY na_feature_id + ) uniprot ON ta.gene_na_feature_id = uniprot.na_feature_id and ta.org_abbrev = ':ORG_ABBREV' + LEFT JOIN :SCHEMA.GeneProduct gp ON ta.gene_source_id = gp.source_id and gp.org_abbrev = ':ORG_ABBREV' + ORDER BY ta.gene_source_id +:DECLARE_PARTITION; + + drop table if exists :SCHEMA.:ORG_ABBREVSpeciesInfoTmp; + + CREATE unlogged TABLE :SCHEMA.:ORG_ABBREVSpeciesInfoTmp as + SELECT genus_species, count(distinct organism) as strain_count + FROM :SCHEMA.GeneAttributes + WHERE org_abbrev = ':ORG_ABBREV' + GROUP BY genus_species + + ; + + + + UPDATE :SCHEMA.GeneAttributes ga + SET strain_count = ( + SELECT strain_count + FROM :SCHEMA.:ORG_ABBREVSpeciesInfoTmp si + WHERE si.genus_species = ga.genus_species + ) + WHERE org_abbrev = ':ORG_ABBREV' + + ; + +drop table :SCHEMA.:ORG_ABBREVSpeciesInfoTmp +; diff --git a/Model/lib/psql/webready/orgSpecific/GeneAttributes_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneAttributes_ix.psql new file mode 100644 index 000000000..2e66f521e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneAttributes_ix.psql @@ -0,0 +1,47 @@ + CREATE UNIQUE INDEX GeneAttr_srcPrj + ON :SCHEMA.GeneAttributes (org_abbrev, source_id) + ; + + CREATE INDEX GeneAttr_exon_ix + ON :SCHEMA.GeneAttributes (exon_count, source_id, project_id) + ; + + CREATE INDEX GeneAttr_loc_ix + ON :SCHEMA.GeneAttributes (na_sequence_id, start_min, end_max, is_reversed, na_feature_id, is_deprecated) + ; + + CREATE INDEX GeneAttr_feat_ix + ON :SCHEMA.GeneAttributes (na_feature_id, na_sequence_id, start_min, end_max, is_reversed) + ; + + CREATE INDEX GeneAttr_orthoname_ix ON :SCHEMA.GeneAttributes ( + orthomcl_name, source_id, taxon_id, gene_type, na_feature_id, + na_sequence_id, start_min, end_max, organism, species, + product, project_id + ) + ; + + CREATE INDEX GeneAttr_ortholog_ix + ON :SCHEMA.GeneAttributes (source_id, na_sequence_id, start_min, end_max, orthomcl_name, na_feature_id) + ; + + CREATE INDEX GeneAttr_orgsrc_ix + ON :SCHEMA.GeneAttributes (organism, source_id, na_sequence_id, start_min, end_max) + ; + + CREATE INDEX GeneAttr_prjsrc_ix + ON :SCHEMA.GeneAttributes (project_id, organism, source_id, coalesce(IS_DEPRECATED,0)) + ; + + CREATE INDEX GeneAttr_txid_ix + ON :SCHEMA.GeneAttributes (taxon_id, source_id, gene_type, na_feature_id, project_id) + ; + + CREATE INDEX GeneAttr_ids_ix + ON :SCHEMA.GeneAttributes (na_feature_id, source_id, project_id) + ; + + CREATE INDEX GeneAttr_loc_intjunc_ix + ON :SCHEMA.GeneAttributes (NA_SEQUENCE_ID, START_MIN, IS_REVERSED, END_MAX) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers.psql b/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers.psql new file mode 100644 index 000000000..fab9996a9 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers.psql @@ -0,0 +1,30 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , ta.source_id + , ta.gene_source_id + , REGEXP_REPLACE(pan.name, '_[A-Za-z0-9]+ (.+)$', '') AS strain + , gcn.haploid_number AS raw_estimate + , gcn.ref_copy_number AS ref_cn + , CASE WHEN (gcn.haploid_number < 0.01) THEN 0 + WHEN (0.01 < gcn.haploid_number AND gcn.haploid_number < 1.85) THEN 1 + ELSE round(gcn.haploid_number) END AS haploid_number + , ta.chromosome + , ta.na_sequence_id + , io.input_pan_id + , io.output_pan_id + FROM apidb.genecopynumber gcn + , study.protocolappnode pan + , :SCHEMA.TranscriptAttributes ta + , :SCHEMA.PANIo io + WHERE gcn.protocol_app_node_id = pan.protocol_app_node_id + AND gcn.na_feature_id = ta.gene_na_feature_id + AND gcn.protocol_app_node_id = io.output_pan_id + AND (ta.gene_type = 'protein coding' or ta.gene_type = 'protein coding gene') + AND ta.org_abbrev = ':ORG_ABBREV' + AND io.org_abbrev = ':ORG_ABBREV' + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers_ix.psql new file mode 100644 index 000000000..084742ec0 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneCopyNumbers_ix.psql @@ -0,0 +1,4 @@ + CREATE INDEX GeneCN_ix + ON :SCHEMA.GeneCopyNumbers (input_pan_id, na_sequence_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneGoTable.psql b/Model/lib/psql/webready/orgSpecific/GeneGoTable.psql new file mode 100644 index 000000000..25fa84491 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneGoTable.psql @@ -0,0 +1,29 @@ +:CREATE_AND_POPULATE + SELECT source_id, + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + go_id, + string_agg(transcript_source_id, ', ' order by transcript_source_id) as transcript_ids, + is_not, + max(go_term_name) as go_term_name, ontology, source, evidence_code, + reference, evidence_code_parameter, sort_key + FROM (SELECT DISTINCT ggt.gene_source_id as source_id, ga.project_id, + replace(ggt.go_id, 'GO_', 'GO:') as go_id, + ggt.transcript_source_id, + case when ggt.is_not = 'not' then 'Is not' else '' end AS is_not, + ggt.go_term_name, ggt.ontology, ggt.source, ggt.evidence_code, + ggt.reference, ggt.evidence_code_parameter, + substr(ggt.ontology, 1, 1) || replace(ggt.go_id, 'GO_', 'GO:') as sort_key + FROM :SCHEMA.GeneGoTerms ggt, :SCHEMA.GeneAttributes ga + WHERE ggt.gene_source_id = ga.source_id + and ggt.org_abbrev = ':ORG_ABBREV' + and ga.org_abbrev = ':ORG_ABBREV' + ) t + GROUP BY source_id, project_id, go_id, is_not, ontology, + source, evidence_code, reference, evidence_code_parameter, sort_key + ORDER BY source_id, ontology, go_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneGoTable_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneGoTable_ix.psql new file mode 100644 index 000000000..5e34d5288 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneGoTable_ix.psql @@ -0,0 +1,5 @@ + create index ggtab_ix ON :SCHEMA.GeneGoTable + (source_id, project_id, go_id, transcript_ids, is_not, go_term_name, + ontology, source, evidence_code, reference, evidence_code_parameter, sort_key) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneGoTerms.psql b/Model/lib/psql/webready/orgSpecific/GeneGoTerms.psql new file mode 100644 index 000000000..87fec450b --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneGoTerms.psql @@ -0,0 +1,37 @@ +:CREATE_AND_POPULATE + with root_term + as (select ontology_term_id, + cast(initcap(replace(name, '_', ' ')) as varchar(20)) as ontology + from sres.OntologyTerm + where source_id in ('GO_0008150','GO_0003674','GO_0005575')) + select ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + gf.source_id as gene_source_id, t.source_id as transcript_source_id, taf.aa_sequence_id, + cast (CASE ga.is_not WHEN 0 THEN '' WHEN 1 THEN 'not' ELSE ga.is_not::varchar END as varchar(3)) as is_not, ns.taxon_id, + cast (gt.source_id as varchar(20)) as go_id, + gt.ontology_term_id as go_term_id, rt.ontology, + cast(gt.name as varchar(250)) as go_term_name, + cast(gail.name as varchar(24)) as source, + cast(gec.name as varchar(12)) as evidence_code, + cast(gaiec.reference as varchar(250)) as reference, + cast(gaiec.evidence_code_parameter as varchar(80))as evidence_code_parameter + from dots.GeneFeature gf, dots.Transcript t, dots.TranslatedAaFeature taf, dots.GoAssociation ga, + dots.GoAssociationInstance gai, dots.GoAssociationInstanceLoe gail, + dots.GoAssocInstEvidCode gaiec, sres.OntologyTerm gec, dots.NaSequence ns, + sres.OntologyTerm gt LEFT JOIN root_term rt ON gt.ancestor_term_id = rt.ontology_term_id + where t.parent_id = gf.na_feature_id + and gf.na_sequence_id = ns.na_sequence_id + and ns.taxon_id = :TAXON_ID + and t.na_feature_id = taf.na_feature_id + and taf.aa_sequence_id = ga.row_id + and ga.table_id = (select table_id + from core.TableInfo + where name = 'TranslatedAASequence') + and ga.go_term_id = gt.ontology_term_id + and ga.go_association_id = gai.go_association_id + and gai.go_assoc_inst_loe_id = gail.go_assoc_inst_loe_id + and gai.go_association_instance_id = gaiec.go_association_instance_id + and gaiec.go_evidence_code_id = gec.ontology_term_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneGoTerms_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneGoTerms_ix.psql new file mode 100644 index 000000000..d8456c06e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneGoTerms_ix.psql @@ -0,0 +1,6 @@ + create index ggt_ix ON :SCHEMA.GeneGoTerms + (gene_source_id, transcript_source_id, ontology, go_id, go_term_id, + go_term_name, source, evidence_code, reference, + evidence_code_parameter, aa_sequence_id, is_not) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneId.psql b/Model/lib/psql/webready/orgSpecific/GeneId.psql new file mode 100644 index 000000000..ec39a78b7 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneId.psql @@ -0,0 +1,288 @@ +drop table if exists :SCHEMA.:ORG_ABBREVGeneFeatureTmp; + +create unlogged table :SCHEMA.:ORG_ABBREVGeneFeatureTmp as +(select gf.na_feature_id + , gf.na_sequence_id + , gf.external_database_release_id + , gf.is_predicted + , gf.source_id + from dots.genefeature gf + , dots.nasequence nas + where gf.na_sequence_id = nas.na_sequence_id + and nas.taxon_id = :TAXON_ID +) +; + +create index :ORG_ABBREV_GeneFeatureTmp_na_feature_id ON :SCHEMA.:ORG_ABBREVGeneFeatureTmp (na_feature_id) +; + + +:CREATE_AND_POPULATE + SELECT substr(mapping.id, 1, 100) as id, mapping.gene, cast (0 as NUMERIC(1)) as unique_mapping, + SUBSTR(string_agg(distinct union_member,'; ' order by union_member), 1, 100) as union_member, + SUBSTR(string_agg(distinct database_name,'; ' order by database_name), 1, 200) as database_name, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM (SELECT substr(t.protein_id, 1, nullif(position('.' IN t.protein_id) - 1, -1)) AS id, + gf.source_id AS gene, + 'Transcript.protein_id before dot' as union_member, ed.name as database_name /* dots.Transcript.protein_id, trimmed at period */ + FROM dots.Transcript t, :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + WHERE t.parent_id = gf.na_feature_id + AND substr(t.protein_id, 1, nullif(position('.' IN t.protein_id) - 1, -1)) IS NOT NULL + AND gf.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + UNION + SELECT t.protein_id AS id, + gf.source_id AS gene, + 'Transcript.protein_id' as union_member, ed.name as database_name /* dots.Transcript.protein_id */ + FROM dots.Transcript t, :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + WHERE t.parent_id = gf.na_feature_id + AND t.protein_id IS NOT NULL + AND gf.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'DbRef.primary_identifier' as union_member, ed.name as database_name /* sres.DbRef.primary_identifier */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND gf.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + -- CHECK AND FIX + --AND NOT (ed.name in ('NRDB_gb_dbXRefBySeqIdentity','NRDB_ref_dbXRefBySeqIdentity') + -- AND NOT REGEXP_LIKE (dr.primary_identifier, '\D') ) + AND NOT ed.name in ('NRDB_gb_dbXRefBySeqIdentity','NRDB_ref_dbXRefBySeqIdentity') + AND NOT edr.id_type = 'synonym' + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'DbRef.primary_identifier on Transcript' as union_member, ed.name as database_name /* sres.DbRef.primary_identifier */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.Transcript t, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND gf.na_feature_id = t.parent_id + AND t.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + -- CHECK AND FIX + --AND NOT (ed.name in ('NRDB_gb_dbXRefBySeqIdentity','NRDB_ref_dbXRefBySeqIdentity') + -- AND NOT REGEXP_LIKE (dr.primary_identifier, '\D') ) + AND NOT ed.name in ('NRDB_gb_dbXRefBySeqIdentity','NRDB_ref_dbXRefBySeqIdentity') + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'DbRef.primary_identifier on Gene' as union_member, ed.name as database_name + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND gf.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ed.name ='RefSeq_gene_name' + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'VectorBase alternate names' as union_member, ed.name as database_name /* sres.DbRef.primary_identifier */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND gf.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + and ed.name in ('VB_Community_Annotation', 'VB_Community_Symbol') /* vectorbase alt names */ + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'synonym' as union_member, ed.name as database_name + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND gf.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND edr.id_type = 'synonym' + UNION + SELECT dr.secondary_identifier AS id, + gf.source_id AS gene, + 'DbRef.secondary_identifier' as union_member, ed.name as database_name /* sres.DbRef.secondary_identifier */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.DbRefNaFeature drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.secondary_identifier IS NOT NULL + AND gf.na_feature_id = drnf.na_feature_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ed.name IN ('NRDB_gb_dbXRefBySeqIdentity', + 'NRDB_pdb_dbXRefBySeqIdentity', + 'NRDB_ref_dbXRefBySeqIdentity', + 'NRDB_sp_dbXRefBySeqIdentity', + 'Predicted protein structures','Pf_predictedProteinStructures_RSRC', + 'GenBank') + UNION + SELECT dr.primary_identifier AS id, + gf.source_id AS gene, + 'genbank DbRef.primary_identifier' as union_member, ed.name as database_name /* sres.DbRef.primary_identifier for Genbank records */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.Transcript t, dots.DbRefNaSequence drns, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE gf.na_feature_id = t.parent_id + AND t.na_sequence_id = drns.na_sequence_id + AND drns.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ed.name = 'GenBank' + UNION + SELECT pred_loc.feature_source_id AS id, + gene_loc.feature_source_id AS gene, + 'overlapping predicted gene source_id' as union_member, ed.name as database_name /* dots.genefeature.source_id for predicted genes that overlap */ + FROM apidb.FeatureLocation gene_loc, apidb.FeatureLocation pred_loc, + sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed, + dots.nasequence nas + WHERE pred_loc.feature_type = 'GenePrediction' + AND gene_loc.feature_type = 'GeneFeature' + AND pred_loc.na_sequence_id = gene_loc.na_sequence_id + AND gene_loc.na_sequence_id = nas.na_sequence_id + and nas.taxon_id = :TAXON_ID + AND gene_loc.start_min <= pred_loc.end_max + AND gene_loc.end_max >= pred_loc.start_min + AND pred_loc.is_reversed = gene_loc.is_reversed + AND pred_loc.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + UNION + SELECT ng.name AS id, gf.source_id AS gene, + 'NaGene' as union_member, ed.name as database_name /* dots.NaGene.name */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.NaFeatureNaGene nfng, dots.NaGene ng, + sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + WHERE gf.na_feature_id = nfng.na_feature_id + AND ng.na_gene_id = nfng.na_gene_id + AND gf.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + UNION + SELECT source_id AS id, source_id AS gene, + 'same ID' as union_member, ed.name as database_name /* same ID (reflexive mapping) */ + FROM :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, + sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + WHERE gf.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + UNION + SELECT n.name AS id, gf.source_id AS gene, + 'gene name' as union_member, d.name as database_name -- apidb.GeneFeatureName.name + from :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, sres.ExternalDatabaseRelease r, sres.ExternalDatabase d, + ( select na_feature_id, name + from apidb.GeneFeatureName + where is_preferred = 1 + EXCEPT + -- suppress gene/name associations from the *DELETED_RSRC databases + select gfn.na_feature_id, gfn.name + from apidb.GeneFeatureName gfn, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + where gfn.external_database_release_id = edr.external_database_release_id + and ed.external_database_id = edr.external_database_id + and ed.name like '%DELETED_RSRC' + ) n + where n.na_feature_id = gf.na_feature_id + and gf.external_database_release_id = r.external_database_release_id + and r.external_database_id = d.external_database_id + UNION + select dr.primary_identifier as id, + gf.source_id as gene, + 'AA feature DbRef primary ID' as union_member, + ed.name as database_name /* DbRef.primary_identifier mapped through DbRefAaFeature */ + from :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.Transcript t, dots.TranslatedAaFeature taf, + dots.DbRefAaFeature draf, sres.DbRef dr, + sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + where gf.na_feature_id = t.parent_id + and t.na_feature_id = taf.na_feature_id + and taf.aa_feature_id = draf.aa_feature_id + and draf.db_ref_id = dr.db_ref_id + and dr.external_database_release_id = edr.external_database_release_id + and edr.external_database_id = ed.external_database_id + and ed.name + not in ('INTERPRO', 'PFAM', 'PIRSF', 'PRODOM', 'PROSITEPROFILES', + 'SMART', 'SUPERFAMILY', 'TIGRFAM', 'CDD','HAMAP','HMMPANTHER', + 'PRINTS','SCANPROSITE','SFLD') + ) mapping, + :SCHEMA.:ORG_ABBREVGeneFeatureTmp gf, dots.NaSequence ns + WHERE mapping.gene = gf.source_id + AND gf.na_sequence_id = ns.na_sequence_id + AND (gf.is_predicted != 1 OR gf.is_predicted is null) + GROUP BY mapping.id, mapping.gene +:DECLARE_PARTITION; + + + + INSERT INTO :SCHEMA.GeneId + (id, gene, unique_mapping, union_member, database_name, project_id, org_abbrev, modification_date) + WITH munge + AS (SELECT DISTINCT + regexp_replace(id, '\.\d\d?$', '') as id, + gene, unique_mapping, union_member, database_name + FROM :SCHEMA.GeneId + WHERE org_abbrev = ':ORG_ABBREV' + -- CHECK AND FIX + --WHERE regexp_like(id, '(.*)\.\d\d?$') + ) + SELECT id, gene, 0 as unique_mapping, 'base ID' as union_member, database_name, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM munge + WHERE id NOT IN (SELECT id FROM :SCHEMA.GeneId where org_abbrev = ':ORG_ABBREV') + + ; + + + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVOneGeneIdsTmp (lower_id) AS + SELECT lower_id + FROM (SELECT DISTINCT lower(id) as lower_id, gene + FROM :SCHEMA.GeneId where org_abbrev = ':ORG_ABBREV' + ) t + GROUP BY lower_id + HAVING count(*) = 1 + + ; + + + + CREATE UNIQUE INDEX :ORG_ABBREV_gix_pk ON :SCHEMA.:ORG_ABBREVOneGeneIdsTmp (lower_id) + + ; + + + + UPDATE :SCHEMA.GeneId + SET unique_mapping = 1 + WHERE id = gene + and org_abbrev = ':ORG_ABBREV' + ; + + + + UPDATE :SCHEMA.GeneId + SET unique_mapping = 1 + WHERE lower(id) IN (select lower_id from :SCHEMA.:ORG_ABBREVOneGeneIdsTmp) + and org_abbrev = ':ORG_ABBREV' + ; + + +drop table :SCHEMA.:ORG_ABBREVOneGeneIdsTmp +; +drop table :SCHEMA.:ORG_ABBREVGeneFeatureTmp +; diff --git a/Model/lib/psql/webready/orgSpecific/GeneId_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneId_ix.psql new file mode 100644 index 000000000..5e3f12dce --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneId_ix.psql @@ -0,0 +1,15 @@ + CREATE INDEX GeneId_gene_idx ON :SCHEMA.GeneId (gene, id) + ; + + CREATE INDEX GeneId_id_idx ON :SCHEMA.GeneId (id, gene) + ; + + CREATE INDEX GeneId_uniqid_idx ON :SCHEMA.GeneId (unique_mapping, id, gene) + ; + + CREATE INDEX GeneId_lowid_idx ON :SCHEMA.GeneId (lower(id), gene) + ; + + CREATE INDEX GeneId_uniqlowid_idx ON :SCHEMA.GeneId (unique_mapping, lower(id), gene) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneIntronJunction.psql b/Model/lib/psql/webready/orgSpecific/GeneIntronJunction.psql new file mode 100644 index 000000000..39837511c --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneIntronJunction.psql @@ -0,0 +1,192 @@ +drop table if exists :SCHEMA.:ORG_ABBREVDistinctAnnotatedIntronsTmp; + +drop table if exists :SCHEMA.:ORG_ABBREVGIJtmp; + +/* + Distinct Annotated Intron Locations +*/ +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVDistinctAnnotatedIntronsTmp ( + na_sequence_id, + start_min, + end_max, + is_reversed, + feature_type +) AS + SELECT il.na_sequence_id + , il.start_min + , il.end_max + , il.is_reversed + , 'Intron' as feature_type + FROM apidb.IntronLocation il + , dots.nasequence s + WHERE il.na_sequence_id = s.na_sequence_id + AND s.taxon_id = :TAXON_ID + GROUP by il.na_sequence_id + , il.start_min + , il.end_max + , il.is_reversed + + ; + +CREATE UNIQUE INDEX :ORG_ABBREV_annottmpnew_pk_ix ON :SCHEMA.:ORG_ABBREVDistinctAnnotatedIntronsTmp (na_sequence_id,start_min,end_max,is_reversed,feature_type) + + ; + +/* + this table does the heavy lifting for gene intron junction calculations +*/ +create UNLOGGED table :SCHEMA.:ORG_ABBREVGIJtmp ( + NA_SEQUENCE_ID NUMERIC(10), + SEQUENCE_SOURCE_ID VARCHAR(100), + SEGMENT_START NUMERIC, + SEGMENT_END NUMERIC, + TOTAL_UNIQUE NUMERIC, + TOTAL_ISRPM NUMERIC, + IS_REVERSED NUMERIC(1), + INTRON_FEATURE_ID VARCHAR(200), + MATCHES_GENE_STRAND NUMERIC, + GENE_SOURCE_ID VARCHAR(100), + GENE_NA_FEATURE_ID NUMERIC, + ANNOTATED_INTRON VARCHAR(10) +) + ; + + +DO $$ + DECLARE + iter_length numeric := 4999 :PLPGSQL_DELIM + i_first_pos numeric := 1 :PLPGSQL_DELIM + i_last_pos numeric := i_first_pos + iter_length :PLPGSQL_DELIM + idlist RECORD :PLPGSQL_DELIM + BEGIN + FOR idlist IN ( + SELECT na_sequence_id + , source_id as sequence_source_id + , length + , taxon_id + , CASE WHEN step_mult > 500000 THEN 500000 ELSE step_mult END as seq_step_mult + FROM ( + SELECT gs.na_sequence_id + , gs.source_id + , gs.length + , gs.taxon_id + , 25000 * (1 + floor(gs.length/count(*))) as step_mult + FROM apidb.intronjunction ij + , dots.nasequence gs + WHERE gs.na_sequence_id = ij.na_sequence_id + AND gs.taxon_id = :TAXON_ID + GROUP BY gs.na_sequence_id, gs.length, gs.taxon_id + ) + ) + LOOP + iter_length := idlist.seq_step_mult :PLPGSQL_DELIM + i_first_pos := 1 :PLPGSQL_DELIM + i_last_pos := i_first_pos + iter_length :PLPGSQL_DELIM + WHILE i_first_pos < idlist.length + LOOP + INSERT INTO :SCHEMA.:ORG_ABBREVGIJtmp + SELECT DISTINCT + junc.*, + CASE + WHEN last_value(ga.is_reversed) over w1 = junc.is_reversed + THEN 1 + ELSE 0 + END as matches_gene_strand, + last_value(ga.source_id) over w1 as gene_source_id, + last_value(ga.na_feature_id) over w1 as gene_na_feature_id, + CASE ag.feature_type WHEN 'Intron' THEN 'Yes' ELSE 'No' END as annotated_intron + FROM ( + SELECT ij.na_sequence_id + , idlist.sequence_source_id + , ij.segment_start + , ij.segment_end + , sum(ij.unique_reads) as total_unique + , round(sum(ij.unique_reads * je.multiplier),2) as total_isrpm + , ij.is_reversed + , idlist.sequence_source_id || '_' || ij.segment_start || '_' || ij.segment_end || '_' || ij.is_reversed as intron_feature_id + FROM apidb.intronjunction ij + , :SCHEMA.JunctionToCoverageProfileMapping je + WHERE ij.na_sequence_id = idlist.na_sequence_id + AND ij.segment_start between i_first_pos and i_last_pos + AND ij.unique_reads >= 1 + AND je.junctions_pan_id = ij.protocol_app_node_id + AND je.multiplier < 20 + AND je.org_abbrev = ':ORG_ABBREV' + GROUP BY ij.na_sequence_id + , ij.segment_start + , ij.segment_end + , ij.is_reversed + , idlist.sequence_source_id + ) junc + LEFT JOIN :SCHEMA.JunctionGeneLocation ga + ON junc.na_sequence_id = ga.na_sequence_id + AND ga.org_abbrev = ':ORG_ABBREV' + AND junc.segment_start >= ga.start_min + AND junc.segment_end <= ga.end_max + AND junc.is_reversed = ga.is_reversed + LEFT JOIN :SCHEMA.:ORG_ABBREVDistinctAnnotatedIntronsTmp ag + ON junc.na_sequence_id = ag.na_sequence_id + AND junc.segment_start = ag.start_min + AND junc.segment_end = ag.end_max + AND junc.is_reversed = ag.is_reversed + WHERE (junc.total_unique >= 1 or ag.feature_type = 'Intron') + WINDOW w1 AS ( + PARTITION BY junc.na_sequence_id,junc.sequence_source_id,junc.segment_start,junc.segment_end, junc.is_reversed, junc.intron_feature_id,junc.total_unique, junc.total_isrpm,ag.feature_type + ORDER BY ga.total_expression ASC + ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING + ) + :PLPGSQL_DELIM + commit :PLPGSQL_DELIM + i_first_pos := i_last_pos + 1 :PLPGSQL_DELIM + i_last_pos := i_first_pos + iter_length :PLPGSQL_DELIM + END LOOP :PLPGSQL_DELIM + END LOOP :PLPGSQL_DELIM + END :PLPGSQL_DELIM +$$ LANGUAGE PLPGSQL :PLPGSQL_DELIM + +; + + +create index :ORG_ABBREV_gijtmp_gnscid_ix on :SCHEMA.:ORG_ABBREVGIJtmp (gene_source_id) + + ; + +:CREATE_AND_POPULATE + SELECT + junc.* + , CASE + WHEN maxv.gene_source_id is not null and maxv.max_isrpm > 0 + THEN round((junc.total_isrpm / maxv.max_isrpm) * 100,2) + ELSE null + END as percent_max + , CASE + WHEN maxv.gene_source_id is not null + THEN 1 + ELSE 0 + END as contained + , :TAXON_ID as taxon_id +-- , CAST (null as numeric(10)) as upstream_gene_id +-- , CAST (null as numeric) as upstream_distance +-- , CAST (null as numeric(10)) as downstream_gene_id +-- , CAST (null as numeric) as downstream_distance + , ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + FROM + :SCHEMA.:ORG_ABBREVGIJtmp junc LEFT JOIN + ( + SELECT gene_source_id,max(total_unique) as max_unique, max(total_isrpm) as max_isrpm + FROM :SCHEMA.:ORG_ABBREVGIJtmp + WHERE gene_source_id is not null + GROUP BY gene_source_id + ) maxv ON junc.gene_source_id = maxv.gene_source_id +:DECLARE_PARTITION + ; + + + +drop table :SCHEMA.:ORG_ABBREVDistinctAnnotatedIntronsTmp + ; + +drop table :SCHEMA.:ORG_ABBREVGIJtmp + ; diff --git a/Model/lib/psql/webready/orgSpecific/GeneIntronJunction_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneIntronJunction_ix.psql new file mode 100644 index 000000000..22bd60369 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneIntronJunction_ix.psql @@ -0,0 +1,12 @@ +create index gijnew_loc_ix on :SCHEMA.GeneIntronJunction (na_sequence_id,segment_start,segment_end,is_reversed) + ; + +create index gijnew_gnscid_ix on :SCHEMA.GeneIntronJunction (intron_feature_id) + ; + +create index gijnew_txnloc_ix + on :SCHEMA.GeneIntronJunction + (taxon_id, na_sequence_id, segment_start, segment_end, is_reversed, + total_unique, total_isrpm, annotated_intron) + + ; diff --git a/Model/lib/psql/webready/orgSpecific/GeneLocations.psql b/Model/lib/psql/webready/orgSpecific/GeneLocations.psql new file mode 100644 index 000000000..ad129a3f1 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneLocations.psql @@ -0,0 +1,20 @@ +:CREATE_AND_POPULATE + SELECT source_id, + string_agg(location, '; ' order by is_top_level desc) as locations, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM (SELECT fl.feature_source_id as source_id, fl.is_top_level, + fl.sequence_source_id || ':' + || trim(to_char(fl.start_min,'999,999,999')) || '..' + || trim(to_char(fl.end_max,'999,999,999')) || '(' + || CASE coalesce(fl.is_reversed, 0) WHEN 0 THEN '+' WHEN 1 THEN '-' ELSE fl.is_reversed::varchar END + || ')' as location + FROM apidb.FeatureLocation fl, dots.NaSequence ns + WHERE fl.feature_type = 'GeneFeature' + AND fl.na_sequence_id = ns.na_sequence_id + AND ns.taxon_id = :TAXON_ID + ) t + GROUP BY source_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneLocations_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneLocations_ix.psql new file mode 100644 index 000000000..006a1a1df --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneLocations_ix.psql @@ -0,0 +1,4 @@ + create index gloc_ix + on :SCHEMA.GeneLocations (source_id, locations) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneMaxJunction.psql b/Model/lib/psql/webready/orgSpecific/GeneMaxJunction.psql new file mode 100644 index 000000000..593fbbaee --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneMaxJunction.psql @@ -0,0 +1,69 @@ +drop table if exists :SCHEMA.:ORG_ABBREVGeneMaxIntronGIJTmp + ; + +create unlogged table :SCHEMA.:ORG_ABBREVGeneMaxIntronGIJTmp ( + protocol_app_node_id NUMERIC(10), + gene_source_id VARCHAR(200), + max_unique NUMERIC, + max_isrpm NUMERIC, + sum_unique NUMERIC, + sum_isrpm NUMERIC, + avg_unique NUMERIC, + avg_isrpm NUMERIC, + project_id VARCHAR(20), + org_abbrev VARCHAR(20), + modification_date TIMESTAMP + ) +; + + +DO $$ + DECLARE + idlist RECORD :PLPGSQL_DELIM + BEGIN + FOR idlist IN ( + SELECT DISTINCT na_sequence_id + FROM :SCHEMA.JunctionGeneLocation + WHERE org_abbrev = ':ORG_ABBREV' + ) + LOOP + INSERT INTO :SCHEMA.:ORG_ABBREVGeneMaxIntronGIJTmp ( + SELECT j.protocol_app_node_id + , ga.source_id + , max(unique_reads) as max_unique + , max(round(j.unique_reads * mult.multiplier,2)) as max_isrpm + , sum(unique_reads) as sum_unique + , sum(round(j.unique_reads * mult.multiplier,2)) as sum_isrpm + , avg(unique_reads) as avg_unique + , avg(round(j.unique_reads * mult.multiplier,2)) as avg_isrpm + , ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + FROM apidb.intronjunction j + , :SCHEMA.JunctionGeneLocation ga + , :SCHEMA.JunctionToCoverageProfileMapping mult + WHERE ga.na_sequence_id = idlist.na_sequence_id + AND ga.na_sequence_id = j.na_sequence_id + AND ga.start_min <= j.segment_start + AND ga.end_max >= j.segment_end + AND ga.is_reversed = j.is_reversed + AND j.protocol_app_node_id = mult.junctions_pan_id + AND ga.org_abbrev = ':ORG_ABBREV' + AND mult.org_abbrev = ':ORG_ABBREV' + GROUP BY j.protocol_app_node_id, ga.source_id + ) :PLPGSQL_DELIM + commit :PLPGSQL_DELIM + END LOOP :PLPGSQL_DELIM + END :PLPGSQL_DELIM +$$ LANGUAGE PLPGSQL :PLPGSQL_DELIM + ; + + + +:CREATE_AND_POPULATE +select * from :SCHEMA.:ORG_ABBREVGeneMaxIntronGIJTmp +:DECLARE_PARTITION; + + +drop table :SCHEMA.:ORG_ABBREVGeneMaxIntronGIJTmp + ; diff --git a/Model/lib/psql/webready/orgSpecific/GeneMaxJunction_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneMaxJunction_ix.psql new file mode 100644 index 000000000..8915a4a6e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneMaxJunction_ix.psql @@ -0,0 +1,3 @@ +CREATE INDEX GnMxIntGIJ_ix on :SCHEMA.GeneMaxJunction (gene_source_id,protocol_app_node_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneModelDump.psql b/Model/lib/psql/webready/orgSpecific/GeneModelDump.psql new file mode 100644 index 000000000..79eab5a4d --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneModelDump.psql @@ -0,0 +1,33 @@ +:CREATE_AND_POPULATE + SELECT source_id, project_id, sequence_id, gm_start,gm_end, type, is_reversed, + string_agg(transcript_id, ',' ORDER BY transcript_id) AS transcript_ids, + ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM ( + SELECT distinct ta.source_id as transcript_id, ta.gene_source_id as source_id, ta.project_id, + ta.sequence_id,gm.start_min as gm_start, gm.end_max as gm_end, + gm.type, gl.is_reversed + FROM + apidb.FeatureLocation gl, dots.NaSequence s, + :SCHEMA.TranscriptAttributes ta, + ( + SELECT CASE el.feature_type WHEN 'ExonFeature' THEN 'Exon' ELSE el.feature_type END as type, + el.parent_id as na_feature_id, el.start_min as start_min, el.end_max as end_max + FROM apidb.FeatureLocation el, dots.nasequence nas + WHERE el.feature_type in ('ExonFeature','five_prime_UTR', 'three_prime_UTR','CDS','Intron') + AND el.is_top_level = 1 + AND nas.na_sequence_id = el.na_sequence_id + and nas.taxon_id = :TAXON_ID + ) gm + WHERE gm.na_feature_id = ta.na_feature_id + AND s.na_sequence_id = gl.na_sequence_id + AND ta.na_feature_id = gl.na_feature_id + AND gl.is_top_level = 1 + AND ta.org_abbrev = ':ORG_ABBREV' + ) t + GROUP BY source_id, project_id, sequence_id, + gm_start, gm_end, type, is_reversed + ORDER BY CASE WHEN is_reversed = 1 THEN -1 * gm_start ELSE gm_start END + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneModelDump_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneModelDump_ix.psql new file mode 100644 index 000000000..2a73000f9 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneModelDump_ix.psql @@ -0,0 +1,5 @@ + create index gmd_ix + on :SCHEMA.GeneModelDump + (source_id, project_id, sequence_id, gm_start, gm_end, is_reversed, type, transcript_ids) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup.psql b/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup.psql new file mode 100644 index 000000000..0876f0b3a --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup.psql @@ -0,0 +1,21 @@ +DROP TABLE if exists :SCHEMA.:ORG_ABBREVGeneOrthologGroupTmp; +/* + +ATTENTION: This table is empty. We will populate it in the comparative graph +We are creating them it so that they it is partitioned. +*/ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVGeneOrthologGroupTmp ( + gene_id VARCHAR(80), + group_id VARCHAR(16), + project_id varchar(20), + org_abbrev varchar(20), + modification_date timestamp +); + +:CREATE_AND_POPULATE +SELECT ogt.* from :SCHEMA.:ORG_ABBREVGeneOrthologGroupTmp ogt +:DECLARE_PARTITION; + + +DROP TABLE :SCHEMA.:ORG_ABBREVGeneOrthologGroupTmp; diff --git a/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup_ix.psql new file mode 100644 index 000000000..c7ce2fd90 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneOrthologGroup_ix.psql @@ -0,0 +1,2 @@ +alter table :SCHEMA.GeneOrthologGroup + ADD CONSTRAINT GeneOrthologGroup_pk PRIMARY KEY (gene_id, group_id); diff --git a/Model/lib/psql/webready/orgSpecific/GeneProduct.psql b/Model/lib/psql/webready/orgSpecific/GeneProduct.psql new file mode 100644 index 000000000..ed8670eda --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneProduct.psql @@ -0,0 +1,130 @@ + +create unlogged table :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp as +(select gf.na_feature_id + , gf.source_id + , gf.product + from dots.genefeature gf + , dots.nasequence nas + where gf.na_sequence_id = nas.na_sequence_id + and nas.taxon_id = :TAXON_ID +) +; + + +:CREATE_AND_POPULATE + with gfp_preferred + as (select source_id, + substr(STRING_AGG(product, ', ' order by product), 1, 4000) as product, + count(*) as value_count + from (select distinct gf.source_id, gfp.product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf, apidb.GeneFeatureProduct gfp + where gfp.na_feature_id = gf.na_feature_id + and gfp.is_preferred = 1) tmp + group by source_id), + gfp_any + as (select source_id, + substr(STRING_AGG(product, ', ' order by product), 1, 4000) as product, + count(*) as value_count + from (select distinct gf.source_id, gfp.product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf, apidb.GeneFeatureProduct gfp + where gfp.na_feature_id = gf.na_feature_id) tmp + group by source_id), + tp_preferred + as (select source_id, + substr(STRING_AGG(product, ', ' order by product), 1, 4000) as product, + count(*) as value_count + from (select distinct gf.source_id, tp.product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf, dots.Transcript t, apidb.TranscriptProduct tp + where t.parent_id = gf.na_feature_id + and tp.na_feature_id = t.na_feature_id + and tp.is_preferred = 1) tmp + group by source_id), + gf_product + as (select source_id, product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf + where product is not null), + tp_any + as (select source_id, + substr(STRING_AGG(product, ', ' order by product), 1, 4000) as product, + count(*) as value_count + from (select distinct gf.source_id, tp.product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf, dots.Transcript t, apidb.TranscriptProduct tp + where t.parent_id = gf.na_feature_id + and tp.na_feature_id = t.na_feature_id) tmp + group by source_id), + t_product + as (select source_id, + substr(STRING_AGG(product, ', ' order by product), 1, 4000) as product, + count(*) as value_count + from (select gf.source_id, t.product + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf, dots.Transcript t + where t.parent_id = gf.na_feature_id + and t.product is not null) tmp + group by source_id) + select gf.source_id, + coalesce(gfp_preferred.product, gfp_any.product, tp_preferred.product, + gf.product, tp_any.product, t_product.product) + as product, + case + when gfp_preferred.product is not null + then gfp_preferred.value_count + when gfp_any.product is not null + then gfp_any.value_count + when tp_preferred.product is not null + then tp_preferred.value_count + when gf.product is not null + then 1 + when tp_any.product is not null + then tp_any.value_count + when t_product.product is not null + then t_product.value_count + else 0 -- 'unspecified product' + end + as value_count, + case + when gfp_preferred.product is not null + then 1 + when gfp_any.product is not null + then 2 + when tp_preferred.product is not null + then 3 + when gf.product is not null + then 4 + when tp_any.product is not null + then 5 + when t_product.product is not null + then 6 + else 7 + end + as source_rule, + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date + from :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp gf + LEFT JOIN gfp_preferred ON gf.source_id = gfp_preferred.source_id + LEFT JOIN gfp_any ON gf.source_id = gfp_any.source_id + LEFT JOIN tp_preferred ON gf.source_id = tp_preferred.source_id + LEFT JOIN gf_product ON gf.source_id = gf_product.source_id + LEFT JOIN tp_any ON gf.source_id = tp_any.source_id + LEFT JOIN t_product ON gf.source_id = t_product.source_id +:DECLARE_PARTITION; + + +/* + + +Dependent Step + gene/protein_id -> uniprot accession and product + +This step will assume it has all uniprot accessions assined to the protein and uniprot product name and have pfams in our hand + + + + + +*/ + +delete from :SCHEMA.GeneProduct where product is null and org_abbrev = ':ORG_ABBREV'; + +drop table :SCHEMA.:ORG_ABBREVGeneFeatureProductTmp +; diff --git a/Model/lib/psql/webready/orgSpecific/GeneProduct_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneProduct_ix.psql new file mode 100644 index 000000000..910d96552 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneProduct_ix.psql @@ -0,0 +1,2 @@ + CREATE INDEX GeneProduct_gene_idx ON :SCHEMA.GeneProduct (source_id, product) + ; diff --git a/Model/lib/psql/webready/orgSpecific/GeneSummaryFilter.psql b/Model/lib/psql/webready/orgSpecific/GeneSummaryFilter.psql new file mode 100644 index 000000000..02e0664c2 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GeneSummaryFilter.psql @@ -0,0 +1,13 @@ +:CREATE_AND_POPULATE + SELECT CAST(filter_name AS VARCHAR(80)) AS filter_name,':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM (SELECT species as filter_name + FROM :SCHEMA.GeneAttributes + WHERE org_abbrev = ':ORG_ABBREV' + UNION + SELECT organism as filter_name + FROM :SCHEMA.GeneAttributes + WHERE org_abbrev = ':ORG_ABBREV') t + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GeneSummaryFilter_ix.psql b/Model/lib/psql/webready/orgSpecific/GeneSummaryFilter_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes.psql b/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes.psql new file mode 100644 index 000000000..e8f448107 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes.psql @@ -0,0 +1,71 @@ +:CREATE_AND_POPULATE + SELECT + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + SUBSTR(sequence.source_id, 1, 60) AS source_id, sequence.a_count, + sequence.c_count, sequence.g_count, sequence.t_count, + (sequence.length + - (sequence.a_count + sequence.c_count + sequence.g_count + sequence.t_count)) + AS other_count, + sequence.length, + to_char((sequence.a_count + sequence.t_count) / sequence.length * 100, '99.99') + AS at_percent, + SUBSTR(tn.name, 1, 100) AS organism, + taxon.ncbi_tax_id, + taxon.taxon_id, + CASE WHEN sequence.description IS NULL THEN SUBSTR(tn.name, 1, 100) + ELSE SUBSTR(sequence.description, 1, 400) + END AS sequence_description, + SUBSTR(genbank.genbank_accession, 1, 20) AS genbank_accession, + SUBSTR(db.database_version, 1, 30) AS database_version, db.database_name, + SUBSTR(sequence.chromosome, 1, 20) AS chromosome, + sequence.external_database_release_id, sequence.sequence_ontology_id, + sequence.chromosome_order_num, so.source_id as so_id, so.name as sequence_type, + 1 as is_top_level, + sequence.na_sequence_id, organism.genome_source, + organism.name_for_filenames, coalesce(msa.has_msa, 0) as has_msa + FROM sres.Taxon LEFT JOIN apidb.Organism ON taxon.taxon_id = :TAXON_ID and taxon.taxon_id = organism.taxon_id, + sres.OntologyTerm so, + ( SELECT na_sequence_id, source_id, length, chromosome, chromosome_order_num, taxon_id, description, + a_count, c_count, g_count, t_count, external_database_release_id, sequence_ontology_id + FROM dots.ExternalNaSequence + WHERE taxon_id = :TAXON_ID + ) sequence + LEFT JOIN + (SELECT drns.na_sequence_id, max(dr.primary_identifier) AS genbank_accession + FROM dots.dbrefNaSequence drns, sres.DbRef dr, + sres.ExternalDatabaseRelease gb_edr, sres.ExternalDatabase gb_ed + WHERE drns.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = gb_edr.external_database_release_id + AND gb_edr.external_database_id = gb_ed.external_database_id + AND gb_ed.name = 'GenBank' + GROUP BY drns.na_sequence_id + ) genbank ON sequence.na_sequence_id = genbank.na_sequence_id + LEFT JOIN + (SELECT edr.external_database_release_id, + edr.version AS database_version, ed.name AS database_name + FROM sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + WHERE edr.external_database_id = ed.external_database_id + ) db ON sequence.external_database_release_id = db.external_database_release_id + LEFT JOIN + (SELECT a_na_sequence_id as na_sequence_id, 1 as has_msa + FROM apidb.Synteny syn + GROUP BY a_na_sequence_id + ) msa ON sequence.na_sequence_id = msa.na_sequence_id + LEFT JOIN + (SELECT taxon_id, max(name) as name + FROM sres.TaxonName + WHERE name_class = 'scientific name' + GROUP BY taxon_id + ) tn ON sequence.taxon_id = tn.taxon_id + WHERE + sequence.taxon_id = taxon.taxon_id + AND sequence.sequence_ontology_id = so.ontology_term_id + AND sequence.taxon_id = :TAXON_ID + AND so.name IN ('random_sequence', 'chromosome', 'contig', 'supercontig','mitochondrial_chromosome','plastid_sequence','cloned_genomic','apicoplast_chromosome','maxicircle') + ORDER BY organism, source_id + + +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes_ix.psql b/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes_ix.psql new file mode 100644 index 000000000..de455dd7f --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSeqAttributes_ix.psql @@ -0,0 +1,11 @@ + create unique index pk_SeqAttr_ ON :SCHEMA.GenomicSeqAttributes (org_abbrev, lower(source_id), project_id) + ; + + create unique index SeqAttr_source_id ON :SCHEMA.GenomicSeqAttributes (org_abbrev, source_id) + ; + + create unique index SeqAttr_naseqid ON :SCHEMA.GenomicSeqAttributes (org_abbrev, na_sequence_id) + ; + + create unique index SeqAttr_taxsrc_id ON :SCHEMA.GenomicSeqAttributes (org_abbrev, taxon_id, source_id) + ; diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats.psql b/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats.psql new file mode 100644 index 000000000..b2a85443d --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats.psql @@ -0,0 +1,32 @@ +:CREATE_AND_POPULATE + WITH org_tot AS ( + SELECT + gs.organism + , min(gij.total_unique) as min_annot_score + , PERCENTILE_cont(0.005) within group (order by gij.total_unique asc) as perc005_annot_score + , PERCENTILE_cont(0.01) within group (order by gij.total_unique asc) as perc01_annot_score + , min(gij.percent_max) as min_annot_percent_max + , PERCENTILE_cont(0.0001) within group (order by gij.percent_max asc) as perc0001_annot_percent_max + , PERCENTILE_cont(0.0005) within group (order by gij.percent_max asc) as perc0005_annot_percent_max + , floor(max(gij.segment_end - gij.segment_start) * 1.25) as max_intron_length + FROM :SCHEMA.geneintronjunction gij + , :SCHEMA.genomicseqattributes gs + WHERE gs.na_sequence_id = gij.na_sequence_id + AND gij.annotated_intron = 'Yes' + AND gij.org_abbrev = ':ORG_ABBREV' + AND gs.org_abbrev = ':ORG_ABBREV' + GROUP BY gs.organism + ) + SELECT gs.na_sequence_id + , gs.source_id + , ot.* + , ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + FROM :SCHEMA.genomicseqattributes gs + , org_tot ot + WHERE gs.organism = ot.organism + AND gs.na_sequence_id in (SELECT DISTINCT ij.na_sequence_id FROM apidb.intronjunction ij, dots.nasequence s where ij.na_sequence_id = s.na_sequence_id and s.taxon_id = :TAXON_ID) + AND gs.org_abbrev = ':ORG_ABBREV' +:DECLARE_PARTITION + ; diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats_ix.psql b/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats_ix.psql new file mode 100644 index 000000000..73401d0fe --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSeqJunctionStats_ix.psql @@ -0,0 +1,2 @@ +create index GeneIntJuncStat_ix on :SCHEMA.GenomicSeqJunctionStats (na_sequence_id) + ; diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSequenceId.psql b/Model/lib/psql/webready/orgSpecific/GenomicSequenceId.psql new file mode 100644 index 000000000..489b45043 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSequenceId.psql @@ -0,0 +1,24 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT substr(id, 1, 60) as id, substr(sequence, 1, 60) AS sequence, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, CURRENT_TIMESTAMP as modification_date + FROM ( + SELECT ns.source_id as id, ns.source_id as sequence + FROM dots.NaSequence ns, sres.OntologyTerm oterm + WHERE ns.sequence_ontology_id = oterm.ontology_term_id + AND ns.taxon_id = :TAXON_ID + AND oterm.name in ('random_sequence', 'contig', 'supercontig', 'chromosome','mitochondrial_chromosome','plastid_sequence','cloned_genomic','apicoplast_chromosome','maxicircle','kinetoplast') + UNION + SELECT dr.primary_identifier AS id, ns.source_id AS sequence + FROM dots.NaSequence ns, dots.DbRefNaSequence drnf, + sres.DbRef dr, sres.ExternalDatabaseRelease edr, + sres.ExternalDatabase ed + WHERE dr.primary_identifier IS NOT NULL + AND ns.taxon_id = :TAXON_ID + AND ns.na_sequence_id = drnf.na_sequence_id + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + ) subquery1 +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSequenceId_ix.psql b/Model/lib/psql/webready/orgSpecific/GenomicSequenceId_ix.psql new file mode 100644 index 000000000..0cfa5601e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSequenceId_ix.psql @@ -0,0 +1,9 @@ + CREATE INDEX genSeqId_sequence_idx ON :SCHEMA.GenomicSequenceId (sequence, id) + ; + + CREATE INDEX GenSeqId_id_idx ON :SCHEMA.GenomicSequenceId (id, sequence) + ; + + CREATE INDEX GenSeqId_lowid_idx ON :SCHEMA.GenomicSequenceId (lower(id), sequence) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence.psql b/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence.psql new file mode 100644 index 000000000..bc1f31fb2 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence.psql @@ -0,0 +1,13 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + sa.source_id, + ns.sequence + FROM :SCHEMA.GenomicSeqAttributes sa, dots.NaSequence ns + WHERE sa.na_sequence_id = ns.na_sequence_id + and sa.org_abbrev = ':ORG_ABBREV' + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence_ix.psql b/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence_ix.psql new file mode 100644 index 000000000..dc3711740 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GenomicSequenceSequence_ix.psql @@ -0,0 +1,3 @@ + create index GenomicSeq_ix on :SCHEMA.GenomicSequenceSequence (source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/GoTermSummary.psql b/Model/lib/psql/webready/orgSpecific/GoTermSummary.psql new file mode 100644 index 000000000..db5e94653 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GoTermSummary.psql @@ -0,0 +1,57 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + ggt.gene_source_id, ggt.transcript_source_id, ggt.aa_sequence_id, + ggt.taxon_id, ggt.is_not, replace(ggt.go_id, '_', ':') as go_id, + ggt.go_term_id, ggt.ontology, + replace(ggt.go_term_name, '_',' ') as go_term_name, ggt.source, + ggt.evidence_code, + CASE ggt.evidence_code WHEN 'IEA' THEN 'Computed' ELSE 'Curated' END as evidence_category, + ggt.reference, ggt.evidence_code_parameter, + ol.min_depth as depth, + case + when gs.ontology_term_id is null then 0 + else 1 + end as is_go_slim + FROM :SCHEMA.GeneGoTerms ggt + LEFT JOIN :SCHEMA.OntologyLevels ol ON ggt.go_term_id = ol.ontology_term_id + LEFT JOIN ( + SELECT distinct ontology_term_id + FROM apidb.GoSubset + WHERE go_subset_term = 'goslim_generic' + ) gs ON ggt.go_term_id = gs.ontology_term_id + WHERE ggt.org_abbrev = ':ORG_ABBREV' + UNION + SELECT ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + ggt.gene_source_id, ggt.transcript_source_id, ggt.aa_sequence_id, + ggt.taxon_id, ggt.is_not, + replace (substr(ot.source_id, 1, 25),'_',':') as go_id, + ot.ontology_term_id as go_term_id, ggt.ontology, + replace (substr(ot.name, 1,250),'_',' ') as go_term_name, + ggt.source, ggt.evidence_code, + CASE ggt.evidence_code WHEN 'IEA' THEN 'Computed' ELSE 'Curated' END as evidence_category, + ggt.reference, + ggt.evidence_code_parameter, + ol.min_depth as depth, + case + when gs.ontology_term_id is null then 0 + else 1 + end as is_go_slim + FROM :SCHEMA.GeneGoTerms ggt, sres.OntologyRelationship orel, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr, + sres.OntologyTerm ot + LEFT JOIN :SCHEMA.OntologyLevels ol ON ot.ontology_term_id = ol.ontology_term_id + LEFT JOIN ( + SELECT distinct ontology_term_id + FROM apidb.GoSubset + WHERE go_subset_term = 'goslim_generic' + ) gs ON ot.ontology_term_id = gs.ontology_term_id + WHERE ggt.go_term_id = orel.subject_term_id + AND orel.object_term_id = ot.ontology_term_id + AND edr.external_database_release_id = ot.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ed.name ='GO_RSRC' + AND ggt.org_abbrev = ':ORG_ABBREV' + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/GoTermSummary_ix.psql b/Model/lib/psql/webready/orgSpecific/GoTermSummary_ix.psql new file mode 100644 index 000000000..2d0531bdf --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/GoTermSummary_ix.psql @@ -0,0 +1,8 @@ + create index GoTermSum_aaSeqId_idx ON :SCHEMA.GoTermSummary (aa_sequence_id, go_id, source) + ; + + create index GoTermSum_plugin_ix ON :SCHEMA.GoTermSummary + (ontology, gene_source_id, is_not, is_go_slim, + go_id, go_term_name, evidence_code, evidence_category) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/IntronSupportLevel.psql b/Model/lib/psql/webready/orgSpecific/IntronSupportLevel.psql new file mode 100644 index 000000000..1e8d38653 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/IntronSupportLevel.psql @@ -0,0 +1,53 @@ +:CREATE_AND_POPULATE +with annotatedJunctions AS ( + select gene_source_id + , count(*) as annotated_count + from ( + select distinct ta.gene_source_id + , il.na_sequence_id + , il.start_min + , il.end_max + , il.is_reversed + from apidb.intronlocation il + inner join :SCHEMA.transcriptattributes ta + ON il.parent_id = ta.na_feature_id + ) group by gene_source_id +), exptJunctions as ( + select gij.gene_source_id + , case when gij.total_unique >= stats.perc01_annot_score and gij.percent_max >= 2 + then 'High' + else 'Low' + end as conf + FROM :SCHEMA.GeneIntronJunction gij + JOIN :SCHEMA.GenomicSeqJunctionStats stats + ON gij.na_sequence_id = stats.na_sequence_id + where gij.annotated_intron = 'Yes' -- we only care about the annotated ones + and gij.org_abbrev = ':ORG_ABBREV' + and stats.org_abbrev = ':ORG_ABBREV' +), exptJunctionCounts as ( + select gene_source_id + , conf + , count(*) as intron_count + from exptJunctions + group by gene_source_id + , conf +) +select aj.gene_source_id + , CASE WHEN aj.annotated_count = ejc.intron_count + THEN 'All-high' + ELSE 'Any-high' + END as string_value + from annotatedJunctions aj + left join exptJunctionCounts ejc on aj.gene_source_id = ejc.gene_source_id + where ejc.conf = 'High' + AND ejc.intron_count > 0 +select aj.gene_source_id + , CASE WHEN aj.annotated_count = sum(ejc.intron_count) + THEN 'All-low' + ELSE 'Any-low' + END as string_value + from annotatedJunctions aj + left join exptJunctionCounts ejc on aj.gene_source_id = ejc.gene_source_id +group by aj.gene_source_id, aj.annotated_count +having sum(ejc.intron_count) > 0 +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/IntronSupportLevel_ix.psql b/Model/lib/psql/webready/orgSpecific/IntronSupportLevel_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/IntronUtrCoords.psql b/Model/lib/psql/webready/orgSpecific/IntronUtrCoords.psql new file mode 100644 index 000000000..cfed91dac --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/IntronUtrCoords.psql @@ -0,0 +1,32 @@ +:CREATE_AND_POPULATE + SELECT na_feature_id, source_id, + '[' || regexp_replace(string_agg(text,',' ORDER BY start_min), '.quot;', '"' ) || ']' AS gen_rel_intron_utr_coords, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM ( + SELECT na_feature_id, source_id, start_min, + '["' || feature_type || '",' || start_min || ',' || end_max || ']' AS text + FROM ( + SELECT fl.feature_type, tl.na_feature_id, tl.feature_source_id AS source_id, + CASE + WHEN tl.is_reversed = 1 + THEN tl.end_max - fl.end_max + 1 + ELSE fl.start_min - tl.start_min + 1 + END AS start_min, + CASE + WHEN tl.is_reversed = 1 + THEN tl.end_max - fl.start_min + 1 + ELSE fl.end_max - tl.start_min + 1 + END AS end_max + FROM + apidb.TranscriptLocation tl, apidb.FeatureLocation fl, dots.nasequence nas + WHERE + tl.na_feature_id = fl.parent_id + AND fl.na_sequence_id = nas.na_sequence_id + AND nas.taxon_id = :TAXON_ID + AND fl.feature_type in('UTR', 'Intron') + AND tl.is_top_level = 1 + AND fl.is_top_level = 1 + ) t1 + ) t2 + GROUP BY na_feature_id, source_id +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/IntronUtrCoords_ix.psql b/Model/lib/psql/webready/orgSpecific/IntronUtrCoords_ix.psql new file mode 100644 index 000000000..78bff1d74 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/IntronUtrCoords_ix.psql @@ -0,0 +1,8 @@ + CREATE INDEX iuc_srcid_ix + ON :SCHEMA.IntronUtrCoords (source_id, na_feature_id) + ; + + CREATE INDEX iuc_nfid_ix + ON :SCHEMA.IntronUtrCoords (na_feature_id, source_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation.psql b/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation.psql new file mode 100644 index 000000000..35b6c6318 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation.psql @@ -0,0 +1,23 @@ +/* + The tuning table this was based on used a loop over the nasequences. We can add that back if slow but I am: + 1. using geneattributes which will make this faster than joining genefeature to nalocation and grouping + 2. using input tables here that are already partitioned +*/ +:CREATE_AND_POPULATE +SELECT ga.na_sequence_id + , ga.start_min + , ga.is_reversed + , ga.end_max + , ga.na_feature_id + , ga.source_id, + round(sum(nafe.value)::NUMERIC,2) as total_expression + FROM :SCHEMA.GeneAttributes ga + , :SCHEMA.JunctionToCoverageProfileMapping je + , results.nafeatureexpression nafe + WHERE ga.na_feature_id = nafe.na_feature_id + AND nafe.protocol_app_node_id = je.exp_pan_id + AND ga.org_abbrev = ':ORG_ABBREV' + AND je.org_abbrev = ':ORG_ABBREV' + GROUP BY ga.na_sequence_id,l.start_min,l.is_reversed,l.end_max,ga.na_feature_id,ga.source_id +:DECLARE_PARTITION + ; diff --git a/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation_id.psql b/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation_id.psql new file mode 100644 index 000000000..a2d30c4cb --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/JunctionGeneLocation_id.psql @@ -0,0 +1,5 @@ +create index gnidloc_nafid_ix on :SCHEMA.JunctionGeneLocation (na_feature_id) + ; + +CREATE INDEX gnattidloc_pk_ix ON :SCHEMA.JunctionGeneLocation (na_sequence_id,start_min,is_reversed,end_max,na_feature_id,source_id,total_expression) + ; diff --git a/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping.psql b/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping.psql new file mode 100644 index 000000000..462937437 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping.psql @@ -0,0 +1,143 @@ +drop table if exists :SCHEMA.:ORG_ABBREVJunctionExpressionTmp; + +drop table if exists :SCHEMA.:ORG_ABBREVJunctionMappingStatsTmp; + + +/* + Join junctions to Expression values and get multiplier and decide on strand switching +*/ +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVJunctionExpressionTmp AS + WITH stats AS ( + SELECT protocol_app_node_id + --, 'total' as type + --, count(*) as total_junctions + --, sum(unique_reads) as total_reads + , round(1000000/sum(unique_reads),4) as multiplier + FROM apidb.IntronJunction ij, dots.nasequence s + WHERE unique_reads >= 1 + AND ij.na_sequence_id = s.na_sequence_id + AND s.taxon_id = 1802185 + GROUP BY protocol_app_node_id + ), ij AS ( + SELECT pj.output_pan_id as junctions_pan_id + , p.output_pan_id as expression_pan_id + , avg(nafe.value) as avg_value,pan.name as exp_name + , regexp_replace(pan.name, ' \[htseq-union.*', '') as sample_name + FROM :SCHEMA.panio p + , :SCHEMA:panio pj + , results.nafeatureexpression nafe + , study.protocolappnode pan + WHERE pj.output_pan_id in (select distinct protocol_app_node_id from stats) + AND pj.input_pan_id = p.input_pan_id + AND p.output_pan_id = pan.protocol_app_node_id + AND pan.name like '%tpm - unique%' -- NOTE: probably better to use raw counts here?? + AND p.output_pan_id = nafe.protocol_app_node_id + AND p.org_abbrev = ':ORG_ABBREV' + AND pj.org_abbrev = ':ORG_ABBREV' + GROUP BY pj.output_pan_id, p.output_pan_id, pan.name + ) , part AS ( + SELECT + ij.junctions_pan_id + , ij.avg_value + , stats.multiplier + , max(ij.expression_pan_id) OVER w as max_exp_pan_id + , max(ij.sample_name) OVER w as max_sample_Name + , max(ij.exp_name) OVER w as max_exp_name + FROM ij, stats + WHERE ij.junctions_pan_id = stats.protocol_app_node_id + WINDOW w AS (partition by ij.junctions_pan_id, stats.multiplier, ij.avg_value) + ) + SELECT DISTINCT junctions_pan_id + , first_value(max_exp_pan_id) OVER w1 as exp_pan_id + , first_value(max_sample_name) OVER w1 as sample_name + , CASE WHEN first_value(max_exp_name) OVER w1 LIKE '%secondstrand%' THEN 'true' ELSE 'false' END as switch_strands + , multiplier + FROM part + WINDOW w1 AS (PARTITION BY junctions_pan_id, multiplier ORDER BY avg_value DESC) + ; + +create index :ORG_ABBREVjunexpgijtmp_ix on :SCHEMA.:ORG_ABBREVJunctionExpressionTmp(junctions_pan_id,exp_pan_id) + ; + + +/* + Grab general mapping stats +*/ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVJunctionMappingStatsTmp ( + junctions_pan_id, + read_length, + mapped_reads, + avg_mapping_coverage, + num_replicates +) AS + SELECT junctions_pan_id + , round(avg(average_read_length - 2),1) as read_length + , round(avg(number_mapped_reads),1) as mapped_reads + , round(avg(avg_mapping_coverage) * ((avg(average_read_length) - 2) / avg(average_read_length)),2) as avg_mapping_coverage + , count(*) as num_replicates + FROM (SELECT je.junctions_pan_id + , ca.value::NUMERIC as average_read_length + , cb.value::NUMERIC as number_mapped_reads + , cc.value::NUMERIC as avg_mapping_coverage + FROM :SCHEMA.:ORG_ABBREVJunctionExpressionTmp je + , :SCHEMA.PANIO ioa + , STUDY.CHARACTERISTIC ca + , STUDY.CHARACTERISTIC cb + , sres.ontologyterm ota + , sres.ontologyterm otb, + STUDY.CHARACTERISTIC cc, sres.ontologyterm otc + WHERE je.junctions_pan_id = ioa.output_pan_id + AND ioa.input_pan_id = ca.protocol_app_node_id + AND ca.value is not null + AND ca.QUALIFIER_ID = ota.ONTOLOGY_TERM_ID + AND ota.source_id IN ('EuPathUserDefined_00504','EUPATH_0000457') -- '%average read length' + AND ca.protocol_app_node_id = cb.protocol_app_node_id + AND cb.value is not null + AND cb.QUALIFIER_ID = otb.ONTOLOGY_TERM_ID + AND otb.source_id IN ('EuPathUserDefined_00503','EUPATH_0000456') -- '%number mapped reads' + AND ca.protocol_app_node_id = cc.protocol_app_node_id + AND cc.value is not null + AND cc.QUALIFIER_ID = otc.ONTOLOGY_TERM_ID + AND otc.source_id IN ('EuPathUserDefined_00501','GENEPIO_0000092') -- '%average mapping coverage' + AND ioa.org_abbrev = ':ORG_ABBREV' + ) t + GROUP by junctions_pan_id +; + + +CREATE INDEX :ORG_ABBREVmpstats_pk_ix on :SCHEMA.:ORG_ABBREVJunctionMappingStatsTmp + (junctions_pan_id,read_length,mapped_reads,avg_mapping_coverage,num_replicates) +; + + + +/* + This is the main table +*/ +:CREATE_AND_POPULATE + SELECT DISTINCT je.sample_name + , je.junctions_pan_id + , je.exp_pan_id + , ms.read_length + , ms.mapped_reads + , ms.avg_mapping_coverage + , ms.num_replicates + , je.switch_strands + , je.multiplier + , ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + + FROM junexpgijtmp je + , mappingstatsgijtmp ms + WHERE je.junctions_pan_id = ms.junctions_pan_id +:DECLARE_PARTITION + ; + + +drop table :SCHEMA.:ORG_ABBREVJunctionExpressionTmp +; + +drop table :SCHEMA.:ORG_ABBREVJunctionMappingStatsTmp +; diff --git a/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping_ix.psql b/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping_ix.psql new file mode 100644 index 000000000..c724f94d6 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/JunctionToCoverageProfileMapping_ix.psql @@ -0,0 +1,2 @@ +create index JunctionCovMap_ix on :SCHEMA.JunctionToCoverageProfileMapping(junctions_pan_id,exp_pan_id) + ; diff --git a/Model/lib/psql/webready/orgSpecific/OrganismAbbreviation.psql b/Model/lib/psql/webready/orgSpecific/OrganismAbbreviation.psql new file mode 100644 index 000000000..bcdedad21 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/OrganismAbbreviation.psql @@ -0,0 +1,10 @@ +:CREATE_AND_POPULATE + select tn.name as organism, o.name_for_filenames, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, CURRENT_TIMESTAMP as modification_date + from apidb.Organism o, sres.TaxonName tn + where o.taxon_id = tn.taxon_id + and tn.name_class = 'scientific name' + and o.taxon_id = :TAXON_ID + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/OrganismAbbreviation_ix.psql b/Model/lib/psql/webready/orgSpecific/OrganismAbbreviation_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/OrganismSelectTaxonRank.psql b/Model/lib/psql/webready/orgSpecific/OrganismSelectTaxonRank.psql new file mode 100644 index 000000000..34c531b5c --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/OrganismSelectTaxonRank.psql @@ -0,0 +1,49 @@ +:CREATE_AND_POPULATE + WITH organism_rank AS ( + SELECT tn1.name as organism, o.public_abbrev, tn2.name as parent_organism, + case when tn2.name = 'Oomycetes' then 'class' else r.rank end as rank + FROM ( + WITH RECURSIVE cte AS( + SELECT taxon_id input, taxon_id, rank, parent_id + FROM sres.taxon + WHERE taxon_id IN (SELECT taxon_id FROM apidb.organism WHERE taxon_id = :TAXON_ID and is_annotated_genome = 1) + UNION + SELECT cte.input, t.taxon_id, t.rank, t.parent_id + FROM sres.taxon t, cte + WHERE cte.parent_id = t.taxon_id + ) + SELECT input, taxon_id, rank + FROM cte + ) r + , sres.taxonname tn1 + , sres.taxonname tn2 + , apidb.organism o + WHERE r.input = tn1.taxon_id + AND r.taxon_id = tn2.taxon_id + AND tn1.name_class = 'scientific name' + AND tn2.name_class = 'scientific name' + AND (r.rank in ('phylum', 'genus', 'species', 'kingdom', 'class') or (r.rank = 'no rank' and tn2.name = 'Oomycetes')) + AND tn1.taxon_id = o.taxon_id + ) + SELECT + ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , organisms.organism + , organisms.public_abbrev + , coalesce(phylum.parent_organism, 'N/A') as phylum + , coalesce(genus.parent_organism, 'N/A') as genus + , coalesce(species.parent_organism, 'N/A') as species + , coalesce(kingdom.parent_organism, 'N/A') as kingdom + , coalesce(class.parent_organism, 'N/A') as class + FROM + (select distinct organism, public_abbrev from organism_rank) organisms + LEFT JOIN ( select * from organism_rank where rank= 'phylum') phylum ON organisms.organism = phylum.organism + LEFT JOIN ( select * from organism_rank where rank= 'genus') genus ON organisms.organism = genus.organism + LEFT JOIN ( select * from organism_rank where rank= 'species') species ON organisms.organism = species.organism + LEFT JOIN ( select * from organism_rank where rank= 'kingdom') kingdom ON organisms.organism = kingdom.organism + LEFT JOIN ( select * from organism_rank where rank= 'class') class ON organisms.organism = class.organism + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/OrganismSelectTaxonRank_ix.psql b/Model/lib/psql/webready/orgSpecific/OrganismSelectTaxonRank_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/PANExtDbRls.psql b/Model/lib/psql/webready/orgSpecific/PANExtDbRls.psql new file mode 100644 index 000000000..524ef017c --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PANExtDbRls.psql @@ -0,0 +1,42 @@ +:CREATE_AND_POPULATE + + SELECT distinct protocol_app_node_id as pan_id, external_database_release_id, name as dataset_name, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM ( + SELECT + sl.protocol_app_node_id + , s.external_database_release_id + , d.name + FROM + study.nodeSet s + , study.nodeNodeSet sl + , sres.externaldatabaserelease r + , sres.externaldatabase d + , apidb.datasource ds + WHERE + s.external_database_release_id = r.external_database_release_id + and r.external_database_id = d.external_database_id + and s.node_set_id = sl.node_set_id + and s.external_database_release_id is not null + and d.name = ds.external_database_name + and r.version = ds.version + and ds.taxon_id = :TAXON_ID + UNION + SELECT pan.protocol_app_node_id + , pan.external_database_release_id + , d.name + FROM study.protocolappnode pan + , sres.externaldatabaserelease r + , sres.externaldatabase d + , apidb.datasource ds + WHERE + pan.external_database_release_id = r.external_database_release_id + and r.external_database_id = d.external_database_id + and pan.external_database_release_id is not null + and d.name = ds.external_database_name + and r.version = ds.version + and ds.taxon_id = :TAXON_ID + ) t + ORDER BY external_database_release_id, protocol_app_node_id + +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/PANIO.psql b/Model/lib/psql/webready/orgSpecific/PANIO.psql new file mode 100644 index 000000000..b9b918779 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PANIO.psql @@ -0,0 +1,24 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT io.*, ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM ( + SELECT i.protocol_app_node_id input_pan_id, pa.protocol_app_id, + o.protocol_app_node_id output_pan_id, + in_type.source_id as input_pan_type_source_id, + --in_type.name as input_pan_type, + in_type.ontology_term_id as input_pan_type_id, + out_type.source_id as output_pan_type_source_id, + --out_type.name as output_pan_type, + out_type.ontology_term_id as output_pan_type_id + FROM :SCHEMA.panextdbrls panExtDbRls + INNER JOIN study.Input i on i.protocol_app_node_id = panExtDbRls.pan_id + INNER JOIN study.ProtocolApp pa on i.protocol_app_id = pa.protocol_app_id + INNER JOIN study.Output o on o.protocol_app_id = pa.protocol_app_id + INNER JOIN study.ProtocolAppNode in_pan on i.protocol_app_node_id = in_pan.protocol_app_node_id + INNER JOIN study.ProtocolAppNode out_pan on o.protocol_app_node_id = out_pan.protocol_app_node_id + LEFT JOIN sres.OntologyTerm out_type ON out_pan.type_id = out_type.ontology_term_id + LEFT JOIN sres.OntologyTerm in_type ON in_pan.type_id = in_type.ontology_term_id + WHERE panExtDbRls.org_abbrev = ':ORG_ABBREV' + ) io + ORDER BY io.input_pan_id, io.output_pan_id + +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/PANIO_ix.psql b/Model/lib/psql/webready/orgSpecific/PANIO_ix.psql new file mode 100644 index 000000000..f809a0b72 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PANIO_ix.psql @@ -0,0 +1,29 @@ + create index painio2_iix on :SCHEMA.PANIO + (input_pan_id, output_pan_id, protocol_app_id, input_pan_type_source_id, output_pan_type_source_id) + + + ; + + + + create index painio2_oix on :SCHEMA.PANIO + (output_pan_id, input_pan_id, protocol_app_id, input_pan_type_source_id, output_pan_type_source_id) + + + ; + + + + create index painio2_otypeix on :SCHEMA.PANIO + (output_pan_type_source_id, input_pan_type_source_id, output_pan_id, input_pan_id, protocol_app_id) + + + ; + + + + create index painio2_itypeix on :SCHEMA.PANIO + (input_pan_type_source_id, output_pan_type_source_id, input_pan_id, output_pan_id, protocol_app_id) + + + ; diff --git a/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable.psql b/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable.psql new file mode 100644 index 000000000..fbac42a0d --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable.psql @@ -0,0 +1,28 @@ +DROP TABLE IF EXISTS :SCHEMA.ORG_ABBREVPathwaysGeneTableTmp; + + +/* ATTENTION: This table is empty. We will populate it in the comparative graph as loading depends on TranscriptPathway. +We are creating it here so that it is partitioned */ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVPathwaysGeneTableTmp ( + gene_source_id VARCHAR(80), + pathway_source_id VARCHAR(50), + pathway_name VARCHAR(150), + reactions NUMERIC(8), + enzyme VARCHAR(20), + expasy_url TEXT, + pathway_source TEXT, + exact_match VARCHAR(8), + + --for partitioning + project_id VARCHAR(20), + org_abbrev VARCHAR(20), + modification_date timestamp +); + + +:CREATE_AND_POPULATE +SELECT pgt.* FROM :SCHEMA.:ORG_ABBREVPathwaysGeneTableTmp pgt +:DECLARE_PARTITION; + +DROP TABLE :SCHEMA.:ORG_ABBREVPathwaysGeneTableTmp; diff --git a/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable_ix.psql b/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable_ix.psql new file mode 100644 index 000000000..f69349df0 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PathwaysGeneTable_ix.psql @@ -0,0 +1,6 @@ + create index pgt_ix on :SCHEMA.PathwaysGeneTable + (gene_source_id, project_id, pathway_source_id, pathway_name, + reactions, enzyme, expasy_url, pathway_source, exact_match) + + ; + diff --git a/Model/lib/psql/webready/orgSpecific/PdbSimilarity.psql b/Model/lib/psql/webready/orgSpecific/PdbSimilarity.psql new file mode 100644 index 000000000..b3de1cfe9 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/PdbSimilarity.psql @@ -0,0 +1,33 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + ta.source_id, eas.source_id AS pdb_chain, + substr(eas.description, 1, 100) AS pdb_title, + substr(eas.source_id + , 1 + , length(eas.source_id) - ( + CASE strpos(reverse(eas.source_id), '_') + WHEN 0 THEN length(eas.source_id) + ELSE strpos(reverse(eas.source_id), '_') END + ) + ) AS pdb_id, + s.evalue_mant, s.evalue_exp, + s.pident as percent_identity, + ROUND((s.length / ta.protein_length) * 100) AS percent_plasmo_coverage, + SUBSTR(tn.name, 1, 100) AS taxon, + eas.taxon_id as pdb_taxon_id, ta.taxon_id as gene_taxon_id + FROM apidb.PdbSimilarity s, + apiDB.ProteinDataBank eas, + sres.TaxonName tn, + :SCHEMA.TranscriptAttributes ta + WHERE ta.aa_sequence_id = s.aa_sequence_id + AND s.pident = eas.protein_data_bank_id + and tn.name_class = 'scientific name' + AND eas.taxon_id = tn.taxon_id + and ta.org_abbrev = ':ORG_ABBREV' + ORDER BY ta.source_id, eas.source_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/PdbSimilarity_ix.psql b/Model/lib/psql/webready/orgSpecific/PdbSimilarity_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/ProteinAttributes.psql b/Model/lib/psql/webready/orgSpecific/ProteinAttributes.psql new file mode 100644 index 000000000..14be4666d --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ProteinAttributes.psql @@ -0,0 +1,220 @@ + DROP TABLE IF EXISTS :SCHEMA.:ORG_ABBREVGoTermList_tmp; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVGoTermList_tmp AS + SELECT aa_sequence_id, ontology, source, + string_agg(go_term_name, ';' ORDER BY go_term_name) AS go_terms, + string_agg(go_id, ';' ORDER BY go_term_name) AS go_ids + FROM ( + SELECT aa_sequence_id, ontology, + CASE evidence_code WHEN 'IEA' THEN 'predicted' ELSE 'annotated' END AS source, go_term_name, go_id + FROM :SCHEMA.GeneGoTerms + WHERE org_abbrev = ':ORG_ABBREV' + + ) t + GROUP BY aa_sequence_id, ontology, source + + ; + + DROP TABLE IF EXISTS :SCHEMA.:ORG_ABBREVProteinGoAttributes_tmp; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVProteinGoAttributes_tmp AS + SELECT DISTINCT gts.aa_sequence_id, + substr(annotated_go_component.go_terms, 1, 300) AS annotated_go_component, + substr(annotated_go_function.go_terms, 1, 300) AS annotated_go_function, + substr(annotated_go_process.go_terms, 1, 300) AS annotated_go_process, + substr(predicted_go_component.go_terms, 1, 300) AS predicted_go_component, + substr(predicted_go_function.go_terms, 1, 300) AS predicted_go_function, + substr(predicted_go_process.go_terms, 1, 300) AS predicted_go_process, + substr(annotated_go_component.go_ids, 1, 300) AS annotated_go_id_component, + substr(annotated_go_function.go_ids, 1, 300) AS annotated_go_id_function, + substr(annotated_go_process.go_ids, 1, 300) AS annotated_go_id_process, + substr(predicted_go_component.go_ids, 1, 300) AS predicted_go_id_component, + substr(predicted_go_function.go_ids, 1, 300) AS predicted_go_id_function, + substr(predicted_go_process.go_ids, 1, 300) AS predicted_go_id_process + FROM + (SELECT DISTINCT aa_sequence_id FROM :SCHEMA.GoTermSummary where org_abbrev = ':ORG_ABBREV') gts + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'annotated' AND ontology = 'Cellular Component' + ) annotated_go_component ON + gts.aa_sequence_id = annotated_go_component.aa_sequence_id + AND 'annotated' = annotated_go_component.source + AND 'Cellular Component' = annotated_go_component.ontology + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'annotated' AND ontology = 'Molecular Function' + ) annotated_go_function ON + gts.aa_sequence_id = annotated_go_function.aa_sequence_id + AND 'annotated' = annotated_go_function.source + AND 'Molecular Function' = annotated_go_function.ontology + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'annotated' AND ontology = 'Biological Process' + ) annotated_go_process ON + gts.aa_sequence_id = annotated_go_process.aa_sequence_id + AND 'annotated' = annotated_go_process.source + AND 'Biological Process' = annotated_go_process.ontology + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'predicted' AND ontology = 'Cellular Component' + ) predicted_go_component ON + gts.aa_sequence_id = predicted_go_component.aa_sequence_id + AND 'predicted' = predicted_go_component.source + AND 'Cellular Component' = predicted_go_component.ontology + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'predicted' AND ontology = 'Molecular Function' + ) predicted_go_function ON + gts.aa_sequence_id = predicted_go_function.aa_sequence_id + AND 'predicted' = predicted_go_function.source + AND 'Molecular Function' = predicted_go_function.ontology + LEFT JOIN ( + SELECT * FROM :SCHEMA.:ORG_ABBREVGoTermList_tmp + WHERE source = 'predicted' AND ontology = 'Biological Process' + ) predicted_go_process ON + gts.aa_sequence_id = predicted_go_process.aa_sequence_id + AND 'predicted' = predicted_go_process.source + AND 'Biological Process' = predicted_go_process.ontology + + ; + + + + create index ProteinGoAttr_aaSequenceId_:ORG_ABBREV ON :SCHEMA.:ORG_ABBREVProteinGoAttributes_tmp (aa_sequence_id) + + + ; + + DROP TABLE IF EXISTS :SCHEMA.:ORG_ABBREVProteinAttrsEc_tmp; + + CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVProteinAttrsEc_tmp AS + SELECT aa_sequence_id, SUBSTR(string_agg(ec_number, ';' order by ec_number),1, 300) AS ec_numbers + FROM (SELECT DISTINCT asec.aa_sequence_id, + ec.ec_number || ' (' || ec.description || ')' AS ec_number + FROM dots.AaSequenceEnzymeClass asec, sres.EnzymeClass ec, dots.aasequence seq + WHERE ec.enzyme_class_id = asec.enzyme_class_id + AND seq.aa_sequence_id = asec.aa_sequence_id + AND seq.taxon_id = :TAXON_ID + AND NOT asec.evidence_code = 'OrthoMCLDerived' + ) t + GROUP BY aa_sequence_id + + ; + + DROP TABLE IF EXISTS :SCHEMA.:ORG_ABBREVProteinAttrsEcDerived_tmp; + + --TODO: these rows will not exist in org specific land + -- CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVProteinAttrsEcDerived_tmp AS + -- SELECT aa_sequence_id, SUBSTR(string_agg(ec_number, ';' order by ec_number),1, 300) AS ec_numbers_derived + -- FROM (SELECT DISTINCT asec.aa_sequence_id, + -- ec.ec_number || ' (' || ec.description || ')' AS ec_number + -- FROM dots.AaSequenceEnzymeClass asec, sres.EnzymeClass ec, dots.aasequence seq + -- WHERE ec.enzyme_class_id = asec.enzyme_class_id + -- AND seq.aa_sequence_id = asec.aa_sequence_id + -- AND seq.taxon_id = :TAXON_ID + -- AND asec.evidence_code = 'OrthoMCLDerived' + -- ) t + -- GROUP BY aa_sequence_id + + -- ; + + +-- TODO: Filter the subqueries or break into tmp tables for performance +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + tas.source_id, tas.aa_sequence_id, + t.source_id as transcript_source_id, + gf.source_id as gene_source_id, + cdsl.na_sequence_id as na_sequence_id, + cdsl.is_reversed, + cdsl.start_min as cds_start, + cdsl.end_max as cds_end, + (taf.translation_stop - taf.translation_start) + 1 AS cds_length, + length(tas.sequence) AS protein_length, + coalesce(transmembrane.tm_domains, 0) AS tm_count, + tas.molecular_weight, + asa.min_molecular_weight, asa.max_molecular_weight, + asa.isoelectric_point, asa.hydropathicity_gravy_score, + asa.aromaticity_score, + SUBSTR(sigp.peptide_sequence, 1, 200) as signalp_peptide, + ec_numbers, + --ec_numbers_derived, + go.annotated_go_component, + go.annotated_go_function, + go.annotated_go_process, + go.predicted_go_component, + go.predicted_go_function, + go.predicted_go_process, + go.annotated_go_id_component, + go.annotated_go_id_function, + go.annotated_go_id_process, + go.predicted_go_id_component, + go.predicted_go_id_function, + go.predicted_go_id_process, + SUBSTR(coalesce(rt1.anticodon, rt2.anticodon), 1, 3) AS anticodon, + 0 AS has_seqedit, + row_number() over (partition by t.source_id order by tas.length desc) as rank_in_transcript, + uniprot.uniprot_ids + FROM + dots.Transcript t + INNER JOIN dots.GeneFeature gf ON gf.na_feature_id = t.parent_id + INNER JOIN dots.nasequence nas ON gf.na_sequence_id = nas.na_sequence_id AND nas.taxon_id = :TAXON_ID + INNER JOIN dots.TranslatedAaFeature taf ON t.na_feature_id = taf.na_feature_id + INNER JOIN dots.TranslatedAaSequence tas ON taf.aa_sequence_id = tas.aa_sequence_id + LEFT JOIN dots.RnaType rt2 ON gf.na_feature_id = rt2.parent_id + LEFT JOIN dots.RnaType rt1 ON t.na_feature_id = rt1.parent_id + LEFT JOIN apidb.AaSequenceAttribute asa ON taf.aa_sequence_id = asa.aa_sequence_id + LEFT JOIN :SCHEMA.:ORG_ABBREVProteinGoAttributes_tmp go ON tas.aa_sequence_id = go.aa_sequence_id + LEFT JOIN ( + SELECT aa_sequence_id, string_agg(peptide_sequence, ', ') peptide_sequence + FROM (SELECT DISTINCT aa_sequence_id, peptide_sequence FROM :SCHEMA.SignalPeptideDomains where org_abbrev = ':ORG_ABBREV') t + GROUP BY aa_sequence_id + ) sigp ON tas.aa_sequence_id = sigp.aa_sequence_id + LEFT JOIN ( + SELECT protein_source_id, na_sequence_id, is_reversed, + MIN(start_min) AS start_min, MAX(end_max) AS end_max + FROM apidb.CdsLocation WHERE is_top_level=1 + GROUP BY protein_source_id, na_sequence_id, is_reversed + ) cdsl ON tas.source_id = cdsl.protein_source_id + LEFT JOIN ( + SELECT aa_sequence_id, max(tm_domains) AS tm_domains + FROM (SELECT tmaf.aa_sequence_id, COUNT(*) AS tm_domains + FROM dots.TransmembraneAaFeature tmaf, dots.AaLocation al + WHERE tmaf.aa_feature_id = al.aa_feature_id + GROUP BY tmaf.aa_sequence_id) tms + GROUP BY tms.aa_sequence_id + ) transmembrane ON tas.aa_sequence_id = transmembrane.aa_sequence_id + LEFT JOIN :SCHEMA.:ORG_ABBREVProteinAttrsEc_tmp ec ON tas.aa_sequence_id = ec.aa_sequence_id + --LEFT JOIN :SCHEMA.:ORG_ABBREVProteinAttrsEcDerived_tmp ecDerived ON tas.aa_sequence_id = ecDerived.aa_sequence_id + LEFT JOIN ( + SELECT af.aa_sequence_id, + string_agg(dbref.primary_identifier, ',' order by dbref.primary_identifier) as uniprot_ids + FROM sres.ExternalDatabase d, sres.ExternalDatabaseRelease r, + sres.DbRef, dots.DbRefAaFeature daf, dots.AaFeature af + WHERE d.name like 'Uniprot%' + AND d.external_database_id = r.external_database_id + AND r.external_database_release_id = dbref.external_database_release_id + AND dbref.db_ref_id = daf.db_ref_id + AND daf.aa_feature_id = af.aa_feature_id + GROUP BY af.aa_sequence_id + ) uniprot ON tas.aa_sequence_id = uniprot.aa_sequence_id + ORDER BY tas.source_id + + +:DECLARE_PARTITION; + + + + update :SCHEMA.ProteinAttributes gaup + set has_seqedit = 1 + where source_id in (select source_id from apidb.seqedit) + and org_abbrev = ':ORG_ABBREV' + + ; + +drop table :SCHEMA.:ORG_ABBREVGoTermList_tmp; +drop table :SCHEMA.:ORG_ABBREVProteinGoAttributes_tmp; +drop table :SCHEMA.:ORG_ABBREVProteinAttrsEc_tmp; +--drop table :SCHEMA.:ORG_ABBREVProteinAttrsEcDerived_tmp; diff --git a/Model/lib/psql/webready/orgSpecific/ProteinAttributes_ix.psql b/Model/lib/psql/webready/orgSpecific/ProteinAttributes_ix.psql new file mode 100644 index 000000000..3abf03219 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ProteinAttributes_ix.psql @@ -0,0 +1,6 @@ + CREATE INDEX PA_sourceId ON :SCHEMA.ProteinAttributes (source_id) + ; + + CREATE INDEX PA_aaSequenceId ON :SCHEMA.ProteinAttributes (aa_sequence_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/ProteinSequence.psql b/Model/lib/psql/webready/orgSpecific/ProteinSequence.psql new file mode 100644 index 000000000..0d2f58218 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ProteinSequence.psql @@ -0,0 +1,9 @@ +:CREATE_AND_POPULATE + WITH pAttr AS ( + SELECT distinct source_id, aa_sequence_id + FROM :SCHEMA.ProteinAttributes where org_abbrev = ':ORG_ABBREV') + SELECT pa.source_id, tas.sequence, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM pAttr pa, dots.TranslatedAaSequence tas + WHERE pa.aa_sequence_id = tas.aa_sequence_id +:DECLARE_PARTITION; diff --git a/Model/lib/psql/webready/orgSpecific/ProteinSequence_ix.psql b/Model/lib/psql/webready/orgSpecific/ProteinSequence_ix.psql new file mode 100644 index 000000000..c91f81d84 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/ProteinSequence_ix.psql @@ -0,0 +1,3 @@ + create index ProtSeq_ix on :SCHEMA.:ORG_ABBREVProteinSequence (source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/RnaSeqStats.psql b/Model/lib/psql/webready/orgSpecific/RnaSeqStats.psql new file mode 100644 index 000000000..8945fa2c9 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/RnaSeqStats.psql @@ -0,0 +1,56 @@ +:CREATE_AND_POPULATE + select study_id, study_name, dataset_name, taxon_id, round(avg(num_reads::integer),0) as avg_unique_reads, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + from (select sl.node_set_id as study_id + , s.name || '[' || s.node_type || ']' as study_name + , ed.name as dataset_name + , ds.taxon_id + , c.value as num_reads + from apidb.Datasource ds + , sres.ExternalDatabase ed + , sres.ExternalDatabaseRelease edr + , study.nodeSet s, study.nodeNodeSet sl + , study.ProtocolAppNode pan + , study.Characteristic c + , sres.OntologyTerm ot + where ds.external_database_name = ed.name + and ed.external_database_id = edr.external_database_id + and edr.external_database_release_id = s.external_database_release_id + and sl.node_set_id = s.node_set_id + and sl.protocol_app_node_id = pan.protocol_app_node_id + and pan.protocol_app_node_id = c.protocol_app_node_id + and c.qualifier_id = ot.ontology_term_id + and (ot.source_id = 'EUPATH_0000460' or ot.source_id = 'EuPathUserDefined_00507') + and ds.taxon_id = :TAXON_ID + ) subquery1 + group by study_id, study_name, dataset_name, taxon_id + union + select study_id, study_name, dataset_name, taxon_id, round(2*avg(num_reads::integer),0) as avg_unique_reads, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + from (select sl.node_set_id as study_id + , s.name || '[' || s.node_type || ']' as study_name + , ed.name as dataset_name + , ds.taxon_id + , c.value as num_reads + from apidb.Datasource ds + , sres.ExternalDatabase ed + , sres.ExternalDatabaseRelease edr + , study.nodeSet s, study.nodeNodeSet sl + , study.ProtocolAppNode pan + , study.Characteristic c + , sres.OntologyTerm ot + where ds.external_database_name = ed.name + and ed.external_database_id = edr.external_database_id + and edr.external_database_release_id = s.external_database_release_id + and sl.node_set_id = s.node_set_id + and sl.protocol_app_node_id = pan.protocol_app_node_id + and pan.protocol_app_node_id = c.protocol_app_node_id + and c.qualifier_id = ot.ontology_term_id + and (ot.source_id = 'EUPATH_0000468' or ot.source_id = 'EuPathUserDefined_00515' or ot.source_id = 'EUPATH_0000476' or ot.source_id = 'EuPathUserDefined_00523') + and ds.taxon_id = :TAXON_ID + ) subquery2 + group by study_id, study_name, dataset_name, taxon_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/RnaSeqStats_ix.psql b/Model/lib/psql/webready/orgSpecific/RnaSeqStats_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/SequencePieceClosure.psql b/Model/lib/psql/webready/orgSpecific/SequencePieceClosure.psql new file mode 100644 index 000000000..e5b60d5e5 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/SequencePieceClosure.psql @@ -0,0 +1,47 @@ + + CREATE TABLE :ORG_ABBREVSequencePieceClosure AS + SELECT sp.sequence_piece_id, + sp.virtual_na_sequence_id, + sp.piece_na_sequence_id, + sp.sequence_order, + sp.distance_from_left, + sp.uncertainty, + sp.strand_orientation, + sp.start_position, + sp.end_position, + sp.modification_date, + 1 AS edge_level + FROM dots.SequencePiece sp, dots.NaSequence ns + WHERE sp.piece_na_sequence_id = ns.na_sequence_id + AND (ns.taxon_id::varchar = ':TAXON_ID' OR length(':TAXON_ID') = 0) + + ; + + + + /* known issue: this should be run not just once, but iteratively + until it doesn't create new records. Currently (7/2008), + SequencePieces aren't nested even once. */ + INSERT INTO :ORG_ABBREVSequencePieceClosure + (edge_level, virtual_na_sequence_id, piece_na_sequence_id, + distance_from_left, strand_orientation, modification_date, + start_position, end_position, sequence_order, sequence_piece_id) + SELECT 2, higher.virtual_na_sequence_id, lower.piece_na_sequence_id, + higher.distance_from_left, + case + when coalesce(higher.strand_orientation, '+') = coalesce(lower.strand_orientation, '+') + then '+' + else '-' + end as strand_orientation, + now(), + higher.start_position - lower.distance_from_left, + higher.end_position - lower.distance_from_left, + higher.sequence_order, + nextval('dots.sequencepiece_sq') + FROM :ORG_ABBREVSequencePieceClosure higher, :ORG_ABBREVSequencePieceClosure lower + WHERE higher.piece_na_sequence_id = lower.virtual_na_sequence_id + AND higher.start_position >= lower.start_position + lower.distance_from_left + AND higher.end_position <= lower.end_position + lower.distance_from_left + + ; + diff --git a/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains.psql b/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains.psql new file mode 100644 index 000000000..a23b7ea12 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains.psql @@ -0,0 +1,42 @@ +:CREATE_AND_POPULATE + SELECT + ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , gf.source_id gene_source_id + , t.source_id transcript_source_id + , taf.na_feature_id + , spf.aa_feature_id + , spf.aa_sequence_id + , spf.parent_id + , aal.start_min + , aal.end_max + , spf.algorithm_name + , substr(s.sequence, aal.end_max::INTEGER, 1) peptide_sequence + FROM + dots.SignalPeptideFeature spf + , dots.AaLocation aal + , dots.TranslatedAaFeature taf + , dots.TranslatedAaSequence tas + , dots.GeneFeature gf + , dots.AaSequence s + , dots.Transcript t + WHERE + spf.aa_sequence_id = s.aa_sequence_id + AND aal.aa_feature_id = spf.aa_feature_id + AND t.na_feature_id = taf.na_feature_id + AND taf.aa_sequence_id = tas.aa_sequence_id + AND tas.aa_sequence_id = spf.aa_sequence_id + AND gf.na_feature_id = t.parent_id + AND tas.taxon_id = :TAXON_ID + AND (spf.signal_probability >= .5 + OR spf.signal_probability IS NULL + OR ((spf.means_score + spf.maxy_score) / 2) >= .5 + OR ( spf.maxy_conclusion + spf.maxc_conclusion + spf.maxs_conclusion + spf.means_conclusion ) >= 3 + ) + ORDER BY + spf.aa_sequence_id, spf.aa_feature_id + + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains_ix.psql b/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains_ix.psql new file mode 100644 index 000000000..404e82cc0 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/SignalPeptideDomains_ix.psql @@ -0,0 +1,8 @@ + CREATE INDEX SignalP1_ix + ON :SCHEMA.SignalPeptideDomains (aa_sequence_id) + ; + + CREATE INDEX SignalP2_ix + ON :SCHEMA.SignalPeptideDomains (gene_source_id, transcript_source_id, end_max) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/SpliceSiteTranscript.psql b/Model/lib/psql/webready/orgSpecific/SpliceSiteTranscript.psql new file mode 100644 index 000000000..4c50716bf --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/SpliceSiteTranscript.psql @@ -0,0 +1,33 @@ +drop table if exists :SCHEMA.:ORG_ABBREVSpliceSiteTranscriptTmp + +/* +ATTENTION: This script is used to make an empty table. It will be populated with a perl script. + +We make the table here in the usual way so we get partitioned webready table +*/ + + +create unlogged table :SCHEMA.:ORG_ABBREVSpliceSiteTranscriptTmp ( + location numeric(10), + type varchar(20), + na_sequence_id numeric(10), + is_unique numeric(1), + sum_cpm float8, + dist_to_first_atg numeric(10), + gene_source_id varchar(80), + transcript_source_id varchar(80), + dist_to_cds numeric(10), + is_dominant numeric(1), + strand char(1), + project_id varchar(20), + org_abbrev varchar(20), + modification_date timestamp + ); + + +:CREATE_AND_POPULATE +select * from :SCHEMA.:ORG_ABBREVSpliceSiteTranscriptTmp +:DECLARE_PARTITION; + +drop table :SCHEMA.:ORG_ABBREVSpliceSiteTranscriptTmp +; diff --git a/Model/lib/psql/webready/orgSpecific/SpliceSiteTranscript_ix.psql b/Model/lib/psql/webready/orgSpecific/SpliceSiteTranscript_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/TFBSGene.psql b/Model/lib/psql/webready/orgSpecific/TFBSGene.psql new file mode 100644 index 000000000..52fa02bb4 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TFBSGene.psql @@ -0,0 +1,46 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + ga.source_id as gene_source_id, + ga.organism as organism, + ga.genus_species as species, + aef.source_id as probe_id, + aef.na_feature_id as tfbs_na_feature_id, + CASE + WHEN ga.is_reversed = 0 + THEN round(abs(ga.start_min - (((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min)),0) + ELSE round(abs(ga.end_max - (((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min)),0) + END as distance, + CASE + WHEN /* distance > 0 */ + CASE WHEN ga.is_reversed = 0 + THEN ga.start_min - (((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min) + ELSE ga.end_max - (((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min) + END > 0 + THEN + CASE + WHEN ga.is_reversed = 0 + THEN '-' + ELSE '+' + END + ELSE + CASE + WHEN ga.is_reversed = 1 + THEN '-' + ELSE '+' + END + END as direction + -- , aef.* + FROM dots.BindingSiteFeature aef, + apidb.FeatureLocation arrloc, + :SCHEMA.GeneAttributes ga + WHERE aef.na_feature_id = arrloc.na_feature_id + AND arrloc.na_sequence_id = ga.na_sequence_id + AND ga.org_abbrev = ':ORG_ABBREV' + AND ( (ga.is_reversed = 0 and abs((((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min) - ga.start_min) <= 3000) + or (ga.is_reversed = 1 and abs((((arrloc.end_max - arrloc.start_min) / 2) + arrloc.start_min) - ga.end_max) <= 3000) ) + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/TFBSGene_ix.psql b/Model/lib/psql/webready/orgSpecific/TFBSGene_ix.psql new file mode 100644 index 000000000..7eee68e1d --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TFBSGene_ix.psql @@ -0,0 +1,6 @@ + create index tfbs_geneid_idx ON :SCHEMA.TFBSGene (gene_source_id, tfbs_na_feature_id) + ; + + create index geneid_tfbs_idx ON :SCHEMA.TFBSGene (tfbs_na_feature_id,gene_source_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TaxonSpecies.psql b/Model/lib/psql/webready/orgSpecific/TaxonSpecies.psql new file mode 100644 index 000000000..22bf73943 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TaxonSpecies.psql @@ -0,0 +1,43 @@ +drop table if exists :SCHEMA.taxonOfInterest_:ORG_ABBREV; + +create unlogged table :SCHEMA.taxonOfInterest_:ORG_ABBREV as +select distinct ens.taxon_id +from dots.est e,apidb.datasource ds, apidb.organism o, + sres.ExternalDatabaseRelease edr, dots.ExternalNaSequence ens, + sres.externaldatabase ed, sres.ontologyterm oterm +WhERE e.na_sequence_id = ens.na_sequence_id + AND ens.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ens.sequence_ontology_id = oterm.ontology_term_id + AND ed.name = ds.name + and ds.taxon_id = o.taxon_id + and o.is_reference_strain = 1 + and o.taxon_id = :TAXON_ID + union + select :TAXON_ID +; + +-- recursively walk taxon tree to find ancestor with rank "species" +-- Update this to select max/min level with rank species if there are multiple +:CREATE_AND_POPULATE + WITH RECURSIVE cte AS ( + SELECT TAXON_ID, taxon_id as parent_id, 1 as lvl + FROM sres.taxon + WHERE taxon_id IN (SELECT taxon_id from :SCHEMA.taxonOfInterest_:ORG_ABBREV) + UNION ALL + SELECT cte.taxon_id, sub.parent_id, lvl + 1 + FROM cte, sres.taxon sub + WHERE cte.parent_id = sub.taxon_id + ) + SELECT c.taxon_id, c.parent_id as species_taxon_id, + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date + FROM cte c, sres.taxon t + WHERE t.taxon_id = c.parent_id + AND t.rank='species' +:DECLARE_PARTITION; + +drop table if exists :SCHEMA.taxonOfInterest_:ORG_ABBREV; + + diff --git a/Model/lib/psql/webready/orgSpecific/TaxonSpecies_ix.psql b/Model/lib/psql/webready/orgSpecific/TaxonSpecies_ix.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/orgSpecific/Taxonomy.psql b/Model/lib/psql/webready/orgSpecific/Taxonomy.psql new file mode 100644 index 000000000..45422830b --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/Taxonomy.psql @@ -0,0 +1,30 @@ +DROP TABLE IF EXISTS :SCHEMA.TaxonTree_:ORG_ABBREV; + +CREATE TABLE :SCHEMA.TaxonTree_:ORG_ABBREV as + SELECT t.taxon_id, t.parent_id, t.ncbi_tax_id, + cast(tn.name as varchar(80)) as name, + cast(t.rank as varchar(24)) as rank + FROM sres.Taxon t, sres.TaxonName tn + WHERE t.taxon_id = tn.taxon_id + AND tn.name_class = 'scientific name' + AND t.taxon_id = :TAXON_ID + ; + +:CREATE_AND_POPULATE + WITH RECURSIVE cte AS ( + SELECT tt.*, name as organism, ARRAY[taxon_id::numeric] as path + FROM :SCHEMA.TaxonTree_:ORG_ABBREV tt + WHERE taxon_id = :TAXON_ID + UNION + SELECT tt.*, cte.organism, cte.path || tt.taxon_id as path + FROM :SCHEMA.TaxonTree_:ORG_ABBREV tt, cte + WHERE cte.parent_id = tt.taxon_id + AND tt.name != 'root' + ) + SELECT taxon_id, parent_id, ncbi_tax_id, name, rank, organism, row_number() over() as orderNum, + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date + FROM (SELECT cte.* FROM cte ORDER BY path) t +:DECLARE_PARTITION; + + +DROP TABLE :SCHEMA.TaxonTree_:ORG_ABBREV; diff --git a/Model/lib/psql/webready/orgSpecific/Taxonomy_ix.psql b/Model/lib/psql/webready/orgSpecific/Taxonomy_ix.psql new file mode 100644 index 000000000..e7005f831 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/Taxonomy_ix.psql @@ -0,0 +1,5 @@ + create index tax_ix + on :SCHEMA.Taxonomy + (organism, ordernum, taxon_id, parent_id, ncbi_tax_id, name, rank) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptAttributes.psql b/Model/lib/psql/webready/orgSpecific/TranscriptAttributes.psql new file mode 100644 index 000000000..c6ca34e89 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptAttributes.psql @@ -0,0 +1,455 @@ + DROP TABLE IF EXISTS :SCHEMA.:ORG_ABBREVTranscriptUniprot_tmp; + + CREATE UNLOGGED table :SCHEMA.:ORG_ABBREVTranscriptUniprot_tmp AS + select na_feature_id, + substr(string_agg(uniprot_id, ',' order by uniprot_id), 1, 240) as uniprot_id, + substr(string_agg(uniprot_id, '+or+' order by uniprot_id), 1, 240) as uniprot_id_internal + FROM (SELECT DISTINCT t.na_feature_id, dr.primary_identifier as uniprot_id + FROM sres.DbRef dr, dots.DbRefNaFeature x, dots.Transcript t, + dots.genefeature gf, dots.nasequence nas, + sres.ExternalDatabase d, sres.ExternalDatabaseRelease r + WHERE dr.db_ref_id = x.DB_REF_ID + AND t.parent_id = gf.na_feature_id + and gf.na_sequence_id = nas.na_sequence_id + and nas.taxon_id = :TAXON_ID + AND (x.na_feature_id = t.na_feature_id -- + or x.na_feature_id = t.parent_id) + AND dr.external_database_release_id = r.external_database_release_id + AND r.external_database_id = d.external_database_id + AND (d.name like '%uniprot_dbxref_RSRC' + OR d.name like '%dbxref_gene2Uniprot_RSRC' + OR d.name = 'Links to Uniprot Genes' + OR d.name like '%_dbxref_uniprot_linkout_RSRC' + OR d.name like '%_dbxref_uniprot_from_annotation_RSRC') + ) t + GROUP BY na_feature_id + + ; + +:CREATE_AND_POPULATE + WITH genefeat AS ( + SELECT DISTINCT + -- first the gene attributes: + gf.source_id AS gene_source_id, + gf.na_feature_id AS gene_na_feature_id, + LEAST(nl.start_min, nl.end_max) AS gene_start_min, + GREATEST(nl.start_min, nl.end_max) AS gene_end_max, + COALESCE(preferred_name.name, any_name.name) AS gene_name, + cast(coalesce(preferred_gene_product.product, any_gene_product.product, gf.product) + as VARCHAR(300)) as old_gene_product, + COALESCE(gp.product, 'unspecified product') as gene_product, + REPLACE(so.name, '_', ' ') AS gene_type, + gf.name as gene_ebi_biotype, + gi.gene_id, + transcripts.gene_transcript_count, + exons.gene_exon_count, + olds.old_ids AS gene_previous_ids, + coalesce(deprecated.is_deprecated, 0) as is_deprecated, + GREATEST(1, least(nl.start_min, nl.end_max) - 15000) AS gene_context_start, + LEAST(gsa.length, greatest(nl.start_min, nl.end_max) + 15000) AS gene_context_end, + GREATEST(1, least(nl.start_min, nl.end_max) - 1500) AS gene_zoom_context_start, + LEAST(gsa.length, greatest(nl.start_min, nl.end_max) + 1500) AS gene_zoom_context_end, + CAST(orthologs.name AS VARCHAR(60)) AS orthomcl_name, + -- coalesce(tothtssnps.total_hts_snps,0) AS gene_total_hts_snps, + -- coalesce(tothtssnps.hts_nonsynonymous_snps,0) AS gene_hts_nonsynonymous_snps, + -- coalesce(tothtssnps.hts_stop_codon_snps,0) AS gene_hts_stop_codon_snps, + -- coalesce(tothtssnps.hts_noncoding_snps,0) AS gene_hts_noncoding_snps, + -- coalesce(tothtssnps.hts_synonymous_snps,0) AS gene_hts_synonymous_snps, + -- coalesce(tothtssnps.hts_nonsyn_syn_ratio,0) AS gene_hts_nonsyn_syn_ratio, + CAST(cmnt.comment_string AS VARCHAR(300)) AS comment_string, + entrez_table.entrez_id AS gene_entrez_id, + gloc.locations AS gene_locations, + CAST(gsa.source_id AS VARCHAR(50)) AS sequence_id, + CAST(SUBSTR(tn.name, 1, 80) AS VARCHAR(80)) AS organism, + CAST(species_name.name AS VARCHAR(60)) AS species, + LTRIM(REGEXP_REPLACE(tn.name, replace (replace (species_name.name,'[',''), ']','') ,'')) AS strain, + taxon.ncbi_tax_id, tn.taxon_id, + so.source_id as so_id, + CAST(so.name AS VARCHAR(150)) AS so_term_name, + CAST(SUBSTR(so.definition, 1, 150) AS VARCHAR(150)) AS so_term_definition, + CAST(soRls.version AS VARCHAR(7)) AS so_version, + rt2.anticodon rt2_anticodon, + ed.name AS external_db_name, + edr.version AS external_db_version, + edr.external_database_release_id AS external_db_rls_id, + CAST(gsa.chromosome AS VARCHAR(20)) AS chromosome, + gsa.sequence_type, + gsa.chromosome_order_num, gsa.na_sequence_id + FROM dots.GeneFeature gf + INNER JOIN dots.nasequence seq ON seq.na_sequence_id = gf.na_sequence_id and seq.taxon_id = :TAXON_ID + INNER JOIN apidb.FeatureLocation nl ON gf.na_feature_id = nl.na_feature_id + INNER JOIN sres.OntologyTerm so ON gf.sequence_ontology_id = so.ontology_term_id + INNER JOIN :SCHEMA.GeneLocations gloc ON gf.source_id = gloc.source_id and gloc.org_abbrev = ':ORG_ABBREV' + LEFT JOIN :SCHEMA.GeneProduct gp ON gf.source_id = gp.source_id and gp.org_abbrev = ':ORG_ABBREV' + INNER JOIN sres.ExternalDatabaseRelease edr ON gf.external_database_release_id = edr.external_database_release_id + INNER JOIN sres.ExternalDatabase ed ON edr.external_database_id = ed.external_database_id + INNER JOIN :SCHEMA.GenomicSeqAttributes gsa ON nl.na_sequence_id = gsa.na_sequence_id and gsa.org_abbrev = ':ORG_ABBREV' + INNER JOIN sres.TaxonName tn ON gsa.taxon_id = tn.taxon_id + INNER JOIN sres.Taxon ON gsa.taxon_id = taxon.taxon_id + INNER JOIN sres.externalDatabaseRelease soRls ON so.external_database_release_id = soRls.external_database_release_id + INNER JOIN ( + SELECT DISTINCT gene AS source_id FROM :SCHEMA.GeneId where org_abbrev = ':ORG_ABBREV' + ) gene ON gf.source_id = gene.source_id + LEFT JOIN dots.RnaType rt2 ON gf.na_feature_id = rt2.parent_id + LEFT JOIN :SCHEMA.TaxonSpecies ts ON gsa.taxon_id = ts.taxon_id and ts.org_abbrev = ':ORG_ABBREV' + LEFT JOIN dots.geneinstance gi ON gf.na_feature_id = gi.na_feature_id + INNER JOIN sres.TaxonName species_name ON ts.species_taxon_id = species_name.taxon_id + LEFT JOIN ( + SELECT parent_id, count(*) AS gene_transcript_count + FROM dots.Transcript + GROUP BY parent_id + ) transcripts ON gf.na_feature_id = transcripts.parent_id + LEFT JOIN ( + SELECT parent_id, count(*) AS gene_exon_count + FROM dots.ExonFeature + GROUP BY parent_id + ) exons ON gf.na_feature_id = exons.parent_id + LEFT JOIN ( + SELECT nfc.na_feature_id, MAX(SUBSTR(nfc.comment_string, 300, 1)) AS comment_string + FROM dots.NaFeatureComment nfc + GROUP BY nfc.na_feature_id + ) cmnt ON gf.na_feature_id = cmnt.na_feature_id + LEFT JOIN ( + SELECT distinct drnf.na_feature_id as gene_na_feature_id, 1 as is_deprecated + FROM dots.DbRefNaFeature drnf, sres.DbRef dr, sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + WHERE drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND ed.name = 'gassAWB_dbxref_gene2Deprecated_RSRC' + ) deprecated ON gf.na_feature_id = deprecated.gene_na_feature_id + -- NEED NEW TUNING TABLE ONCE NGS SNP WORK IS DONE + -- LEFT JOIN ( + -- SELECT gene_source_id, total_hts_snps, hts_nonsynonymous_snps, hts_stop_codon_snps,hts_noncoding_snps,hts_synonymous_snps, + -- case when (hts_nonsynonymous_snps is null) then 0 + -- when (hts_synonymous_snps = 0) then 0 + -- else round ((hts_nonsynonymous_snps/ hts_synonymous_snps), 2) end as hts_nonsyn_syn_ratio + -- FROM ( + -- select gene_source_id, + -- count(*) as total_hts_snps, + -- sum(has_nonsynonymous_allele) as hts_nonsynonymous_snps, + -- sum(has_stop_codon) as hts_stop_codon_snps, + -- sum(is_noncoding_snp) as hts_noncoding_snps, + -- count(*) - sum(has_nonsynonymous_allele) - sum(has_stop_codon) - sum(is_noncoding_snp) as hts_synonymous_snps + -- FROM :ORG_ABBREVSnpAttributes + -- WHERE gene_source_id is not null + -- GROUP by gene_source_id + -- ) t + -- ) tothtssnps ON gf.source_id = tothtssnps.gene_source_id + LEFT JOIN ( + SELECT ssg.sequence_id as gene_na_feature_id, sg.name + FROM dots.SequenceSequenceGroup ssg, + dots.sequencegroup sg, core.tableinfo ti + WHERE ssg.sequence_group_id = sg.sequence_group_id + AND ssg.source_table_id = ti.table_id + AND ti.name = 'GeneFeature' + ) orthologs ON gf.na_feature_id = orthologs.gene_na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(product) as product + FROM apidb.GeneFeatureProduct + WHERE is_preferred = 1 + GROUP BY na_feature_id + ) preferred_gene_product ON gf.na_feature_id = preferred_gene_product.na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(product) as product + FROM apidb.GeneFeatureProduct + GROUP BY na_feature_id + ) any_gene_product ON gf.na_feature_id = any_gene_product.na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(name) as name + FROM apidb.GeneFeatureName + WHERE is_preferred = 1 + GROUP BY na_feature_id + EXCEPT + -- suppress gene/name associations from the *DELETED_RSRC databases + SELECT gfn.na_feature_id, gfn.name + FROM apidb.GeneFeatureName gfn, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + WHERE gfn.external_database_release_id = edr.external_database_release_id + AND ed.external_database_id = edr.external_database_id + AND ed.name like '%DELETED_RSRC' + ) preferred_name ON gf.na_feature_id = preferred_name.na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(name) as name + FROM apidb.GeneFeatureName + GROUP by na_feature_id + EXCEPT + -- suppress gene/name associations from the *DELETED_RSRC databases + SELECT gfn.na_feature_id, gfn.name + FROM apidb.GeneFeatureName gfn, + sres.ExternalDatabase ed, sres.ExternalDatabaseRelease edr + WHERE gfn.external_database_release_id = edr.external_database_release_id + AND ed.external_database_id = edr.external_database_id + AND ed.name like '%DELETED_RSRC' + ) any_name ON gf.na_feature_id = any_name.na_feature_id + LEFT JOIN ( + SELECT dbna.na_feature_id, + substr(string_agg(db.primary_identifier, ',' order by db.primary_identifier), 1, 300) as entrez_id + FROM sres.ExternalDatabaseRelease edr, sres.DbRef db, + dots.DbRefNaFeature dbna, sres.ExternalDatabase ed + WHERE edr.external_database_release_id = db.external_database_release_id + AND ed.external_database_id = edr.external_database_id + AND dbna.db_ref_id = db.db_ref_id + AND lower(ed.name) like '%entrez%' + GROUP BY dbna.na_feature_id + ) entrez_table ON gf.na_feature_id = entrez_table.na_feature_id + LEFT JOIN ( + -- TODO: PERFORMANCE + SELECT drnf.na_feature_id, + substr(string_agg(dr.primary_identifier, ';' order by dr.primary_identifier), 1, 900) as old_ids + FROM dots.DbRefNaFeature drnf, sres.DbRef dr, sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + WHERE dr.primary_identifier is not null + AND drnf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND edr.id_type = 'previous id' + GROUP BY drnf.na_feature_id + ) olds ON gf.na_feature_id = olds.na_feature_id + WHERE nl.is_top_level = 1 + AND nl.feature_type = 'GeneFeature' + AND gsa.taxon_id = :TAXON_ID + AND species_name.name_class = 'scientific name' + AND (gf.is_predicted != 1 OR gf.is_predicted is null) + AND tn.name_class = 'scientific name' + AND tn.taxon_id NOT IN (SELECT o.taxon_id FROM apidb.Organism o WHERE o.is_annotated_genome=0) + -- JB: NOT SURE WHY THERE ARE COMMENTED. (if needed, move to model) + --AND tn.name not in ('Plasmodium gallinaceum','Plasmodium reichenowi') + ), transcript AS ( + SELECT DISTINCT + t.parent_id as transcript_parent_id, + -- first the gene attributes: + cast(null as varchar(80)) as representative_transcript, + 0 as gene_paralog_number, 0 as gene_ortholog_number, + transcript_uniprot.uniprot_id, transcript_uniprot.uniprot_id_internal, + -- next the transcript attributes: + t.source_id AS transcript_source_id, + tso.name as transcript_type, + t.na_feature_id, + CAST(coalesce(preferred_tx_product.product, any_tx_product.product, t.product) + AS VARCHAR(300)) + AS transcript_product, + tl.start_min, tl.end_max, + tl.is_reversed, --CHECK if needed + CASE coalesce(tl.is_reversed, 0) WHEN 0 THEN 'forward' WHEN 1 THEN 'reverse' ELSE tl.is_reversed::varchar END AS strand, + CASE t.is_pseudo WHEN null THEN 0 ELSE t.is_pseudo END as is_pseudo, + transcript_exons.exon_count, + sns.length AS length, sns.na_sequence_id as spliced_na_sequence_id, + rt1.anticodon rt1_anticodon, + utr_lengths.five_prime_utr_length, + utr_lengths.three_prime_utr_length + FROM dots.Transcript t + INNER JOIN dots.genefeature gf on t.parent_id = gf.na_feature_id + INNER JOIN dots.nasequence nas on gf.na_sequence_id = nas.na_sequence_id and nas.taxon_id = :TAXON_ID + LEFT JOIN dots.RnaType rt1 ON t.na_feature_id = rt1.parent_id + LEFT JOIN dots.SplicedNaSequence sns ON t.na_sequence_id = sns.na_sequence_id + INNER JOIN sres.OntologyTerm tso ON t.sequence_ontology_id = tso.ontology_term_id + INNER JOIN apidb.TranscriptLocation tl ON tl.feature_source_id = t.source_id + INNER JOIN ( + SELECT rna_feature_id, count(*) as exon_count + FROM dots.RnaFeatureExon + GROUP BY rna_feature_id + ) transcript_exons ON t.na_feature_id = transcript_exons.rna_feature_id + INNER JOIN ( + SELECT t.na_feature_id as transcript_na_feature_id, five_prime.utr_length as five_prime_utr_length, three_prime.utr_length as three_prime_utr_length + FROM dots.transcript t + LEFT JOIN ( + SELECT parent_id, sum(end_max-start_min + 1) as utr_length FROM apidb.UtrLocation + WHERE direction = 5 AND is_top_level = 1 + GROUP BY parent_id + ) five_prime ON t.na_feature_id = five_prime.parent_id + LEFT JOIN ( + SELECT parent_id, sum(end_max-start_min + 1) as utr_length FROM apidb.UtrLocation + WHERE direction = 3 AND is_top_level = 1 + GROUP BY parent_id + ) three_prime ON t.na_feature_id = three_prime.parent_id + ) utr_lengths ON t.na_feature_id = utr_lengths.transcript_na_feature_id + LEFT JOIN :SCHEMA.:ORG_ABBREVTranscriptUniprot_tmp transcript_uniprot ON t.na_feature_id = transcript_uniprot.na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(product) as product + FROM apidb.TranscriptProduct + WHERE is_preferred = 1 + GROUP BY na_feature_id + ) preferred_tx_product ON t.na_feature_id = preferred_tx_product.na_feature_id + LEFT JOIN ( + SELECT na_feature_id, max(product) as product + FROM apidb.TranscriptProduct + GROUP BY na_feature_id + ) any_tx_product ON t.na_feature_id = any_tx_product.na_feature_id + WHERE + tl.is_top_level=1 + ) + SELECT DISTINCT + ':PROJECT_ID' as project_id, + ':ORG_ABBREV' as org_abbrev, + current_timestamp as modification_date, + transcript.transcript_source_id AS source_id, + -- first the gene attributes: + genefeat.gene_source_id, + genefeat.gene_na_feature_id, + genefeat.gene_start_min, + genefeat.gene_end_max, + genefeat.gene_name, + CAST(COALESCE(genefeat.old_gene_product, transcript.transcript_product, + case when transcript.is_pseudo = 1 + then 'pseudogene' + else 'unspecified product' + end + ) as VARCHAR(300) + ) as old_gene_product, +-- genefeat.gene_product, + genefeat.gene_type, + genefeat.gene_ebi_biotype, + genefeat.gene_id, + genefeat.gene_transcript_count, + genefeat.gene_exon_count, + cast(null as varchar(80)) as representative_transcript, + genefeat.gene_previous_ids, + genefeat.is_deprecated, + 0 as gene_paralog_number, 0 as gene_ortholog_number, + genefeat.gene_context_start, + genefeat.gene_context_end, + genefeat.gene_zoom_context_start, + genefeat.gene_zoom_context_end, + genefeat.orthomcl_name, + -- genefeat.gene_total_hts_snps, + -- genefeat.gene_hts_nonsynonymous_snps, + -- genefeat.gene_hts_stop_codon_snps, + -- genefeat.gene_hts_noncoding_snps, + -- genefeat.gene_hts_synonymous_snps, + -- genefeat.gene_hts_nonsyn_syn_ratio, + genefeat.comment_string, + transcript.uniprot_id, transcript.uniprot_id_internal, + genefeat.gene_entrez_id, + genefeat.gene_locations, + -- next the transcript attributes: + transcript.transcript_source_id, + transcript.transcript_type, + transcript.na_feature_id, + CAST(coalesce(transcript.transcript_product, genefeat.old_gene_product, + case when transcript.is_pseudo = 1 + then 'pseudogene' + else 'unspecified product' + end + )AS VARCHAR(300) + ) as transcript_product, + transcript.start_min, transcript.end_max, + transcript.is_reversed, --CHECK if needed + transcript.strand, + transcript.is_pseudo, + transcript.exon_count, + transcript.length, + transcript.spliced_na_sequence_id, + genefeat.sequence_id, + genefeat.organism, + genefeat.species, + genefeat.strain, + genefeat.ncbi_tax_id, genefeat.taxon_id, + genefeat.so_id, + genefeat.so_term_name, + genefeat.so_term_definition, + genefeat.so_version, + CAST(coalesce(rt1_anticodon, rt2_anticodon)AS VARCHAR(3)) AS anticodon, + genefeat.external_db_name, + genefeat.external_db_version, + genefeat.external_db_rls_id, + genefeat.chromosome, + genefeat.sequence_type, + genefeat.chromosome_order_num, genefeat.na_sequence_id, + --next the protein attributes: + pa.source_id AS protein_source_id, + pa.aa_sequence_id, + pa.cds_start as coding_start, + pa.cds_end as coding_end, + pa.cds_length, + pa.protein_length, + pa.has_seqedit, + pa.tm_count, + pa.molecular_weight, + pa.isoelectric_point, + pa.signalp_peptide, + pa.ec_numbers, + --pa.ec_numbers_derived, + pa.annotated_go_component, + pa.annotated_go_function, + pa.annotated_go_process, + pa.predicted_go_component, + pa.predicted_go_function, + pa.predicted_go_process, + pa.annotated_go_id_component, + pa.annotated_go_id_function, + pa.annotated_go_id_process, + pa.predicted_go_id_component, + pa.predicted_go_id_function, + pa.predicted_go_id_process, + transcript.five_prime_utr_length, + transcript.three_prime_utr_length + FROM genefeat + INNER JOIN transcript ON genefeat. gene_na_feature_id = transcript.transcript_parent_id + LEFT JOIN :SCHEMA.proteinattributes pa ON pa.org_abbrev = ':ORG_ABBREV' and transcript.transcript_source_id = pa.transcript_source_id AND pa.rank_in_transcript = 1 + ORDER BY taxon_id, source_id + + +:DECLARE_PARTITION; + + + -- ADDED ORG_ABBREV filter in 2 places just in case + UPDATE :SCHEMA.TranscriptAttributes ta + SET exon_count = (SELECT count(*) + 1 FROM apidb.IntronLocation il WHERE ta.org_abbrev = ':ORG_ABBREV' AND il.parent_id = ta.na_feature_id AND il.end_max - il.start_min + 1 > 10 ) + WHERE ta.project_id = 'TriTrypDB' + AND ta.org_abbrev = ':ORG_ABBREV' + + ; + + + + UPDATE :SCHEMA.TranscriptAttributes gaup + SET gene_paralog_number = ( + SELECT count(distinct gene_source_id) + FROM :SCHEMA.TranscriptAttributes g1 + WHERE g1.orthomcl_name = gaup.orthomcl_name + AND g1.organism = gaup.organism + AND gaup.gene_source_id != g1.gene_source_id + AND g1.org_abbrev = ':ORG_ABBREV' + ), + gene_ortholog_number = ( + SELECT count(distinct gene_source_id) + FROM :SCHEMA.TranscriptAttributes g1 + WHERE g1.orthomcl_name = gaup.orthomcl_name + AND g1.organism != gaup.organism + AND g1.org_abbrev = ':ORG_ABBREV' + ) + WHERE (gaup.gene_type = 'protein coding' or gaup.gene_type = 'protein coding gene') + and gaup.org_abbrev = ':ORG_ABBREV' + + ; + + +-- TODO: test this + UPDATE :SCHEMA.TranscriptAttributes + SET gene_id = gene_na_feature_id + (select nextval('dots.gene_sq')) + WHERE gene_id is null + and org_abbrev = ':ORG_ABBREV' + + ; + + + UPDATE :SCHEMA.TranscriptAttributes + SET representative_transcript = ( + select min(source_id) + from :SCHEMA.TranscriptAttributes ga + where ga.gene_source_id = :SCHEMA.TranscriptAttributes.gene_source_id + and org_abbrev = ':ORG_ABBREV' + ) + WHERE representative_transcript is null + AND gene_id is not null + and org_abbrev = ':ORG_ABBREV' + ; + + + UPDATE :SCHEMA.TranscriptAttributes + SET representative_transcript = source_id + WHERE representative_transcript is null + and org_abbrev = ':ORG_ABBREV' + ; + +drop table :SCHEMA.:ORG_ABBREVTranscriptUniprot_tmp +; diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptAttributes_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptAttributes_ix.psql new file mode 100644 index 000000000..d0eee715f --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptAttributes_ix.psql @@ -0,0 +1,71 @@ + CREATE UNIQUE INDEX TranscriptAttr_sourceId + ON :SCHEMA.TranscriptAttributes (org_abbrev, source_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_srcPrj + ON :SCHEMA.TranscriptAttributes (org_abbrev, source_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_genesrc + ON :SCHEMA.TranscriptAttributes (org_abbrev, gene_source_id, source_id, project_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_exon_ix + ON :SCHEMA.TranscriptAttributes (org_abbrev, gene_exon_count, source_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_loc_ix + ON :SCHEMA.TranscriptAttributes + (org_abbrev, na_sequence_id, gene_start_min, gene_end_max, is_reversed, na_feature_id, + is_deprecated, source_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_feat_ix + ON :SCHEMA.TranscriptAttributes (org_abbrev, na_feature_id, source_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TranscriptAttr_geneid_ix + ON :SCHEMA.TranscriptAttributes (org_abbrev, gene_id, source_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TransAttr_orthoname_ix + ON :SCHEMA.TranscriptAttributes (org_abbrev, orthomcl_name, source_id, taxon_id, gene_type, organism, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TransAttr_molwt_ix + ON :SCHEMA.TranscriptAttributes (org_abbrev, taxon_id, molecular_weight, source_id, gene_source_id, project_id) + ; + + CREATE INDEX TransAttr_ortholog_ix + ON :SCHEMA.TranscriptAttributes + (source_id, na_sequence_id, gene_start_min, gene_end_max, orthomcl_name, gene_source_id, project_id) + ; + + CREATE INDEX TransAttr_orgsrc_ix + ON :SCHEMA.TranscriptAttributes (organism, source_id, sequence_id, gene_start_min, gene_end_max) + ; + + CREATE INDEX TransAttr_lwrsrc_ix + ON :SCHEMA.TranscriptAttributes (lower(source_id), gene_source_id, project_id, source_id) + ; + + CREATE INDEX TransAttr_species_ix + ON :SCHEMA.TranscriptAttributes (species, source_id, gene_id, gene_source_id, project_id) + ; + + CREATE UNIQUE INDEX TrnscrptAttr_geneinfo + ON :SCHEMA.TranscriptAttributes + (org_abbrev, gene_source_id, project_id, source_id, na_feature_id, spliced_na_sequence_id, + protein_source_id, na_sequence_id, length, protein_length, + five_prime_utr_length, three_prime_utr_length) + ; + + CREATE UNIQUE INDEX TranscriptAttr_genenaf + ON :SCHEMA.TranscriptAttributes (org_abbrev, gene_na_feature_id, gene_source_id, source_id, project_id) + ; + + CREATE INDEX TransAttr_locsIds_ix + ON :SCHEMA.TranscriptAttributes + (na_sequence_id, start_min, end_max, is_reversed, gene_source_id, source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance.psql b/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance.psql new file mode 100644 index 000000000..f1f905600 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance.psql @@ -0,0 +1,18 @@ +:CREATE_AND_POPULATE + SELECT DISTINCT + ':PROJECT_ID' as project_id, ':ORG_ABBREV' as org_abbrev, current_timestamp as modification_date, + tl.feature_source_id AS transcript, + LEAST(ABS(mfl.start_min - tl.end_max), + ABS(mfl.end_max - tl.start_min)) AS centromere_distance, + tl.sequence_source_id AS genomic_sequence + FROM apidb.TranscriptLocation tl, apidb.FeatureLocation mfl, + dots.nasequence nas, sres.OntologyTerm so + WHERE tl.na_sequence_id = mfl.na_sequence_id + AND mfl.feature_type = 'Miscellaneous' + AND mfl.sequence_ontology_id = so.ontology_term_id + AND so.name = 'centromere' + AND tl.is_top_level = 1 + and mfl.na_sequence_id = nas.na_sequence_id + and nas.taxon_id = :TAXON_ID +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance_ix.psql new file mode 100644 index 000000000..25e656ced --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptCenDistance_ix.psql @@ -0,0 +1,4 @@ + create index GCent_loc_ix + on :SCHEMA.TranscriptCenDistance (genomic_sequence, centromere_distance) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptEC.psql b/Model/lib/psql/webready/orgSpecific/TranscriptEC.psql new file mode 100644 index 000000000..e979f9498 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptEC.psql @@ -0,0 +1,35 @@ + +DROP TABLE if exists :SCHEMA.:ORG_ABBREVTranscriptECTmp; +/* + +ATTENTION: This table is empty. We will populate it in the comparative graph so that we can include ortho-derived EC numbers. +We are creating them it so that they it is partitioned. +*/ + +/* This table maps EC numbers to transcripts */ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVTranscriptECTmp ( + source_id VARCHAR(80), + gene_source_id VARCHAR(80), + enzyme_class_id NUMERIC(12), + ec_number VARCHAR(16), + ec_number_1 NUMERIC(3), + ec_number_2 NUMERIC(3), + ec_number_3 NUMERIC(3), + ec_number_4 NUMERIC(3), + wildcard_count NUMERIC(3), + evidence_code VARCHAR(255), + + -- fields required for partitioning + project_id VARCHAR(20), + org_abbrev VARCHAR(20), + modification_date timestamp + +); + + +:CREATE_AND_POPULATE +select tet.* from :SCHEMA.:ORG_ABBREVTranscriptECTmp tet +:DECLARE_PARTITION; + +DROP TABLE :SCHEMA.:ORG_ABBREVTranscriptECTmp; diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptEC_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptEC_ix.psql new file mode 100644 index 000000000..5b9767528 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptEC_ix.psql @@ -0,0 +1,5 @@ +CREATE INDEX TranscriptEc_1_ix on :SCHEMA.TranscriptEC (source_id, ec_number) + ; + +CREATE INDEX TranscriptEc_2_ix on :SCHEMA.TranscriptEC (ec_number, source_id) + ; diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup.psql b/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup.psql new file mode 100644 index 000000000..09facffb8 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup.psql @@ -0,0 +1,23 @@ + +DROP TABLE if exists :SCHEMA.:ORG_ABBREVTranscriptOrthologGroupTmp; +/* + +ATTENTION: This table is empty. We will populate it in the comparative graph +We are creating them it so that they it is partitioned. +*/ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVTranscriptOrthologGroupTmp ( + source_id VARCHAR(80), + gene_id VARCHAR(80), + group_id VARCHAR(16), + project_id varchar(20), + org_abbrev varchar(20), + modification_date timestamp +); + +:CREATE_AND_POPULATE +SELECT ogt.* from :SCHEMA.:ORG_ABBREVTranscriptOrthologGroupTmp ogt +:DECLARE_PARTITION; + + +DROP TABLE :SCHEMA.:ORG_ABBREVTranscriptOrthologGroupTmp; diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup_ix.psql new file mode 100644 index 000000000..ac9b3670e --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptOrthologGroup_ix.psql @@ -0,0 +1,2 @@ +alter table :SCHEMA.TranscriptOrthologGroup + add constraint TranscriptOrthologGroup_pk primary key (source_id); diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptPathway.psql b/Model/lib/psql/webready/orgSpecific/TranscriptPathway.psql new file mode 100644 index 000000000..f63449f73 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptPathway.psql @@ -0,0 +1,40 @@ +DROP TABLE if exists :SCHEMA.:ORG_ABBREVTranscriptPathwayTmp; +/* + +ATTENTION: This table is empty. We will populate it in the comparative graph so that we can include ortho-derived EC numbers. +We are creating it here so that it is partitioned. +*/ + +/* This is the final TranscriptPathway table */ + +CREATE UNLOGGED TABLE :SCHEMA.:ORG_ABBREVTranscriptPathwayTmp ( + -- transcript information + source_id VARCHAR(80), + gene_source_id VARCHAR(80), + + --pathway information + pathway_source_id VARCHAR(50), + pathway_name VARCHAR(150), + pathway_id NUMERIC(12), + pathway_source TEXT, -- TODO should this be a varchar? + external_database_release_id NUMERIC(10), + + --match information + ec_number_gene VARCHAR(16), + wildcard_count_gene NUMERIC(3), + ec_number_pathway VARCHAR(16), + wildcard_count_pathway NUMERIC(3), + exact_match NUMERIC(3), + complete_ec NUMERIC(3), + + -- fields required for partioning + project_id varchar(20), + org_abbrev varchar(20), + modification_date timestamp +); + +:CREATE_AND_POPULATE +SELECT tpt.* from :SCHEMA.:ORG_ABBREVTranscriptPathwayTmp tpt +:DECLARE_PARTITION; + +DROP TABLE :SCHEMA.:ORG_ABBREVTranscriptPathwayTmp; diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptPathway_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptPathway_ix.psql new file mode 100644 index 000000000..92f77af43 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptPathway_ix.psql @@ -0,0 +1,12 @@ + create index TranscriptPath_ix + on :SCHEMA.TranscriptPathway + (gene_source_id, source_id, pathway_source_id, + pathway_name, pathway_id, ec_number_gene, wildcard_count_pathway, + ec_number_pathway, pathway_source) + ; + + create index TranscriptPathSource_ix + on :SCHEMA.TranscriptPathway (pathway_source, + gene_source_id, source_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptSequence.psql b/Model/lib/psql/webready/orgSpecific/TranscriptSequence.psql new file mode 100644 index 000000000..edccc1d80 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptSequence.psql @@ -0,0 +1,7 @@ +:CREATE_AND_POPULATE + SELECT ta.source_id, ta.project_id, ta.org_abbrev, sns.sequence, current_timestamp as modification_date + FROM :SCHEMA.TranscriptAttributes ta, dots.SplicedNaSequence sns + WHERE ta.source_id = sns.source_id + AND ta.org_abbrev = ':ORG_ABBREV' +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/TranscriptSequence_ix.psql b/Model/lib/psql/webready/orgSpecific/TranscriptSequence_ix.psql new file mode 100644 index 000000000..98bb3137f --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TranscriptSequence_ix.psql @@ -0,0 +1,3 @@ + create index XScriptSeq_ix on :SCHEMA.TranscriptSequence (source_id, project_id) + ; + diff --git a/Model/lib/psql/webready/orgSpecific/TransmembraneDomains.psql b/Model/lib/psql/webready/orgSpecific/TransmembraneDomains.psql new file mode 100644 index 000000000..0a6b669d8 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TransmembraneDomains.psql @@ -0,0 +1,26 @@ +:CREATE_AND_POPULATE + SELECT ':PROJECT_ID' as project_id + , ':ORG_ABBREV' as org_abbrev + , current_timestamp as modification_date + , ta.source_id as transcript_source_id + , ta.gene_source_id AS gene_source_id + , tmf.topology AS tmf_topology + , aal.start_min AS tmf_start_min + , aal.end_max AS tmf_end_max + , substr(tas.sequence, aal.end_max::INTEGER - aal.start_min::INTEGER + 1, aal.start_min::INTEGER) AS tmf_sequence + , tmf.aa_feature_id tmf_aa_feature_id + , tmf.aa_sequence_id tmf_aa_sequence_id + , tas.source_id as protein_source_id + FROM dots.aalocation aal + , :SCHEMA.transcriptattributes ta + , dots.translatedaafeature taf + , dots.translatedaasequence tas + , dots.transmembraneaafeature tmf + WHERE ta.na_feature_id = taf.na_feature_id + AND taf.aa_sequence_id = tas.aa_sequence_id + AND tas.aa_sequence_id = tmf.aa_sequence_id + and tmf.aa_feature_id = aal.aa_feature_id + and ta.org_abbrev = ':ORG_ABBREV' + +:DECLARE_PARTITION; + diff --git a/Model/lib/psql/webready/orgSpecific/TransmembraneDomains_ix.psql b/Model/lib/psql/webready/orgSpecific/TransmembraneDomains_ix.psql new file mode 100644 index 000000000..964f23421 --- /dev/null +++ b/Model/lib/psql/webready/orgSpecific/TransmembraneDomains_ix.psql @@ -0,0 +1,4 @@ + create index TransDom1_ix + on :SCHEMA.TransmembraneDomains (tmf_aa_sequence_id, tmf_aa_feature_id, tmf_start_min, tmf_end_max, tmf_topology) + ; + diff --git a/Model/lib/psql/webready/unknown/AssociatedDataset.psql b/Model/lib/psql/webready/unknown/AssociatedDataset.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/unknown/DomainAssignment.psql b/Model/lib/psql/webready/unknown/DomainAssignment.psql new file mode 100644 index 000000000..7cb927d97 --- /dev/null +++ b/Model/lib/psql/webready/unknown/DomainAssignment.psql @@ -0,0 +1,69 @@ + + + create table DomainAssignment as + select sa.full_id, sa.group_name, + r.interpro_primary_id as accession, + r.interpro_desc as description, + CAST (NULL as NUMERIC) as domain_index, + sa.aa_sequence_id, + r.interpro_start_min as start_min, + r.interpro_end_min as end_max + from SequenceAttributes sa, apidb.interproresults r + where sa.full_id = r.protein_source_id + and upper(r.interpro_db_name) = 'PFAM' + + ; + + + + create index domain_accession_ix + on DomainAssignment (accession, full_id, group_name) + + ; + + + + create table domainIndex as + select row_number() OVER () as domain_index, accession + from (select distinct accession + from DomainAssignment + order by accession) + + ; + + + + create index domainIdxIdx on DomainIndex(accession, domain_index) + + ; + + + + update DomainAssignment da + set domain_index = (select domain_index + from DomainIndex + where accession = da.accession) + + ; + + + + create index domain_ix_ix + on DomainAssignment (domain_index, accession, full_id) + + ; + + + + create index domain_group_ix + on DomainAssignment (group_name, accession, full_id) + + ; + + + + create index domain_seq_ix + on DomainAssignment (aa_sequence_id, accession, full_id, group_name) + + ; + diff --git a/Model/lib/psql/webready/unknown/EdaGeneGraph.psql b/Model/lib/psql/webready/unknown/EdaGeneGraph.psql new file mode 100644 index 000000000..e69de29bb diff --git a/Model/lib/psql/webready/unknown/PANResults.psql b/Model/lib/psql/webready/unknown/PANResults.psql new file mode 100644 index 000000000..de67af481 --- /dev/null +++ b/Model/lib/psql/webready/unknown/PANResults.psql @@ -0,0 +1,91 @@ + + + CREATE TABLE :ORG_ABBREVPANResults AS + SELECT DISTINCT r.* + FROM ( + SELECT protocol_app_node_id as pan_id, 'Results::NAFeatureDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.NAFeatureDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::ReporterIntensity' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.ReporterIntensity) + UNION + SELECT protocol_app_node_id, 'Results::SegmentResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.SegmentResult) + UNION + SELECT protocol_app_node_id, 'Results::CompoundMassSpec' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.CompoundMassSpec) + UNION + SELECT protocol_app_node_id, 'Results::NaFeatureHostResponse' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.NaFeatureHostResponse) + UNION + SELECT protocol_app_node_id, 'ApiDB::ChrCopyNumber' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM ApiDB.ChrCopyNumber) + UNION + SELECT protocol_app_node_id, 'ApiDB::GeneCopyNumber' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM ApiDB.GeneCopyNumber) + UNION + SELECT protocol_app_node_id, 'Results::NAFeatureExpression' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.NAFeatureExpression) + UNION + SELECT protocol_app_node_id, 'Results::EditingEvent' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.EditingEvent) + UNION + SELECT protocol_app_node_id, 'Results::FamilyDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.FamilyDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::FamilyExpression' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.FamilyExpression) + UNION + SELECT protocol_app_node_id, 'Results::GeneDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.GeneDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::GeneExpression' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.GeneExpression) + UNION + SELECT protocol_app_node_id, 'Results::GeneSimilarity' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.GeneSimilarity) + UNION + SELECT protocol_app_node_id, 'Results::ReporterDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.ReporterDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::ReporterExpression' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.ReporterExpression) + UNION + SELECT protocol_app_node_id, 'Results::RnaDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.RnaDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::RnaExpression' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.RnaExpression) + UNION + SELECT protocol_app_node_id, 'Results::LineageAbundance' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.LineageAbundance) + UNION + SELECT protocol_app_node_id, 'Results::SegmentDiffResult' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.SegmentDiffResult) + UNION + SELECT protocol_app_node_id, 'Results::SeqVariation' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM Results.SeqVariation) + UNION + SELECT protocol_app_node_id, 'ApiDB::SequenceVariation' as result_table FROM study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM study.ProtocolAppNode WHERE name like '% (Sequence Variation)') + UNION + SELECT protocol_app_node_id, 'ApiDB::MassSpecSummary' as result_table from study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM apidb.MASSSPECSUMMARY) + UNION + SELECT protocol_app_node_id, 'ApiDB::IntronJunction' as result_table from study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM apidb.IntronJunction) + UNION + SELECT protocol_app_node_id, 'ApiDB::RflpGenotype' as result_table from study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM apidb.RflpGenotype) + UNION + SELECT protocol_app_node_id, 'ApiDB::RflpGenotypeNumber' as result_table from study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM apidb.RflpGenotypeNumber) + UNION + SELECT protocol_app_node_id, 'ApiDB::CrisprPhenotype' as result_table from study.ProtocolAppNode + WHERE protocol_app_node_id in (select protocol_app_node_id FROM apidb.crisprphenotype) + ) r, :ORG_ABBREVpanextdbrls panExtDbRls + WHERE r.pan_id = panExtDbRls.pan_id + AND (panExtDbRls.dataset_name = ':TAXON_IDValue' or length(':TAXON_IDValue') = 0) + + ; + diff --git a/Model/lib/psql/webready/unknown/ProjectTaxon.psql b/Model/lib/psql/webready/unknown/ProjectTaxon.psql new file mode 100644 index 000000000..dd3d4ee56 --- /dev/null +++ b/Model/lib/psql/webready/unknown/ProjectTaxon.psql @@ -0,0 +1,151 @@ + + + CREATE TABLE :ORG_ABBREVProjectTaxon AS + WITH + local_taxon -- a taxon found in this instance, either in dots.ExternalNaSequence or in apidb.Organism + AS ( SELECT distinct tn.name as taxon, + substr((tn.name), 1, position(' ' IN tn.name||' ') - 1) as first_word, + pi.name as project_id + FROM dots.ExternalNaSequence ens, sres.TaxonName tn, core.ProjectInfo pi + WHERE ens.taxon_id = tn.taxon_id + and ((tn.name not like 'Bodo %' and tn.name not like 'Drosophila %') + OR tn.name_class = 'scientific name') + and ens.row_project_id = pi.project_id + -- get names from apidb.Organism.family_name_for_files + -- (may not be necessary) + UNION + SELECT family_name_for_files as taxon, + substr((family_name_for_files), 1, position(' ' IN family_name_for_files||' ') - 1) as first_word, + project_name as project_id + FROM apidb.Organism + WHERE family_name_for_files in (select name from sres.TaxonName) + ), + mononym -- a taxon name that's the first word of a local taxon + AS (SELECT distinct lower(lt.first_word) as taxon, lt.project_id + FROM local_taxon lt, sres.TaxonName tn + WHERE lt.first_word = tn.name + -- and tn.name_class = 'scientific name' + ), + full_name -- the full name of a local taxon whose first name is not a taxon + AS (SELECT distinct lower(lt.taxon) as taxon, lt.project_id + FROM local_taxon lt, sres.TaxonName tn + WHERE lt.taxon = tn.name + -- and tn.name_class = 'scientific name' + and lower(lt.first_word) not in (select taxon from mononym)) + SELECT * FROM mononym + UNION + SELECT * FROM full_name + UNION + SELECT 'hypocrea', 'FungiDB' + UNION + SELECT 'perkinsiella-like_sp._plo/afsm11', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/afsm2', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/amopi', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/asl1', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/de11d', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/de4a', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/frs/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gd-d1-1', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gd-d1-2', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gd-d1-3', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gillnor1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gillnor2/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/gillrich3/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/lithon', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/net12afl/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/netc1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/netc2/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/neth2t3/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/np251002/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/nrss/ii', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/pal2', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/pao27/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/rp', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sed5a/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sedc1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sedcb1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sedct1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sedmh1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sedst1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sm53', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/sm68', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/st4n', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/st8v/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/su03', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/su4', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/tg1162', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/tg1267', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/tun1/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/wt2708/i', 'TriTrypDB' + UNION + SELECT 'perkinsiella-like_sp._plo/wtuts/i', 'TriTrypDB' + UNION + SELECT 'plo_of_paramoeba_invadens_ags-2013', 'TriTrypDB' + UNION + SELECT 'soil_flagellate_and31', 'TriTrypDB' + UNION + SELECT 'kinetoplastid_flagellate_lfs2', 'TriTrypDB' + UNION + SELECT 'cryptaulaxoides-like_sp._tcs-2003', 'TriTrypDB' + ORDER BY 2, 1 + + ; + + + + update :ORG_ABBREVProjectTaxon + set taxon = replace(taxon, '''', '') + where taxon like '%''%' + + ; + + + + create unique index data_load_prjct_err on :ORG_ABBREVProjectTaxon (taxon) + + ; + + + + create unique index projtax_ix on :ORG_ABBREVProjectTaxon (taxon, project_id) + + ; + diff --git a/Model/lib/psql/webready/unknown/SequenceEnzymeClass.psql b/Model/lib/psql/webready/unknown/SequenceEnzymeClass.psql new file mode 100644 index 000000000..f948ba46c --- /dev/null +++ b/Model/lib/psql/webready/unknown/SequenceEnzymeClass.psql @@ -0,0 +1,25 @@ +:CREATE_AND_POPULATE + SELECT sa.project_id + , sa.org_abbrev + , current_timestamp as modification_date + , sa.full_id + , sa.group_name + -- , sec.uniprot_accession + , ec.ec_number + , ec.description + , ec.parent_id as ec_parent + , ec.ec_number_1 + , ec.ec_number_2 + , ec.ec_number_3 + , ec.ec_number_4 + FROM :SCHEMA.sequenceattributes sa + , dots.AASequence aa + , dots.AASequenceEnzymeClass sec + , sres.ENZYMECLASS ec + WHERE sa.aa_sequence_id = aa.aa_sequence_id + AND sec.aa_sequence_id = aa.aa_sequence_id + AND sec.enzyme_class_id = ec.enzyme_class_id + AND sa.org_abbrev = ':ORG_ABBREV' +:DECLARE_PARTITION +; + diff --git a/Model/lib/psql/webready/unknown/SequenceEnzymeClass_ix.psql b/Model/lib/psql/webready/unknown/SequenceEnzymeClass_ix.psql new file mode 100644 index 000000000..255767b47 --- /dev/null +++ b/Model/lib/psql/webready/unknown/SequenceEnzymeClass_ix.psql @@ -0,0 +1,7 @@ + + CREATE INDEX SequenceEnzymeClass_idx1 ON :SCHEMA.SequenceEnzymeClass (group_name, ec_number, description) + ; + + CREATE INDEX SequenceEnzymeClass_idx2 ON :SCHEMA.SequenceEnzymeClass (full_id, ec_number, description) + ; + diff --git a/Model/lib/psql/webready/unknown/TypeAheadCounts.psql b/Model/lib/psql/webready/unknown/TypeAheadCounts.psql new file mode 100644 index 000000000..921efa9bd --- /dev/null +++ b/Model/lib/psql/webready/unknown/TypeAheadCounts.psql @@ -0,0 +1,42 @@ + + +CREATE TABLE TypeAheadCounts AS + (SELECT DISTINCT dr.primary_identifier as option_id, + count(distinct aaf.aa_sequence_id) AS protein_count + FROM dots.DbRefAaFeature draf, sres.DbRef dr, dots.aafeature aaf, + sres.ExternalDatabaseRelease edr, sres.ExternalDatabase ed + WHERE draf.db_ref_id = dr.db_ref_id + AND dr.external_database_release_id + = edr.external_database_release_id + AND edr.external_database_id = ed.external_database_id + AND aaf.aa_feature_id = draf.aa_feature_id + GROUP BY dr.primary_identifier + ) + UNION + (SELECT DISTINCT ec.ec_number AS option_id, + COUNT(DISTINCT asec.aa_sequence_id) AS protein_count + FROM sres.enzymeClass ec, + dots.aaSequenceEnzymeClass asec + WHERE asec.enzyme_class_id = ec.enzyme_class_id + GROUP BY ec.ec_number) + UNION + (SELECT DISTINCT gt.source_id AS option_id, + COUNT(DISTINCT aaf.aa_sequence_id) AS protein_count + FROM dots.AaFeature aaf, + dots.GoAssociation ga, sres.OntologyTerm gt, + dots.GoAssociationInstance gai,core.TableInfo ti + WHERE aaf.aa_sequence_id = ga.row_id + AND ga.table_id = ti.table_id + AND ti.name = 'ExternalAASequence' + AND ga.go_term_id = gt.ontology_term_id + AND ga.go_association_id = gai.go_association_id + GROUP BY gt.source_id) + + ; + + + +CREATE INDEX TypeAheadCounts_idx ON TypeAheadCounts (option_id) + + ; + diff --git a/Model/lib/wdk/model/questions/params/geneParams.xml b/Model/lib/wdk/model/questions/params/geneParams.xml index 142a6d1b6..4635e8221 100644 --- a/Model/lib/wdk/model/questions/params/geneParams.xml +++ b/Model/lib/wdk/model/questions/params/geneParams.xml @@ -8274,16 +8274,16 @@ products of your selected type (or types).


SELECT three_letter_abbrev as term, name as internal, name as display - FROM apidb.orthomcltaxon + FROM apidb.orthomclclade WHERE three_letter_abbrev not in ('BACI') ORDER BY depth_first_index ASC, three_letter_abbrev ASC diff --git a/Model/lib/xml/tuningManager/apiTuningManager-pruned.xml b/Model/lib/xml/tuningManager/apiTuningManager-pruned.xml new file mode 100644 index 000000000..587655a7d --- /dev/null +++ b/Model/lib/xml/tuningManager/apiTuningManager-pruned.xml @@ -0,0 +1,3046 @@ + + + + + + + + + + + + + + + + + + + + + + Map each GO term that is assigned to at least one gene to a GoSubset term + that is either itself or an immediate ancestor. By "immediate ancestor" + we mean an ancestor such that there isn't an intermediate ancestor also + in the subset. (Note that there can be multiple links as long as none is + in the subset.) This is currently restricted to 'goslim_generic', solely + by the condition in the SUBSET_TERM subquery. + + + + + + + + + + + + + + + Each row maps a dataset onto an ID for which the dataset contains data; + each dataset gets one such row. + Used in dataset record queries. + + + + + + + + + + + + Stores per-organism information. Used by the organism record, as well + as by project_id(), the function that maps an organism to a project. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 0 then 1 else 0 end as hasCentromere + FROM DOTS.MISCELLANEOUS f + , sres.ontologyTerm ot + , dots.nasequence s + WHERE ot.ontology_term_id = f.sequence_ontology_id + AND ot.name='centromere' + AND f.na_sequence_id = s.na_sequence_id + GROUP BY s.taxon_id + ]]> + + + + + + + + + + + + = g.start_min + AND g.na_sequence_id = seq.na_sequence_id + AND t.name = 'ExternalNASequence' + ) gene + ON gene.source_id = sim.source_id AND gene.sequence_id = sim.sequence_source_id) + GROUP BY sim.taxon_id + ]]> + + + = 100 + AND s.is_best_alignment in (1) + AND s.percent_est_bases_aligned >= 20 + AND s.percent_identity >= 90 + AND e.best_alignment_count <= 1 + AND e.source_id = s.accession + GROUP by s.gene HAVING count(*) >= 1 + ) est ON ga.source_id = est.source_id + RIGHT OUTER JOIN ( + SELECT project_id, taxon_id, + max(database_version) as database_version, + CASE WHEN ncbi_tax_id > 9000000000 THEN NULL + ELSE ncbi_tax_id + END ncbi_tax_id, + to_char(sum(length)/1000000,'9999.99') as megabps + FROM GenomicSeqAttributes + WHERE is_top_level = 1 + GROUP BY project_ID, taxon_id, ncbi_tax_id + ) genomestat ON genomestat.taxon_id = ga.taxon_id + LEFT OUTER JOIN ( + SELECT count(distinct ga.source_id) as ct, ga.taxon_id + FROM GeneAttributes ga, SnpAttributes sf + WHERE sf.gene_source_id = ga.source_id + AND ga.is_deprecated = 0 + GROUP BY ga.taxon_id + ) snpCount ON ga.taxon_id = snpCount.taxon_id + GROUP BY genomestat.taxon_id, + genomestat.project_id, + genomestat.database_version, + genomestat.ncbi_tax_id, + genomestat.Megabps, + snpCount.ct + ]]> + + + 10000000 + -- then 'TMPTX_' || round(t.ncbi_tax_id / 10000000) || '_' || + -- mod(t.ncbi_tax_id, 10000000) -- e.g. "TMPTX_930_1" + -- then 'TMPTX_' || t.ncbi_tax_id -- all the many digits + then 'TMPTX_' || o.public_abbrev + else 'NCBITAXON_' || t.ncbi_tax_id + end as source_id, + o.abbrev as internal_abbrev, + o.public_abbrev, + o.orthomcl_abbrev, + o.family_name_for_files, + tn.name as organism_name, + o.genome_source, + o.strain_abbrev, + o.is_annotated_genome, + o.is_reference_strain, + o.is_family_representative, + o.name_for_filenames, + o.taxon_id as component_taxon_id, + gc.database_version, + gc.megabps as megabps, + gc.ncbi_tax_id as ncbi_tax_id, + gc.snpCount as snpCount, + gc.geneCount as geneCount, + gc.pseudoGeneCount as pseudoGeneCount, + gc.codingGeneCount as codingGeneCount, + gc.otherGeneCount as otherGeneCount, + gc.ChipChipGeneCount as ChipChipGeneCount, + gc.orthologCount as orthologCount, + gc.goCount as goCount, + gc.tfbsCount as tfbsCount, + gc.proteomicsCount as proteomicsCount, + gc.estCount as estCount, + gc.ecNumberCount as ecNumberCount, + cast(coalesce(dsc.Organellar_Has, 0) as NUMERIC(1)) as isOrganellar, + cast(coalesce(dsc.HTSIsolate_Has, 0) as NUMERIC(1)) as hasHTSIsolate, + cast(coalesce(dsc.Popset_Has, 0) as NUMERIC(1)) as hasPopset, + cast(coalesce(dsc.Epitope_Has, 0) as NUMERIC(1)) as hasEpitope, + cast(coalesce(dsc.Array_Has, 0) as NUMERIC(1)) as hasArray, + coalesce(oc.hasCentromere, 0) as hasCentromere, + coalesce(sc.contig_num, 0) as contigCount, + coalesce(sc.supercont_num, 0) as supercontigCount, + coalesce(sc.chrom_num, 0) as chromosomeCount, + coalesce(cc.communityCount, 0) as communityCount, + coalesce(psc.popsetCount, 0) as popsetCount, + coalesce(pc.geneArrayCount, 0) as arrayGeneCount, + coalesce(pc.rnaSeqCount, 0) as rnaSeqCount, + coalesce(pc.rtPCRCount, 0) as rtPCRCount, + coalesce(ta.avg_transcript_length, 0) as avg_transcript_length + FROM apidb.Organism o + INNER JOIN sres.TaxonName tn ON tn.taxon_id = o.taxon_id + INNER JOIN sres.Taxon t ON t.taxon_id = tn.taxon_id + LEFT JOIN DataSourceCount dsc ON o.taxon_id = dsc.taxon_id + LEFT JOIN OrganismCentromere oc ON o.taxon_id = oc.taxon_id + LEFT JOIN SequenceCount sc ON o.taxon_id = sc.taxon_id + LEFT JOIN CommunityCount cc ON o.taxon_id = cc.taxon_id + LEFT JOIN GeneCount gc ON o.taxon_id = gc.taxon_id + LEFT JOIN popsetCount psc ON o.taxon_id = psc.taxon_id + LEFT JOIN profileCount pc ON o.taxon_id = pc.taxon_id + LEFT JOIN ( + SELECT taxon_id, round(avg(length),1) as avg_transcript_length + FROM TranscriptAttributes + GROUP by taxon_id + ) ta ON o.taxon_id = ta.taxon_id + WHERE tn.name_class = 'scientific name' + ) oa, + TaxonSpecies ts, + sres.taxon t, + sres.taxonname tn2 + WHERE oa.component_taxon_id = ts.taxon_id + AND ts.species_taxon_id = t.taxon_id + AND ts.species_taxon_id = tn2.taxon_id + AND tn2.name_class = 'scientific name' + ]]> + + + + + + + + + + Stores, for each transcript, a string containing the gene-relative coordinates + of all its introns and UTRs. + + + + + + + + + + + + + + + + + + + + + Stores special webservice abbreviations which are not standard organism + names. Each record maps an organism name onto this abbreviation, as + well as the species name and project ID. Used by the model and as an + input in the creation of the OrganismAbbreviationBlast tuning table. + Propagated to portal instances. + + + + + + + + + + + + + Group species by higher level taxonomy. Each row associates a taxon of + interest with one of its ancestors in the taxon tree. Used in parameter + queries that have to know about the taxon tree. Propagated to portal + instances. + + + + + + + + + + + + + + + + + + + + + + + + + + + + Each record maps an organism to its BLAST abbreviation. Used by + BLAST-query parameters. Propagated to portal instances. + + + + + + + + + + + + + For each project, show which BLAST databases are available for which + species. Used in BLAST param queries. Propagated to portal instances. + + + + + + + + + + + + + + + Each row stores mass-spec. based expression evidence for one sample of + one experiment for one gene. Used for mass spec queries in the model, + GBrowse, and PBrowse, and also in the creation of the MSTranscriptSummary + tuning table. + + + + + + + + + + + + + + + + + + + + + + + + + + + + Stores summary information from annotated genomes to facilitate overview section of gene page + + + + + + + + + + + + + + + Mass-spec experiment results for a peptide. Used by the model, GBrowse, + and PBrowse. + + + + + + + + + + + + + + + + + + + + + + + + + Data from the Seattle Structural Genomics Center for Infectious Disease, + populated from their web service. Used in the gene record. + + + + + + + + Used by the model and GBrowse, as well as an input in the creation of + the tuning tables like MSModifiedPeptideSummary and MSPeptideSummary. + + + + + + + + Used by the model when writing profile data + + + + + + + + Associates an organism with the GBrowse and PBrowse tracks available + for it. Used by the gene record. + + + + + + + + + + Each row maps a dataset onto an ID for which the dataset contains data; + each dataset gets one such row. + Used in dataset record queries. + + + + + + + + + + + + + + + + + Citation info for proteomics datasets, used by GBrowse + + + + ' || sample || '

' as sample_i + FROM MSPeptideSummary mps, DatasetPresenter ds + -- consider using the tuning table ExternalDbDatasetPresenter instead of the LIKE below, if its performance is a problem + WHERE (ds.name = mps.external_database_name or mps.external_database_name like ds.dataset_name_pattern) + ) t + group by name, id + ) + SELECT name, + substr(description, 4000, 1) || ' Primary Contact Email: '|| coalesce(email, 'unavailable') + || ' PMID: ' || publications || '

Samples:

' + || sample_table || chr(10) || + ' Please note that subtrack labels will disappear if the selected subtracks number is over 15!' as citation + FROM ( + SELECT ds.name as name, ds.summary as description, pubs.contact_email as email, + pubs.pmids as publications, samples.sample_table as sample_table + FROM DatasetPresenter ds, pubs, samples + WHERE ds.dataset_presenter_id = pubs.id + AND ds.dataset_presenter_id = samples.id + ) t + ]]> +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + = commit_after THEN + COMMIT; + ctrows := 0; + END IF; + END LOOP; + commit; + END; + $$ LANGUAGE PLPGSQL; + ]]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + For all datasets, list all genes (source_id) of a gene_group where one of them (profile_graph_id) + has data for a profile_set. + + + + + + + + + + + + + + + for gene-page expression graphs + + + + + + + + + + + + + + + + + + + + + Data from STRING-DB.org, populated from their web service. + Used in the gene record. + + + + + + + + + annotation updates from Apollo + + + + + + + + + + + Stores a mapping between external databases, taxon IDs, and URLs + + + + + + + + + + + + + + + + + + + + + + + + + + + + Text from ApolloUpdate that can be used in site search to find genes + + + + + = au.mapping_start + AND ta.source_id = au.apolloTranscript + AND ga.strand_plus_minus = au.strand + AND ta.gene_source_id = ga.source_id + ]]> + + + + + + + + + + Text for PreferredProduct table on gene record page. + + + + + + + + + + + + + + + + + + + + + all products for each gene + + + + + + + + + + + + + = 3 -- no product in apidb.GeneFeatureProduct + union + select ta.gene_source_id AS source_id, t.product, ta.project_id, + null as is_alternate, + string_agg(ta.source_id, ', ' order by ta.source_id) as transcript_ids, + null as reference, null as evidence_code, null as evidence_code_parameter, + null as assigned_by, 'dots.Transcript' as source + from TranscriptAttributes ta, dots.Transcript t + where ta.na_feature_id = t.na_feature_id + and t.product is not null + and ta.gene_source_id + not in (select source_id + from GeneProduct + where source_rule < 6) -- product only in dots.Transcript + group by ta.gene_source_id, ta.project_id, t.product + union + select gp.source_id as source_id, gp.product, ga.project_id, + null as is_alternate, + null as transcript_ids, null as reference, null as evidence_code, + null as evidence_code_parameter, null as assigned_by, + 'unspecified product' as source + from GeneProduct gp, GeneAttributes ga + where gp.source_rule = 7 + and gp.source_id = ga.source_id + order by is_alternate desc, transcript_ids + ]]> + + + + + + + +
diff --git a/Model/lib/xml/tuningManager/apiTuningManager.xml b/Model/lib/xml/tuningManager/apiTuningManager.xml index 7e341f4a5..cd99b2a4d 100644 --- a/Model/lib/xml/tuningManager/apiTuningManager.xml +++ b/Model/lib/xml/tuningManager/apiTuningManager.xml @@ -1031,7 +1031,7 @@ ) t GROUP BY taxon_id ]]> -
+ - + + - @@ -5893,7 +5893,6 @@ delete from &prefixGeneProduct&1 where product is null Aggregates reactions irrespective of pathway. Required to determine if BioCyc reactions are reversible. Used extensively in the model in conjunction with pathwaycompounds for pathway related queries - @@ -5901,11 +5900,50 @@ delete from &prefixGeneProduct&1 where product is null - - ' || o.enzyme || '' ELSE o.enzyme END as expasy_html + FROM ( + SELECT i.* + , CASE WHEN i.enzyme like '%.%.%.%' and i.enzyme != '-.-.-.-' + THEN + 'http://enzyme.expasy.org/cgi-bin/enzyme/enzyme-search-ec?field1=' + || ec.ec_number_1 + || CASE ec.ec_number_2 WHEN null THEN null ELSE chr(38) || 'field2=' || ec.ec_number_2 END + || CASE ec.ec_number_3 WHEN null THEN null ELSE chr(38) || 'field3=' || ec.ec_number_3 END + || CASE ec.ec_number_4 WHEN null THEN null ELSE chr(38) || 'field4=' || ec.ec_number_4 END + ELSE reaction_url END as expasy_url + , ec.description as enzyme_description + FROM ( + SELECT + reaction_id + , reaction_source_id + , reaction_url + , ext_db_name + , ext_db_version + , enzyme + , substrates_html || ' ' || sign || ' ' || products_html as equation_html + , substrates_text || ' ' || sign || ' ' || products_text as equation_text + , case when sign = '<=>' then 1 else 0 end as is_reversible + , substrates_text + , products_text + FROM ( + SELECT + reaction_id + , reaction_source_id + , reaction_url + , ext_db_name + , ext_db_version + , enzyme + , (case when (string_agg (case when type_list like '%substrate%' then compound end, ',' order by compound)) = (string_agg (case when type_list like '%product%' then compound end, ',' order by compound)) or is_reversible = 1 then '<=>' else '=>' end) as sign + , string_agg(case when type like '%substrate%' then compound_url end, ' + ' order by compound_url) as substrates_html + , string_agg(case when type like '%substrate%' then compound end, ' + ' order by compound) as substrates_text + , string_agg(case when type like '%product%' then compound_url end, ' + ' order by compound_url) as products_html + , string_agg(case when type like '%product%' then compound end, ' + ' order by compound) as products_text + FROM ( + WITH rep AS ( SELECT DISTINCT pr.PATHWAY_REACTION_ID as reaction_id , pr.SOURCE_ID as reaction_source_id @@ -5931,19 +5969,15 @@ delete from &prefixGeneProduct&1 where product is null AND ot.ONTOLOGY_TERM_ID = pn.PATHWAY_NODE_TYPE_ID AND pc.PATHWAY_ID = p.PATHWAY_ID AND pc.REACTION_id = pr.PATHWAY_REACTION_ID - ]]> - - - - - - ' || o.enzyme || '' ELSE o.enzyme END as expasy_html - FROM ( - SELECT i.* - , CASE WHEN i.enzyme like '%.%.%.%' and i.enzyme != '-.-.-.-' - THEN - 'http://enzyme.expasy.org/cgi-bin/enzyme/enzyme-search-ec?field1=' - || ec.ec_number_1 - || CASE ec.ec_number_2 WHEN null THEN null ELSE chr(38) || 'field2=' || ec.ec_number_2 END - || CASE ec.ec_number_3 WHEN null THEN null ELSE chr(38) || 'field3=' || ec.ec_number_3 END - || CASE ec.ec_number_4 WHEN null THEN null ELSE chr(38) || 'field4=' || ec.ec_number_4 END - ELSE reaction_url END as expasy_url - , ec.description as enzyme_description - FROM ( - SELECT - reaction_id - , reaction_source_id - , reaction_url - , ext_db_name - , ext_db_version - , enzyme - , substrates_html || ' ' || sign || ' ' || products_html as equation_html - , substrates_text || ' ' || sign || ' ' || products_text as equation_text - , case when sign = '<=>' then 1 else 0 end as is_reversible - , substrates_text - , products_text - FROM ( - SELECT - reaction_id - , reaction_source_id - , reaction_url - , ext_db_name - , ext_db_version - , enzyme - , (case when (string_agg (case when type_list like '%substrate%' then compound end, ',' order by compound)) = (string_agg (case when type_list like '%product%' then compound end, ',' order by compound)) or is_reversible = 1 then '<=>' else '=>' end) as sign - , string_agg(case when type like '%substrate%' then compound_url end, ' + ' order by compound_url) as substrates_html - , string_agg(case when type like '%substrate%' then compound end, ' + ' order by compound) as substrates_text - , string_agg(case when type like '%product%' then compound_url end, ' + ' order by compound_url) as products_html - , string_agg(case when type like '%product%' then compound end, ' + ' order by compound) as products_text - FROM PR_t1 as t1 + ) t1 GROUP BY reaction_id, reaction_source_id, reaction_url, ext_db_name, ext_db_version, enzyme, is_reversible ) t2 ) i @@ -6972,20 +6962,20 @@ delete from &prefixGeneProduct&1 where product is null @@ -8821,7 +8811,6 @@ delete from &prefixGeneProduct&1 where product is null - + diff --git a/Model/lib/xml/tuningManager/tablePruning.txt b/Model/lib/xml/tuningManager/tablePruning.txt index c96690117..11012b11d 100644 --- a/Model/lib/xml/tuningManager/tablePruning.txt +++ b/Model/lib/xml/tuningManager/tablePruning.txt @@ -1,23 +1,23 @@ -MG -?? +K FIXFIX +MO ?? ?? -?? -?? -?? +K +K +R ?? -?? -?? -?? -?? -?? +K +MC Should rename this "ProteinSequenceGroup" +R +K Need to confirm with Rich but this should be handled now with the new interpro table (or tt) +MO ?? -?? -?? -?? -?? -?? -?? +MO +K +MO (Pretty sure this is MO... but we do have profiles for compounds (not genes). they should be handled different) +K +K +MC MO MO MO (could be renamed to LineageForSynteny) @@ -33,9 +33,9 @@ MO MO K MO (need BOTH org specific version and global -- WHY???) -MO +K MO -MO (Comment column needs to be made into a dedicated attribute query) +K (Comment column needs to be made into a dedicated attribute query) MO MO MO @@ -56,7 +56,7 @@ R R R MO (replace study table with nodeset) -MO (replace study table with nodeset) +MO FIX (replace study table with nodeset) MO R (never used but possibly should add back?) R @@ -82,7 +82,7 @@ K R R K -MO +K K MG (tuning from non gus tables) MG (tuning from non gus tables) @@ -107,17 +107,17 @@ K MO K K -MO (need to look at this) -MO (need to look at this) -MO (need to look at this) +K (need to look at this) +K (need to look at this) +K (need to look at this) K MO MO R MO R -?? (could be put into the orthomcl graph. Rich and John should look) -?? (similar to OrthologousTranscripts) +MC (could be put into the orthomcl graph. Rich and John should look) +K (similar to OrthologousTranscripts) R R MO (PANIO will need to exist) @@ -127,10 +127,10 @@ R (transcript attributes query needs to use n K MO K -MO (rm auto_lob; don't need to loop over chunks in postgres) -?? +MO FIX (rm auto_lob; don't need to loop over chunks in postgres) +MC K K -?? (can we do project specific alphafold? or, put this in after alphafold, whereever it goes) +MC (alpha fold is cross project) MO K diff --git a/Model/lib/xml/tuningManager/webtables.org b/Model/lib/xml/tuningManager/webtables.org new file mode 100644 index 000000000..2a4be5495 --- /dev/null +++ b/Model/lib/xml/tuningManager/webtables.org @@ -0,0 +1,178 @@ +#+title: Webtables + + +* MO Tables +- Organism + - [X] OrganismAbbreviationBlast_ix + - [X] OrganismAbbreviationBlast + - move to KEEP + - [X] OrganismAbbreviation_ix + - [X] OrganismAbbreviation + - updated the abbreviation field to name_for_filenames + - [X] OrganismSelectTaxonRank_ix + - [X] OrganismSelectTaxonRank + - [X] Taxonomy_ix + - [X] Taxonomy + - [X] TaxonSpecies_ix + - [X] TaxonSpecies +- Genomic Sequence + - [X] GenomicSequenceId_ix + - [X] GenomicSequenceId + - [X] GenomicSequenceSequence_ix + - [X] GenomicSequenceSequence + - [X] SequencePieceClosure + - [X] GenomicSeqAttributes + - [s] SequenceEnzymeClass + - Temp remove this and eventually Move to ComparativeGenomics because it depends on the OrthoMCL Derived EC Numbers + +- Transcript / Protein + - [X] SignalPeptideDomains_ix + - [X] SignalPeptideDomains + - [X] TransmembraneDomains_ix + - [X] TransmembraneDomains + - [X] PdbSimilarity_ix + - [X] PdbSimilarity + - [X] ProteinSequence_ix + - [X] ProteinSequence + - [ ] ProteinAttributes_ix + - [ ] Remove Derived EC Numbers + - [ ] New Tuning table(s) for EC Derived ECs per protein (gene and transcript) + - [X] ProteinAttributes + - [ ] TranscriptAttributes_ix + - [ ] TranscriptAttributes + - no longer has products column because that is done by TM + - no longer has derived ec numbers (move to comparative genomics) + - [X] CodingSequence_ix + - [X] CodingSequence + - [X] IntronUtrCoords_ix + - [X] IntronUtrCoords + - [X] TranscriptCenDistance_ix + - [X] TranscriptCenDistance + - [ ] TranscriptPathway_ix + - [ ] TranscriptPathway + - This may need to move to comparative genomics because we need the OrthoDerived EC mappings + - [X] TranscriptSequence_ix + - [X] TranscriptSequence + - [X] ChIPchipTranscript_ix + - [X] ChIPchipTranscript + +- Gene + - [X] GeneId_ix + - [X] GeneId + - [X] GeneAttributes_ix + - [X] GeneAttributes + - remove gene_product and remove orthomclname + - [X] GeneCopyNumbers_ix + - [X] GeneCopyNumbers + - [X] GeneGoTable_ix + - [X] GeneGoTable + - [X] GeneGoTerms_ix + - [X] GeneGoTerms + - [X] GeneLocations_ix + - [X] GeneLocations + - [X] GeneModelDump_ix + - [X] GeneModelDump + - [X] GeneSummaryFilter_ix + - [X] GeneSummaryFilter + - [X] TFBSGene_ix + - [X] TFBSGene + - removed aef.* + - [ ] PathwayNodeGene_ix + - This may need to move to comparative genomics because we need the OrthoDerived EC mappings + - [ ] PathwayNodeGene + - This may need to move to comparative genomics because we need the OrthoDerived EC mappings + - [ ] PathwaysGeneTable_ix + - This may need to move to comparative genomics because we need the OrthoDerived EC mappings + - [ ] PathwaysGeneTable + - This may need to move to comparative genomics because we need the OrthoDerived EC mappings + - [X] GoTermSummary_ix + - [X] GoTermSummary + - [X] EqtlSpan_ix + - [X] EqtlSpan + +- EST + - [X] EstAttributes_ix + - move to comparative + - join to apidb.organism and filter by "is_reference_strain" + - [X] EstAttributes + - move to comparative + - join to apidb.organism and filter by "is_reference_strain" + - [X] EstSequence_ix + - move to comparative + - join to apidb.organism and filter by "is_reference_strain" + - [X] EstSequence + - move to comparative + - join to apidb.organism and filter by "is_reference_strain" + - [X] EstAlignmentGeneSummary_ix + - [X] EstAlignmentGeneSummary + +- Dataset / Other + - [X] DatasetExampleSourceId_ix + - [X] DatasetExampleSourceId + - NOTE: this depends on Profiles + - [X] PANExtDBRls + - [X] PANIO + - [X] PANIO_ix + + - [ ] +ProfileType_ix+ + - [ ] +ProfileType+ + - [ ] +Profile_ix+ + - [ ] +Profile+ + - [ ] +ProfileSamples_ix+ + - [ ] +ProfileSamples+ + + - [X] RnaSeqStats_ix + - [X] RnaSeqStats + - [X] OrganismAttributes_ix + - [X] OrganismAttributes + - removed ESTs and SNPs + - [X] ChrCopyNumbers_ix + - [X] ChrCopyNumbers + +- Junctions (Kathryn) + - [ ] IntronSupportLevel_ix + - [ ] IntronSupportLevel + - [ ] GeneIntJuncStats_ix + - [ ] GeneIntJuncStats + - [ ] GeneIntronJunction_ix + - [ ] GeneIntronJunction + - [ ] NameMappingGIJ_ix + - [ ] NameMappingGIJ + - needs to be in TM, depends on dataset presenters + - [ ] GeneMaxIntronGIJ_ix + - should be aux table and dropped + - [ ] GeneMaxIntronGIJ + - should be aux table and dropped + +- Comparative + - [ ] ProteinGroup (RENMAED from SEQUENCEATTRIBUTES) + - add orthomcl derived ec numbers here + - add gene_id + - add transcript_id + - ENSURE this has a row for every protein! + - [ ] ProteinGroupDomainAssignment (RENAMED from DOMAINASSIGNMENT) + + - [ ] GroupDomainDescriptions (RENAMED from GROUPDOMAINATTRIBUTE ) + - [ ] proteinGroupEnyzmeClass (renamed from sequenceenzymeclass) + - [ ] AlphaFoldGenes + - [ ] GroupPhylogeneticProfile + - [ ] OrthologousTranscripts + - [ ] PhyleticPattern + - no longer uses dots.sequencegroup and dots.sequencesequencegroup + +- Global + - [X] CompoundAttributes + - [X] CompoundId + - [X] CompoundProperties + - [ ] CompoundTypeAheads + - double check again after database is rebuilt. may be ok + - [X] OntologyLevels + - [X] PathwayAttributes + - [X] PathwayCompounds + - [X] PathwayNodes + - [X] PathwayReactions + +- KEEP Tuning table + - GeneProduct (CHECK) + - TranscriptProduct (NEW) + - ProteinProduct (NEW??) diff --git a/convert2webready b/convert2webready new file mode 100755 index 000000000..d705233c9 --- /dev/null +++ b/convert2webready @@ -0,0 +1,82 @@ +#!/usr/bin/perl + +# grep -irl apidbtuning | ~/sourceCode/website/ApiCommonModel/convert2wr ~/sourceCode/website/ApiCommonModel/Model ~/sourceCode/ApiCommonModel/Model + +use strict; +my @tables = ( +'CompoundAttributes', +'CompoundId', +'CompoundProperties', +'CompoundTypeAheads', +'OntologyLevels', +'PathwayAttributes', +'PathwayCompounds', +'PathwayNodes', +'PathwayReactions', +'ChIPchipTranscript', +'ChrCopyNumbers', +'CodingSequence', +'EqtlSpan', +'EstAlignmentGeneSummary', +'EstAttributes', +'EstSequence', +'GeneAttributes', +'GeneCopyNumbers', +'GeneGoTable', +'GeneGoTerms', +'GeneId', +#'GeneIntJuncStats', +#'GeneIntronJunction', +'GeneLocations', +#'GeneMaxIntronGIJ', +'GeneModelDump', +'GeneSummaryFilter', +'GenomicSeqAttributes', +'GenomicSequenceId', +'GenomicSequenceSequence', +'GoTermSummary', +#'IntronSupportLevel', +#'IntronUtrCoords', +'OrganismAbbreviation', +'OrganismSelectTaxonRank', +'PANExtDbRls', +'PANIO', +'PathwayNodeGene', +'PathwaysGeneTable', +'PdbSimilarity', +'ProteinAttributes', +'ProteinSequence', +'RnaSeqStats', +'SequencePieceClosure', +'SignalPeptideDomains', +'Taxonomy', +'TaxonSpecies', +'TFBSGene', +'TranscriptAttributes', +'TranscriptCenDistance', +'TranscriptPathway', +'TranscriptSequence', +'TransmembraneDomains' +); + +my ($sourceModelDir, $targetModelDir) = @ARGV; + +while() { + chomp; + my $filenm = $_; + print STDERR "processing $filenm\n"; + my $filetext = do { + local $/ = undef; + open my $fh, "<", "$sourceModelDir/$filenm" + or die "could not open '$sourceModelDir/$filenm': $!"; + <$fh>; + }; + + foreach my $table (@tables) { + $filetext =~ s/apidbtuning.$table/webready.$table/gi; + } + + open(FH, '>', "$targetModelDir/$filenm") or die $!; + print FH $filetext; + close FH; +}