VEuPathDB · steve-fischer-200 · May 9, 2025 · May 9, 2025 · May 9, 2025 · May 9, 2025
diff --git a/Model/lib/psql/webready/comparative/AlphaFoldGenes.psql b/Model/lib/psql/webready/comparative/AlphaFoldGenes.psql
@@ -0,0 +1,97 @@
+        -- Scratch table of candidate UniProt cross-references per gene.
+        -- rank orders the evidence sources (lower is preferred) and hit_length is the
+        -- residue span of the matching apidb.AlphaFold row (NULL when nothing matches).
+        DROP TABLE IF EXISTS :SCHEMA.uniprotgenes;
+
+        CREATE UNLOGGED TABLE :SCHEMA.uniprotGenes AS
+        -- Branch 1: cross-references attached to protein (AA) features
+        SELECT DISTINCT
+            ed.name
+          , d.*
+          , edr.version
+          , aa.source_id
+          , pa.gene_source_id
+          , CASE
+              WHEN ed.name LIKE '%SWISSPROT%' AND edr.version = 'xrefuniparc' THEN 1
+              WHEN ed.name LIKE '%SPTREMBL%' AND edr.version = 'xrefuniparc' THEN 2
+              WHEN ed.name LIKE '%SWISSPROT%' AND edr.version = 'xref_sprot_blastp' THEN 4
+              WHEN ed.name LIKE '%SPTREMBL%' AND edr.version = 'xref_trembl_blastp' THEN 5
+              ELSE 6
+            END AS rank
+          , (af.last_residue_index - af.first_residue_index + 1) AS hit_length
+        FROM sres.dbref d
+          INNER JOIN sres.externaldatabaserelease edr
+            ON edr.external_database_release_id = d.external_database_release_id
+          INNER JOIN sres.externaldatabase ed
+            ON ed.external_database_id = edr.external_database_id
+          INNER JOIN dots.dbrefaafeature db
+            ON db.db_ref_id = d.db_ref_id
+          INNER JOIN dots.aafeature aa
+            ON aa.aa_feature_id = db.aa_feature_id
+          INNER JOIN :SCHEMA.ProteinAttributes pa
+            ON pa.source_id = aa.source_id
+          -- outer join so that cross-references without an AlphaFold entry are kept
+          LEFT JOIN apidb.AlphaFold af
+            ON af.uniprot_id = d.primary_identifier
+        WHERE ed.name IN ('Uniprot/SWISSPROT', 'Uniprot/SPTREMBL')
+          AND edr.version IN ('xrefuniparc', 'xref_sprot_blastp', 'xref_trembl_blastp')
+        -- UNION (not UNION ALL) also removes duplicates across the two branches
+        UNION
+        -- Branch 2: cross-references attached to NA features, always given rank 3
+        SELECT DISTINCT
+            ed.name
+          , d.*
+          , edr.version
+          , na.source_id
+          , ta.gene_source_id
+          , 3 AS rank
+          , (af.last_residue_index - af.first_residue_index + 1) AS hit_length
+        FROM sres.dbref d
+          INNER JOIN sres.externaldatabaserelease edr
+            ON edr.external_database_release_id = d.external_database_release_id
+          INNER JOIN sres.externaldatabase ed
+            ON ed.external_database_id = edr.external_database_id
+          INNER JOIN dots.dbrefnafeature db
+            ON db.db_ref_id = d.db_ref_id
+          INNER JOIN dots.nafeature na
+            ON na.na_feature_id = db.na_feature_id
+          -- the NA feature may be either the transcript or the gene itself
+          INNER JOIN :SCHEMA.TranscriptAttributes ta
+            ON (ta.transcript_source_id = na.source_id OR ta.gene_source_id = na.source_id)
+          LEFT JOIN apidb.AlphaFold af
+            ON af.uniprot_id = d.primary_identifier
+        WHERE ed.name LIKE '%_dbxref_%niprot_%RSRC'
+        ;
+
+        -- Scratch table holding, for each gene, the best (lowest) rank among the
+        -- candidates that actually have an AlphaFold hit.
+        DROP TABLE IF EXISTS :SCHEMA.minrank;
+
+        CREATE UNLOGGED TABLE :SCHEMA.minRank AS
+        SELECT
+            candidates.gene_source_id
+          , MIN(candidates.rank) AS min_rank
+        FROM :SCHEMA.uniprotGenes candidates
+        WHERE candidates.hit_length IS NOT NULL
+        GROUP BY candidates.gene_source_id
+        ;
+
+        -- Scratch table choosing one UniProt accession per gene: among the candidates
+        -- at the best rank for that gene, take the one whose AlphaFold hit spans the
+        -- most residues.
+        DROP TABLE IF EXISTS :SCHEMA.alphafoldhits;
+
+        CREATE UNLOGGED TABLE :SCHEMA.alphaFoldHits AS (
+          SELECT DISTINCT best.gene_source_id
+            -- the frame covers the whole partition, so last_value is the longest hit.
+            -- primary_identifier breaks ties so that reruns pick the same accession
+            , last_value(best.primary_identifier) OVER (
+                PARTITION BY best.gene_source_id
+                ORDER BY best.hit_length ASC, best.primary_identifier ASC
+                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
+              ) AS uniprot_id
+          FROM (
+            SELECT upg.gene_source_id
+              , upg.primary_identifier
+              , upg.hit_length
+            FROM :SCHEMA.uniprotGenes upg
+              INNER JOIN :SCHEMA.minRank mr
+                ON mr.gene_source_id = upg.gene_source_id
+               AND mr.min_rank = upg.rank
+            -- Candidates without an AlphaFold match have a NULL hit_length. PostgreSQL
+            -- sorts NULLs last in ascending order, so unless they are removed here
+            -- last_value would prefer them over every real hit and the gene would
+            -- then drop out of the join that builds AlphaFoldGenes.
+            WHERE upg.hit_length IS NOT NULL
+          ) best
+        )
+        ;
+
+        -- Final table: the apidb.alphafold rows for the accession selected for each gene
+        CREATE TABLE :SCHEMA.AlphaFoldGenes AS
+        SELECT
+            hit.gene_source_id
+          , model.uniprot_id
+          , model.source_id AS alphafold_id
+          , model.alphafold_version
+          , model.first_residue_index
+          , model.last_residue_index
+        FROM :SCHEMA.alphaFoldHits hit
+          INNER JOIN apidb.alphafold model
+            ON model.uniprot_id = hit.uniprot_id
+        ;
+
+        -- Remove the scratch tables now that AlphaFoldGenes has been built
+        DROP TABLE IF EXISTS :SCHEMA.uniprotgenes;
+        DROP TABLE IF EXISTS :SCHEMA.minrank;
+        DROP TABLE IF EXISTS :SCHEMA.alphafoldhits;
+
diff --git a/Model/lib/psql/webready/comparative/AlphaFoldGenes_ix.psql b/Model/lib/psql/webready/comparative/AlphaFoldGenes_ix.psql
@@ -0,0 +1,3 @@
+        -- Composite index on AlphaFoldGenes keyed by gene first, then UniProt accession
+        CREATE INDEX AlphaFoldGenes_idx
+            ON :SCHEMA.AlphaFoldGenes (gene_source_id, uniprot_id)
+        ;
diff --git a/Model/lib/psql/webready/comparative/GroupDomainDescriptions.psql b/Model/lib/psql/webready/comparative/GroupDomainDescriptions.psql
@@ -0,0 +1,21 @@
+-- Summarises the most common protein domains of each ortholog group.
+-- descriptions is a comma separated list of entries of the form
+-- accession (protein count), most frequent first. Domains are ranked per group by
+-- their distinct protein count and ranks 1 to 3 are kept. Ties share a rank, so a
+-- group can list more than three domains.
+-- Every derived relation is named, because PostgreSQL releases before 16 reject an
+-- unaliased subquery in FROM.
+CREATE TABLE :SCHEMA.GroupDomainDescriptions AS
+WITH domain_counts AS (
+    -- number of distinct proteins carrying each domain within each group
+    SELECT group_name
+         , accession
+         , COUNT(DISTINCT full_id) AS num_proteins
+    FROM :SCHEMA.ProteinDomainAssignment
+    GROUP BY group_name, accession
+),
+ranked_domains AS (
+    SELECT group_name
+         , accession
+         , num_proteins
+         , RANK() OVER (PARTITION BY group_name ORDER BY num_proteins DESC) AS rnk
+    FROM domain_counts
+),
+top_domains AS (
+    -- ordering inside the aggregate makes the text reproducible between runs
+    SELECT group_name
+         , STRING_AGG(accession ||' (' || num_proteins|| ')', ', '
+                      ORDER BY num_proteins DESC, accession) AS descriptions
+    FROM ranked_domains
+    WHERE rnk <= 3
+    GROUP BY group_name
+)
+SELECT og.group_id AS group_name
+     , td.descriptions
+FROM apidb.OrthologGroup og
+  INNER JOIN top_domains td
+    ON td.group_name = og.group_id
+;
diff --git a/Model/lib/psql/webready/comparative/GroupDomainDescriptions_ix.psql b/Model/lib/psql/webready/comparative/GroupDomainDescriptions_ix.psql
@@ -0,0 +1,2 @@
+-- Index on group_name, the single key column of GroupDomainDescriptions
+CREATE INDEX GroupDomainAttribute_idx
+    ON :SCHEMA.GroupDomainDescriptions (group_name)
+;
diff --git a/Model/lib/psql/webready/comparative/LoadOrthologTables.psql b/Model/lib/psql/webready/comparative/LoadOrthologTables.psql
@@ -0,0 +1,29 @@
+-- Empty both target tables before they are reloaded
+TRUNCATE TABLE :SCHEMA.GeneOrthologGroup;
+TRUNCATE TABLE :SCHEMA.TranscriptOrthologGroup;
+
+-- Pairs each gene with the ortholog group of its protein sequence, matched on
+-- aa_sequence_id.
+INSERT INTO :SCHEMA.GeneOrthologGroup (gene_id, group_id, project_id, org_abbrev, modification_date)
+SELECT ga.gene_source_id AS gene_id
+     , ogas.group_id
+     , ga.project_id
+     , ga.org_abbrev
+     -- A bare timestamp is parsed as a column reference, not as the current time.
+     -- The load time is stamped explicitly, as the other webready loaders do.
+     , current_timestamp AS modification_date
+FROM :SCHEMA.geneattributes ga
+  INNER JOIN apidb.orthologgroupaasequence ogas
+    ON ogas.aa_sequence_id = ga.aa_sequence_id
+;
+
+
+
+-- Pairs each transcript (and its gene) with the ortholog group of its protein
+-- sequence, matched on aa_sequence_id.
+INSERT INTO :SCHEMA.TranscriptOrthologGroup (source_id, gene_id, group_id, project_id, org_abbrev, modification_date)
+SELECT ta.source_id AS source_id
+     , ta.gene_source_id AS gene_id
+     , ogas.group_id
+     , ta.project_id
+     , ta.org_abbrev
+     -- A bare timestamp is parsed as a column reference, not as the current time.
+     -- The load time is stamped explicitly, as the other webready loaders do.
+     , current_timestamp AS modification_date
+FROM :SCHEMA.transcriptattributes ta
+  INNER JOIN apidb.orthologgroupaasequence ogas
+    ON ogas.aa_sequence_id = ta.aa_sequence_id
+;
+
+
+
+
diff --git a/Model/lib/psql/webready/comparative/LoadPathwaysGeneTable.psql b/Model/lib/psql/webready/comparative/LoadPathwaysGeneTable.psql
@@ -0,0 +1,58 @@
+/* ATTENTION: This script is run using a custom workflow step class */
+/* This accommodates the requirement to retain an empty table on undo */
+
+-- Start from an empty table, then reload it one organism at a time
+TRUNCATE TABLE :SCHEMA.PathwaysGeneTable;
+
+DO $$
+    DECLARE org record:PLPGSQL_DELIM
+    BEGIN
+        -- one INSERT and one COMMIT per organism abbreviation
+        FOR org IN (SELECT DISTINCT abbrev FROM apidb.organism)
+        LOOP
+            INSERT INTO :SCHEMA.PathwaysGeneTable (
+                -- there is no explicit column list on the INSERT, so the column order
+                -- of per_gene followed by modification_date must not change
+                SELECT per_gene.*, current_timestamp AS modification_date
+                FROM (
+                    -- collapse to one row per gene, pathway and enzyme, counting reactions
+                    SELECT DISTINCT
+                        gene_source_id
+                      , pathway_source_id
+                      , pathway_name
+                      , count(reaction_source_id) AS reactions
+                      , enzyme
+                      , expasy_url
+                      , pathway_source
+                      , exact_match
+                      , project_id
+                      , org_abbrev
+                    FROM (
+                        -- one row per gene, pathway and reaction, with exact_match taken
+                        -- as the maximum flag over the matching transcript rows
+                        SELECT DISTINCT
+                            tp.gene_source_id
+                          , tp.project_id
+                          , tp.pathway_source_id
+                          , tp.pathway_name
+                          , tp.org_abbrev
+                          , pr.reaction_source_id
+                          , pr.enzyme
+                          , pr.expasy_url
+                          , tp.pathway_source
+                          , CASE MAX(tp.exact_match) WHEN 1 THEN 'Yes' WHEN 0 THEN 'No' END AS exact_match
+                        FROM :SCHEMA.TranscriptPathway tp
+                          INNER JOIN apidbtuning.PathwayAttributes pa
+                            ON pa.pathway_id = tp.pathway_id
+                          INNER JOIN apidbtuning.PathwayCompounds pc
+                            ON pc.pathway_id = pa.pathway_id
+                          INNER JOIN apidbtuning.PathwayReactions pr
+                            ON pr.reaction_id = pc.reaction_id
+                           AND pr.ext_db_name = pc.ext_db_name
+                           AND pr.enzyme = tp.ec_number_pathway
+                        WHERE tp.wildcard_count_gene <= tp.wildcard_count_pathway
+                          AND pr.enzyme != '-.-.-.-'
+                          AND tp.org_abbrev = org.abbrev
+                        GROUP BY tp.gene_source_id, tp.project_id, tp.org_abbrev, tp.pathway_name, tp.pathway_source_id, pr.reaction_source_id, pr.enzyme, pr.expasy_url, tp.pathway_source
+                        ) per_reaction
+                    GROUP BY gene_source_id, project_id, org_abbrev, pathway_source_id, pathway_name, enzyme, expasy_url, pathway_source, exact_match
+                    ) per_gene
+                ORDER BY pathway_source, lower(pathway_name)
+                ):PLPGSQL_DELIM
+            COMMIT:PLPGSQL_DELIM
+        END LOOP:PLPGSQL_DELIM
+    END:PLPGSQL_DELIM
+$$ LANGUAGE PLPGSQL;
+
diff --git a/Model/lib/psql/webready/comparative/LoadTranscriptPathway.psql b/Model/lib/psql/webready/comparative/LoadTranscriptPathway.psql
@@ -0,0 +1,174 @@
+/* ATTENTION: This script is run using a custom workflow step class */
+/* This accommodates the requirement to retain empty TranscriptEC and TranscriptPathway tables on undo */
+
+/* STEP 1: Make sure temp tables have been dropped before starting */
+
+-- Drop any temp tables left behind by an earlier run
+DROP TABLE IF EXISTS :SCHEMA.TranscriptEcUniverse;
+DROP TABLE IF EXISTS :SCHEMA.PathwayEcUniverse;
+DROP TABLE IF EXISTS :SCHEMA.TranscriptPathwayEcMatch;
+
+-- Both target tables are emptied here as a precaution before reloading
+TRUNCATE TABLE :SCHEMA.TranscriptPathway;
+TRUNCATE TABLE :SCHEMA.TranscriptEC;
+
+
+/* STEP 2: Load into the TranscriptEC table. This must be done here so that we capture ortho-derived EC numbers*/
+
+/* ATTENTION: this step loads into an empty partitioned table created in the orgSpecific graph */
+
+DO $$
+    DECLARE org RECORD:PLPGSQL_DELIM
+    BEGIN
+        -- one INSERT and one COMMIT per organism
+        FOR org IN (SELECT DISTINCT taxon_id, abbrev from apidb.organism)
+        LOOP
+            INSERT INTO :SCHEMA.TranscriptEc (
+                -- there is no column list on the INSERT, so this select order is significant
+                SELECT DISTINCT
+                    ta.source_id
+                  , ta.gene_source_id
+                  , ec.enzyme_class_id
+                  , ec.ec_number
+                  , ec.ec_number_1
+                  , ec.ec_number_2
+                  , ec.ec_number_3
+                  , ec.ec_number_4
+                  -- number of dash characters in the EC number
+                  , regexp_count(ec.ec_number, '-') AS wildcard_count
+                  , asec.evidence_code
+                  , ta.project_id
+                  , org.abbrev AS org_abbrev
+                  , current_timestamp AS modification_date
+                FROM :SCHEMA.transcriptattributes ta
+                  INNER JOIN dots.AaSequenceEnzymeClass asec
+                    ON asec.aa_sequence_id = ta.aa_sequence_id
+                  INNER JOIN sres.EnzymeClass ec
+                    ON ec.enzyme_class_id = asec.enzyme_class_id
+                WHERE ta.org_abbrev = org.abbrev
+                ):PLPGSQL_DELIM
+            COMMIT:PLPGSQL_DELIM
+        END LOOP:PLPGSQL_DELIM
+    END:PLPGSQL_DELIM
+$$ LANGUAGE PLPGSQL;
+
+
+/* STEP 3: Extracts the distinct EC numbers from TranscriptEC */
+/* This represents the "universe" of EC numbers associated to transcripts */
+/* Temp table, will be dropped */
+
+-- Distinct EC number rows found in TranscriptEc, without the per-transcript columns
+CREATE TABLE :SCHEMA.TranscriptEcUniverse AS
+SELECT DISTINCT
+    tec.enzyme_class_id
+  , tec.ec_number
+  , tec.ec_number_1
+  , tec.ec_number_2
+  , tec.ec_number_3
+  , tec.ec_number_4
+  , tec.wildcard_count
+FROM :SCHEMA.TranscriptEc tec
+;
+
+/* STEP 4: Extract the distinct EC number from PathwayEC */
+/* This represents the "universe" of EC numbers associated to pathways */
+/* Temp table, will be dropped */
+
+-- Distinct EC number rows found in PathwayEc, without the per-pathway columns
+CREATE TABLE :SCHEMA.PathwayEcUniverse AS
+SELECT DISTINCT
+    pec.enzyme_class_id
+  , pec.ec_number
+  , pec.ec_number_1
+  , pec.ec_number_2
+  , pec.ec_number_3
+  , pec.ec_number_4
+  , pec.wildcard_count
+FROM :SCHEMA.PathwayEc pec
+;
+
+
+/* STEP 5: Match EC numbers from the transcript universe and EC numbers from the pathway universe */
+/* Use the universe tables to avoid redundancy */
+/* Temp table, will be dropped */
+
+-- Pairs every transcript EC number with every pathway EC number it is compatible with.
+-- Two EC numbers are compatible when each of the four positions is either equal or
+-- NULL on at least one side.
+-- The pathway side reads PathwayEcUniverse, not PathwayEc. The universe table is the
+-- DISTINCT projection of PathwayEc over exactly the columns used here, and the outer
+-- SELECT DISTINCT removes duplicates, so the result is the same set of rows while
+-- the join input is smaller.
+CREATE TABLE :SCHEMA.TranscriptPathwayEcMatch AS (
+    SELECT DISTINCT
+        teu.enzyme_class_id AS transcript_enzyme_class_id
+      , peu.enzyme_class_id AS pathway_enzyme_class_id
+      , teu.wildcard_count AS wildcard_count_transcript
+      , peu.wildcard_count AS wildcard_count_pathway
+      , teu.ec_number AS ec_number_transcript
+      , peu.ec_number AS ec_number_pathway
+    FROM :SCHEMA.TranscriptEcUniverse teu
+      INNER JOIN :SCHEMA.PathwayEcUniverse peu
+        -- EC number expansion on the individual positions, which avoids slow LIKE matching
+        ON (teu.ec_number_1 = peu.ec_number_1 OR teu.ec_number_1 IS NULL OR peu.ec_number_1 IS NULL)
+       AND (teu.ec_number_2 = peu.ec_number_2 OR teu.ec_number_2 IS NULL OR peu.ec_number_2 IS NULL)
+       AND (teu.ec_number_3 = peu.ec_number_3 OR teu.ec_number_3 IS NULL OR peu.ec_number_3 IS NULL)
+       AND (teu.ec_number_4 = peu.ec_number_4 OR teu.ec_number_4 IS NULL OR peu.ec_number_4 IS NULL)
+);
+
+
+/* STEP 6: Map the matched EC numbers to map back to both pathways and transcripts */
+/* ATTENTION: this step loads into an empty partitioned table created in the orgSpecific graph */
+
+/* This is the equivalent of the old TranscriptPathway tuning table */
+
+DO $$
+    DECLARE org RECORD:PLPGSQL_DELIM
+    BEGIN
+        -- one INSERT and one COMMIT per organism
+        FOR org IN (SELECT DISTINCT taxon_id, abbrev FROM apidb.organism)
+        LOOP
+            INSERT INTO :SCHEMA.transcriptpathway (
+                -- there is no column list on the INSERT, so this select order is significant
+                SELECT DISTINCT
+
+                    -- transcript and gene
+                    ta.source_id
+                  , ta.gene_source_id
+
+                    -- pathway
+                  , pa.source_id AS pathway_source_id
+                  , pa.name AS pathway_name
+                  , pa.pathway_id
+                  , pa.pathway_source
+                  , pec.external_database_release_id
+
+                    -- how the two EC numbers matched
+                  , tpem.ec_number_transcript AS ec_number_gene
+                  , tpem.wildcard_count_transcript AS wildcard_count_gene
+                  , tpem.ec_number_pathway
+                  , tpem.wildcard_count_pathway
+                    -- 1 when both EC numbers are the same string
+                  , CASE WHEN tpem.ec_number_pathway = tpem.ec_number_transcript
+                      THEN 1
+                      ELSE 0 END AS exact_match
+                    -- 1 when neither EC number contains a wildcard
+                  , CASE WHEN tpem.wildcard_count_pathway + tpem.wildcard_count_transcript = 0
+                      THEN 1
+                      ELSE 0 END AS complete_ec
+
+                    -- partitioning columns
+                  , ta.project_id
+                  , org.abbrev AS org_abbrev
+                  , current_timestamp AS modification_date
+                FROM :SCHEMA.TranscriptPathwayEcMatch tpem
+                  INNER JOIN :SCHEMA.TranscriptEc tec
+                    ON tec.ec_number = tpem.ec_number_transcript
+                  INNER JOIN :SCHEMA.PathwayEc pec
+                    ON pec.ec_number = tpem.ec_number_pathway
+                  INNER JOIN :SCHEMA.PathwayAttributes pa
+                    ON pa.pathway_id = pec.pathway_id
+                  INNER JOIN :SCHEMA.TranscriptAttributes ta
+                    ON ta.source_id = tec.source_id
+                WHERE tec.org_abbrev = org.abbrev
+                  AND ta.org_abbrev = org.abbrev
+                -- JB:  It is unclear what the intention here was.  This will not remove any rows and shouldn't
+                -- AND (
+                --     (ta.orthomcl_name IS NULL AND tec.evidence_code != 'OrthoMCLDerived')
+                --     OR ta.orthomcl_name IS NOT NULL
+                -- )
+                ):PLPGSQL_DELIM
+            COMMIT:PLPGSQL_DELIM
+        END LOOP:PLPGSQL_DELIM
+    END:PLPGSQL_DELIM
+$$ LANGUAGE PLPGSQL;
+
+
+/* STEP 7: Delete temp tables */
+-- The three intermediate tables created in steps 3 to 5 are no longer needed
+DROP TABLE :SCHEMA.TranscriptEcUniverse;
+DROP TABLE :SCHEMA.PathwayEcUniverse;
+DROP TABLE :SCHEMA.TranscriptPathwayEcMatch;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		-- Composite index on AlphaFoldGenes keyed by gene first, then UniProt accession
		CREATE INDEX AlphaFoldGenes_idx
		    ON :SCHEMA.AlphaFoldGenes (gene_source_id, uniprot_id)
		;
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		-- Index on group_name, the single key column of GroupDomainDescriptions
		CREATE INDEX GroupDomainAttribute_idx
		    ON :SCHEMA.GroupDomainDescriptions (group_name)
		;