-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrate 1KG and TCGA into new DB + Fixes + Support for ethnicity
- Loading branch information
Showing
10 changed files
with
200 additions
and
64 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
-- dw.genomes_metadata: donor and item metadata for items whose dataset name
-- matches the 1000 Genomes / TCGA patterns below, with two optional key/value
-- attributes ('super_population', 'dna_source_from_coriell') attached from pair.
CREATE MATERIALIZED VIEW dw.genomes_metadata
TABLESPACE default_ts
AS
WITH genome_item AS (
    -- Items joined through replicate and biosample up to their donor,
    -- restricted to the datasets of interest.
    SELECT
        bs.donor_id,
        dn.donor_source_id,
        it.item_id,
        it.item_source_id,
        it.file_name,
        it.local_url,
        ds.assembly,
        dn.gender,
        bs.is_healthy AS health_status,
        dn.ethnicity AS population
    FROM dw.item AS it
    INNER JOIN dataset AS ds USING (dataset_id)
    INNER JOIN replicate2item AS r2i USING (item_id)
    INNER JOIN dw.replicate AS rep USING (replicate_id)
    INNER JOIN biosample AS bs USING (biosample_id)
    INNER JOIN donor AS dn USING (donor_id)
    -- Case-insensitive match against any of the patterns.
    -- NOTE(review): the last pattern has no trailing '%', so it only matches
    -- names that END with it; '_' is also a single-character wildcard in
    -- LIKE/ILIKE. Confirm both are intended.
    WHERE CAST(ds.dataset_name AS text) ILIKE ANY (ARRAY[
        '%1000GENOMES%',
        '%TCGA_somatic_mutation%',
        '%TCGA_dnaseq'
    ])
)
SELECT
    gi.donor_id,
    gi.donor_source_id,
    gi.item_id,
    gi.item_source_id,
    gi.file_name,
    gi.local_url,
    lower(CAST(gi.assembly AS text)) AS assembly,
    gi.gender,
    gi.health_status,
    gi.population,
    super_pop.value AS super_population,
    coriell.value AS dna_source
FROM genome_item AS gi
-- The key filters live in ON (not WHERE) so items lacking the attribute
-- are kept with a NULL value.
LEFT JOIN pair AS super_pop
    ON super_pop.item_id = gi.item_id
    AND CAST(super_pop.key AS text) = 'super_population'
LEFT JOIN pair AS coriell
    ON coriell.item_id = gi.item_id
    AND CAST(coriell.key AS text) = 'dna_source_from_coriell'
WITH DATA;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
-- dw.genomes_metadata_2: donor and item metadata for items whose dataset name
-- matches the 1000 Genomes / TCGA patterns below. On top of the raw donor
-- fields it exposes:
--   gender      donor gender, 'not reported' when NULL
--   population  donor ethnicity, forced to NULL for datasets matching '%TCGA%'
--   ethnicity   for 1000 Genomes datasets, derived from the 'super_population'
--               pair value via dw.kgenomes_ethnicity; otherwise the donor
--               ethnicity, 'not reported' when NULL
CREATE MATERIALIZED VIEW dw.genomes_metadata_2
TABLESPACE default_ts
AS
WITH genome_item AS (
    -- Items joined through replicate and biosample up to their donor,
    -- restricted to the datasets of interest.
    SELECT
        bs.donor_id,
        dn.donor_source_id,
        it.item_id,
        it.item_source_id,
        it.file_name,
        it.local_url,
        ds.assembly,
        dn.gender,
        bs.is_healthy AS health_status,
        dn.ethnicity AS population,
        ds.dataset_name
    FROM dw.item AS it
    INNER JOIN dataset AS ds USING (dataset_id)
    INNER JOIN replicate2item AS r2i USING (item_id)
    INNER JOIN dw.replicate AS rep USING (replicate_id)
    INNER JOIN biosample AS bs USING (biosample_id)
    INNER JOIN donor AS dn USING (donor_id)
    -- Case-insensitive match against any of the patterns.
    -- NOTE(review): the last pattern has no trailing '%', so it only matches
    -- names that END with it; '_' is also a single-character wildcard in
    -- LIKE/ILIKE. Confirm both are intended.
    WHERE CAST(ds.dataset_name AS text) ILIKE ANY (ARRAY[
        '%1000GENOMES%',
        '%TCGA_somatic_mutation%',
        '%TCGA_dnaseq'
    ])
)
SELECT
    gi.donor_id,
    gi.donor_source_id,
    gi.item_id,
    gi.item_source_id,
    gi.file_name,
    gi.local_url,
    lower(CAST(gi.assembly AS text)) AS assembly,
    COALESCE(gi.gender, CAST('not reported' AS character varying)) AS gender,
    gi.health_status,
    CASE
        WHEN CAST(gi.dataset_name AS text) ILIKE '%TCGA%'
            THEN CAST(NULL AS character varying)
        ELSE gi.population
    END AS population,
    CASE
        WHEN CAST(gi.dataset_name AS text) ILIKE '%1000GENOMES%'
            THEN dw.kgenomes_ethnicity(super_pop.value)
        ELSE COALESCE(gi.population, CAST('not reported' AS character varying))
    END AS ethnicity,
    super_pop.value AS super_population,
    coriell.value AS dna_source
FROM genome_item AS gi
-- The key filters live in ON (not WHERE) so items lacking the attribute
-- are kept with a NULL value.
LEFT JOIN pair AS super_pop
    ON super_pop.item_id = gi.item_id
    AND CAST(super_pop.key AS text) = 'super_population'
LEFT JOIN pair AS coriell
    ON coriell.item_id = gi.item_id
    AND CAST(coriell.key AS text) = 'dna_source_from_coriell'
WITH DATA;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
-- Maps a 1000 Genomes super-population code to a coarse ethnicity label.
--
-- Parameters:
--   super_population  super-population code; recognised values are
--                     'AMR', 'EUR', 'AFR', 'SAS' and 'EAS' (case-sensitive).
-- Returns:
--   'latin american' (AMR), 'white' (EUR), 'black or african american' (AFR),
--   'asian' (SAS or EAS); 'not reported' for any other value, including NULL.
--
-- This is a workaround to assign ethnicities to 1000 Genomes data. The correct
-- approach would be to re-run the metadata manager and take the values from
-- it, but the requirement arrived too late for a re-run (it takes 2 weeks).
--
-- Declared IMMUTABLE because the result depends only on the argument, which
-- lets the planner pre-evaluate constant calls and allows use in expression
-- indexes. Deliberately NOT declared STRICT: a NULL argument must still yield
-- 'not reported' rather than NULL.
CREATE OR REPLACE FUNCTION dw.kgenomes_ethnicity(super_population varchar)
RETURNS varchar AS $$
begin
    return case
        when super_population = 'AMR' then 'latin american'
        when super_population = 'EUR' then 'white'
        when super_population = 'AFR' then 'black or african american'
        when super_population in ('SAS', 'EAS') then 'asian'
        -- Unknown codes and NULL both land here, since NULL comparisons
        -- are never true.
        else 'not reported'
    end;
end;
$$ language plpgsql immutable;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.