diff --git a/Python/disease_gene probability checker/data/family0.csv b/Python/disease_gene probability checker/data/family0.csv new file mode 100644 index 0000000..fa103d1 --- /dev/null +++ b/Python/disease_gene probability checker/data/family0.csv @@ -0,0 +1,4 @@ +name,mother,father,trait +Harry,Lily,James, +James,,,1 +Lily,,,0 diff --git a/Python/disease_gene probability checker/data/family1.csv b/Python/disease_gene probability checker/data/family1.csv new file mode 100644 index 0000000..786b300 --- /dev/null +++ b/Python/disease_gene probability checker/data/family1.csv @@ -0,0 +1,7 @@ +name,mother,father,trait +Arthur,,,0 +Charlie,Molly,Arthur,0 +Fred,Molly,Arthur,1 +Ginny,Molly,Arthur, +Molly,,,0 +Ron,Molly,Arthur, diff --git a/Python/disease_gene probability checker/data/family2.csv b/Python/disease_gene probability checker/data/family2.csv new file mode 100644 index 0000000..b609dc0 --- /dev/null +++ b/Python/disease_gene probability checker/data/family2.csv @@ -0,0 +1,6 @@ +name,mother,father,trait +Arthur,,,0 +Hermione,,,0 +Molly,,, +Ron,Molly,Arthur,0 +Rose,Ron,Hermione,1 diff --git a/Python/disease_gene probability checker/heredity.py b/Python/disease_gene probability checker/heredity.py new file mode 100644 index 0000000..e2c1f37 --- /dev/null +++ b/Python/disease_gene probability checker/heredity.py @@ -0,0 +1,197 @@ +import csv +import itertools +import sys + +PROBS = { + + # Unconditional probabilities for having gene + "gene": { + 2: 0.01, + 1: 0.03, + 0: 0.96 + }, + + "trait": { + + # Probability of trait given two copies of gene + 2: { + True: 0.65, + False: 0.35 + }, + + # Probability of trait given one copy of gene + 1: { + True: 0.56, + False: 0.44 + }, + + # Probability of trait given no gene + 0: { + True: 0.01, + False: 0.99 + } + }, + + # Mutation probability + "mutation": 0.01 +} + + +def main(): + + # Check for proper usage + if len(sys.argv) != 2: + sys.exit("Usage: python heredity.py data.csv") + people = load_data(sys.argv[1]) + + # Keep track of gene and trait probabilities for each person + probabilities = { + person: { + "gene": { + 2: 0, + 1: 0, + 0: 0 + }, + "trait": { + True: 0, + False: 0 + } + } + for person in people + } + + # Loop over all sets of people who might have the trait + names = set(people) + for have_trait in powerset(names): + + # Check if current set of people violates known information + fails_evidence = any( + (people[person]["trait"] is not None and + people[person]["trait"] != (person in have_trait)) + for person in names + ) + if fails_evidence: + continue + + # Loop over all sets of people who might have the gene + for one_gene in powerset(names): + for two_genes in powerset(names - one_gene): + + # Update probabilities with new joint probability + p = joint_probability(people, one_gene, two_genes, have_trait) + update(probabilities, one_gene, two_genes, have_trait, p) + + # Ensure probabilities sum to 1 + normalize(probabilities) + + # Print results + for person in people: + print(f"{person}:") + for field in probabilities[person]: + print(f" {field.capitalize()}:") + for value in probabilities[person][field]: + p = probabilities[person][field][value] + print(f" {value}: {p:.4f}") + + +def load_data(filename): + """ + Load gene and trait data from a file into a dictionary. + File assumed to be a CSV containing fields name, mother, father, trait. + mother, father must both be blank, or both be valid names in the CSV. + trait should be 0 or 1 if trait is known, blank otherwise. + """ + data = dict() + with open(filename) as f: + reader = csv.DictReader(f) + for row in reader: + name = row["name"] + data[name] = { + "name": name, + "mother": row["mother"] or None, + "father": row["father"] or None, + "trait": (True if row["trait"] == "1" else + False if row["trait"] == "0" else None) + } + return data + + +def powerset(s): + """ + Return a list of all possible subsets of set s. + """ + s = list(s) + return [ + set(s) for s in itertools.chain.from_iterable( + itertools.combinations(s, r) for r in range(len(s) + 1) + ) + ] + + +def joint_probability(people, one_gene, two_genes, have_trait): + """ + Compute and return a joint probability. + The probability returned should be the probability that + * everyone in set `one_gene` has one copy of the gene, and + * everyone in set `two_genes` has two copies of the gene, and + * everyone not in `one_gene` or `two_gene` does not have the gene, and + * everyone in set `have_trait` has the trait, and + * everyone not in set` have_trait` does not have the trait. + """ + p=float(1) + for i in people.keys(): + trait = i in have_trait + father = people[i]['father'] + mother = people[i]['mother'] + if father is None: + p *= PROBS['gene'][1] if i in one_gene else PROBS['gene'][2] if i in two_genes else PROBS['gene'][0] + else: + motherpass = 1-PROBS["mutation"] if mother in two_genes else 0.5 if mother in one_gene else PROBS["mutation"] + fatherpass = 1-PROBS["mutation"] if father in two_genes else 0.5 if father in one_gene else PROBS["mutation"] + + p *= ((motherpass )* (1 - fatherpass) + (fatherpass )* (1 - motherpass)) if i in one_gene else (motherpass * fatherpass) if i in two_genes else ((1-motherpass) * (1-fatherpass)) + p *= PROBS["trait"][1 if i in one_gene else 2 if i in two_genes else 0][trait] + return p + + + + + +def update(probabilities, one_gene, two_genes, have_trait, p): + """ + Add to `probabilities` a new joint probability `p`. + Each person should have their "gene" and "trait" distributions updated. + Which value for each distribution is updated depends on whether + the person is in `have_gene` and `have_trait`, respectively. + """ + for i in probabilities: + if i in one_gene: + probabilities[i]['gene'][1] += p + elif i in two_genes: + probabilities[i]['gene'][2] += p + else: + probabilities[i]['gene'][0] += p + if i in have_trait: + probabilities[i]['trait'][True] += p + else: + probabilities[i]['trait'][False] += p + + + +def normalize(probabilities): + """ + Update `probabilities` such that each probability distribution + is normalized (i.e., sums to 1, with relative proportions the same). + """ + for i in probabilities: + sump = probabilities[i]['gene'][1] + probabilities[i]['gene'][2] + probabilities[i]['gene'][0] + sumt = probabilities[i]['trait'][True] + probabilities[i]['trait'][False] + for a in range(0,3): + probabilities[i]['gene'][a] = probabilities[i]['gene'][a]/sump + probabilities[i]['trait'][True] = probabilities[i]['trait'][True]/sumt + probabilities[i]['trait'][False] = probabilities[i]['trait'][False]/sumt + + +if __name__ == "__main__": + main() +