TraXIcoN · saranshkotnala · Oct 17, 2020
diff --git a/Python/disease_gene probability checker/data/family0.csv b/Python/disease_gene probability checker/data/family0.csv
@@ -0,0 +1,4 @@
+name,mother,father,trait
+Harry,Lily,James,
+James,,,1
+Lily,,,0
diff --git a/Python/disease_gene probability checker/data/family1.csv b/Python/disease_gene probability checker/data/family1.csv
@@ -0,0 +1,7 @@
+name,mother,father,trait
+Arthur,,,0
+Charlie,Molly,Arthur,0
+Fred,Molly,Arthur,1
+Ginny,Molly,Arthur,
+Molly,,,0
+Ron,Molly,Arthur,
diff --git a/Python/disease_gene probability checker/data/family2.csv b/Python/disease_gene probability checker/data/family2.csv
@@ -0,0 +1,6 @@
+name,mother,father,trait
+Arthur,,,0
+Hermione,,,0
+Molly,,,
+Ron,Molly,Arthur,0
+Rose,Ron,Hermione,1
diff --git a/Python/disease_gene probability checker/heredity.py b/Python/disease_gene probability checker/heredity.py
@@ -0,0 +1,197 @@
+import csv
+import itertools
+import sys
+
+PROBS = {
+
+    # Unconditional probabilities for having gene
+    "gene": {
+        2: 0.01,
+        1: 0.03,
+        0: 0.96
+    },
+
+    "trait": {
+
+        # Probability of trait given two copies of gene
+        2: {
+            True: 0.65,
+            False: 0.35
+        },
+
+        # Probability of trait given one copy of gene
+        1: {
+            True: 0.56,
+            False: 0.44
+        },
+
+        # Probability of trait given no gene
+        0: {
+            True: 0.01,
+            False: 0.99
+        }
+    },
+
+    # Mutation probability
+    "mutation": 0.01
+}
+
+
+def main():
+
+    # Check for proper usage
+    if len(sys.argv) != 2:
+        sys.exit("Usage: python heredity.py data.csv")
+    people = load_data(sys.argv[1])
+
+    # Keep track of gene and trait probabilities for each person
+    probabilities = {
+        person: {
+            "gene": {
+                2: 0,
+                1: 0,
+                0: 0
+            },
+            "trait": {
+                True: 0,
+                False: 0
+            }
+        }
+        for person in people
+    }
+
+    # Loop over all sets of people who might have the trait
+    names = set(people)
+    for have_trait in powerset(names):
+
+        # Check if current set of people violates known information
+        fails_evidence = any(
+            (people[person]["trait"] is not None and
+             people[person]["trait"] != (person in have_trait))
+            for person in names
+        )
+        if fails_evidence:
+            continue
+
+        # Loop over all sets of people who might have the gene
+        for one_gene in powerset(names):
+            for two_genes in powerset(names - one_gene):
+
+                # Update probabilities with new joint probability
+                p = joint_probability(people, one_gene, two_genes, have_trait)
+                update(probabilities, one_gene, two_genes, have_trait, p)
+
+    # Ensure probabilities sum to 1
+    normalize(probabilities)
+
+    # Print results
+    for person in people:
+        print(f"{person}:")
+        for field in probabilities[person]:
+            print(f"  {field.capitalize()}:")
+            for value in probabilities[person][field]:
+                p = probabilities[person][field][value]
+                print(f"    {value}: {p:.4f}")
+
+
+def load_data(filename):
+    """
+    Load gene and trait data from a file into a dictionary.
+    File assumed to be a CSV containing fields name, mother, father, trait.
+    mother, father must both be blank, or both be valid names in the CSV.
+    trait should be 0 or 1 if trait is known, blank otherwise.
+    """
+    data = dict()
+    with open(filename) as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            name = row["name"]
+            data[name] = {
+                "name": name,
+                "mother": row["mother"] or None,
+                "father": row["father"] or None,
+                "trait": (True if row["trait"] == "1" else
+                          False if row["trait"] == "0" else None)
+            }
+    return data
+
+
+def powerset(s):
+    """
+    Return a list of all possible subsets of set s.
+    """
+    s = list(s)
+    return [
+        set(s) for s in itertools.chain.from_iterable(
+            itertools.combinations(s, r) for r in range(len(s) + 1)
+        )
+    ]
+
+
+def joint_probability(people, one_gene, two_genes, have_trait):
+    """
+    Compute and return a joint probability.
+    The probability returned should be the probability that
+        * everyone in set `one_gene` has one copy of the gene, and
+        * everyone in set `two_genes` has two copies of the gene, and
+        * everyone not in `one_gene` or `two_gene` does not have the gene, and
+        * everyone in set `have_trait` has the trait, and
+        * everyone not in set` have_trait` does not have the trait.
+    """
+    p=float(1)
+    for i in people.keys():
+        trait = i in have_trait
+        father = people[i]['father']
+        mother = people[i]['mother']
+        if father is None:
+            p *= PROBS['gene'][1] if i in one_gene else PROBS['gene'][2] if i in two_genes else PROBS['gene'][0]
+        else:
+            motherpass = 1-PROBS["mutation"] if mother in two_genes else 0.5 if mother in one_gene else PROBS["mutation"]
+            fatherpass = 1-PROBS["mutation"] if father in two_genes else 0.5 if father in one_gene else PROBS["mutation"]
+
+            p *= ((motherpass )* (1 - fatherpass) + (fatherpass )* (1 - motherpass)) if i in one_gene else (motherpass * fatherpass) if i in two_genes else ((1-motherpass) * (1-fatherpass))
+        p *= PROBS["trait"][1 if i in one_gene else 2 if i in two_genes else 0][trait]
+    return p
+
+
+
+
+
+def update(probabilities, one_gene, two_genes, have_trait, p):
+    """
+    Add to `probabilities` a new joint probability `p`.
+    Each person should have their "gene" and "trait" distributions updated.
+    Which value for each distribution is updated depends on whether
+    the person is in `have_gene` and `have_trait`, respectively.
+    """
+    for i in probabilities:
+        if i in one_gene:
+            probabilities[i]['gene'][1] += p
+        elif i in two_genes:
+            probabilities[i]['gene'][2] += p
+        else:
+            probabilities[i]['gene'][0] += p
+        if i in have_trait:
+            probabilities[i]['trait'][True] += p
+        else:
+            probabilities[i]['trait'][False] += p
+
+
+
+def normalize(probabilities):
+    """
+    Update `probabilities` such that each probability distribution
+    is normalized (i.e., sums to 1, with relative proportions the same).
+    """
+    for i in probabilities:
+        sump = probabilities[i]['gene'][1] + probabilities[i]['gene'][2] + probabilities[i]['gene'][0]
+        sumt = probabilities[i]['trait'][True] + probabilities[i]['trait'][False]
+        for a in range(0,3):
+            probabilities[i]['gene'][a] = probabilities[i]['gene'][a]/sump
+        probabilities[i]['trait'][True] = probabilities[i]['trait'][True]/sumt
+        probabilities[i]['trait'][False] = probabilities[i]['trait'][False]/sumt
+
+
+if __name__ == "__main__":
+    main()
+