Adding nsfg files

Qs-Tim · Feb 28, 2019 · 633185b · 633185b
1 parent b527993
commit 633185b
Show file tree

Hide file tree

Showing 2 changed files with 325 additions and 0 deletions.
diff --git a/solutions/first.py b/solutions/first.py
@@ -0,0 +1,160 @@
+"""This file contains code used in "Think Stats",
+by Allen B. Downey, available from greenteapress.com
+
+Copyright 2014 Allen B. Downey
+License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
+"""
+
+from __future__ import print_function
+
+import math
+import numpy as np
+
+import nsfg
+import thinkstats2
+import thinkplot
+
+
+def MakeFrames():
+    """Reads pregnancy data and partitions first babies and others.
+
+    returns: DataFrames (all live births, first babies, others)
+    """
+    preg = nsfg.ReadFemPreg()
+
+    live = preg[preg.outcome == 1]
+    firsts = live[live.birthord == 1]
+    others = live[live.birthord != 1]
+
+    assert len(live) == 9148
+    assert len(firsts) == 4413
+    assert len(others) == 4735
+
+    return live, firsts, others
+
+
+def Summarize(live, firsts, others):
+    """Print various summary statistics."""
+
+    mean = live.prglngth.mean()
+    var = live.prglngth.var()
+    std = live.prglngth.std()
+
+    print('Live mean', mean)
+    print('Live variance', var)
+    print('Live std', std)
+
+    mean1 = firsts.prglngth.mean()
+    mean2 = others.prglngth.mean()
+
+    var1 = firsts.prglngth.var()
+    var2 = others.prglngth.var()
+
+    print('Mean')
+    print('First babies', mean1)
+    print('Others', mean2)
+
+    print('Variance')
+    print('First babies', var1)
+    print('Others', var2)
+
+    print('Difference in weeks', mean1 - mean2)
+    print('Difference in hours', (mean1 - mean2) * 7 * 24)
+
+    print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100)
+
+    d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth)
+    print('Cohen d', d)
+
+
+def PrintExtremes(live):
+    """Plots the histogram of pregnancy lengths and prints the extremes.
+
+    live: DataFrame of live births
+    """
+    hist = thinkstats2.Hist(live.prglngth)
+    thinkplot.Hist(hist, label='live births')
+
+    thinkplot.Save(root='first_nsfg_hist_live', 
+                   title='Histogram',
+                   xlabel='weeks',
+                   ylabel='frequency')
+
+    print('Shortest lengths:')
+    for weeks, freq in hist.Smallest(10):
+        print(weeks, freq)
+
+    print('Longest lengths:')
+    for weeks, freq in hist.Largest(10):
+        print(weeks, freq)
+
+
+def MakeHists(live):
+    """Plot Hists for live births
+
+    live: DataFrame
+    others: DataFrame
+    """
+    hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb')
+    thinkplot.Hist(hist)
+    thinkplot.Save(root='first_wgt_lb_hist', 
+                   xlabel='pounds',
+                   ylabel='frequency',
+                   axis=[-1, 14, 0, 3200])
+
+    hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz')
+    thinkplot.Hist(hist)
+    thinkplot.Save(root='first_wgt_oz_hist', 
+                   xlabel='ounces',
+                   ylabel='frequency',
+                   axis=[-1, 16, 0, 1200])
+
+    hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
+    thinkplot.Hist(hist)
+    thinkplot.Save(root='first_agepreg_hist', 
+                   xlabel='years',
+                   ylabel='frequency')
+
+    hist = thinkstats2.Hist(live.prglngth, label='prglngth')
+    thinkplot.Hist(hist)
+    thinkplot.Save(root='first_prglngth_hist', 
+                   xlabel='weeks',
+                   ylabel='frequency',
+                   axis=[-1, 53, 0, 5000])
+
+
+def MakeComparison(firsts, others):
+    """Plots histograms of pregnancy length for first babies and others.
+
+    firsts: DataFrame
+    others: DataFrame
+    """
+    first_hist = thinkstats2.Hist(firsts.prglngth, label='first')
+    other_hist = thinkstats2.Hist(others.prglngth, label='other')
+
+    width = 0.45
+    thinkplot.PrePlot(2)
+    thinkplot.Hist(first_hist, align='right', width=width)
+    thinkplot.Hist(other_hist, align='left', width=width)
+
+    thinkplot.Save(root='first_nsfg_hist', 
+                   title='Histogram',
+                   xlabel='weeks',
+                   ylabel='frequency',
+                   axis=[27, 46, 0, 2700])
+
+
+def main(script):
+    live, firsts, others = MakeFrames()
+
+    MakeHists(live)
+    PrintExtremes(live)
+    MakeComparison(firsts, others)
+    Summarize(live, firsts, others)
+
+
+if __name__ == '__main__':
+    import sys
+    main(*sys.argv)
+
+
diff --git a/solutions/nsfg.py b/solutions/nsfg.py
@@ -0,0 +1,165 @@
+"""This file contains code for use with "Think Stats",
+by Allen B. Downey, available from greenteapress.com
+
+Copyright 2010 Allen B. Downey
+License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
+"""
+
+from __future__ import print_function, division
+
+import sys
+import numpy as np
+import thinkstats2
+
+from collections import defaultdict
+
+
+def ReadFemResp(dct_file='2002FemResp.dct',
+                dat_file='2002FemResp.dat.gz',
+                nrows=None):
+    """Reads the NSFG respondent data.
+
+    dct_file: string file name
+    dat_file: string file name
+
+    returns: DataFrame
+    """
+    dct = thinkstats2.ReadStataDct(dct_file)
+    df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
+    CleanFemResp(df)
+    return df
+
+
+def CleanFemResp(df):
+    """Recodes variables from the respondent frame.
+
+    df: DataFrame
+    """
+    pass
+
+
+def ReadFemPreg(dct_file='2002FemPreg.dct',
+                dat_file='2002FemPreg.dat.gz'):
+    """Reads the NSFG pregnancy data.
+
+    dct_file: string file name
+    dat_file: string file name
+
+    returns: DataFrame
+    """
+    dct = thinkstats2.ReadStataDct(dct_file)
+    df = dct.ReadFixedWidth(dat_file, compression='gzip')
+    CleanFemPreg(df)
+    return df
+
+
+def CleanFemPreg(df):
+    """Recodes variables from the pregnancy frame.
+
+    df: DataFrame
+    """
+    # mother's age is encoded in centiyears; convert to years
+    df.agepreg /= 100.0
+
+    # birthwgt_lb contains at least one bogus value (51 lbs)
+    # replace with NaN
+    df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan
+
+    # replace 'not ascertained', 'refused', 'don't know' with NaN
+    na_vals = [97, 98, 99]
+    df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
+    df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
+    df.hpagelb.replace(na_vals, np.nan, inplace=True)
+
+    df.babysex.replace([7, 9], np.nan, inplace=True)
+    df.nbrnaliv.replace([9], np.nan, inplace=True)
+
+    # birthweight is stored in two columns, lbs and oz.
+    # convert to a single column in lb
+    # NOTE: creating a new column requires dictionary syntax,
+    # not attribute assignment (like df.totalwgt_lb)
+    df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0    
+
+    # due to a bug in ReadStataDct, the last variable gets clipped;
+    # so for now set it to NaN
+    df.cmintvw = np.nan
+
+
+def ValidatePregnum(resp, preg):
+    """Validate pregnum in the respondent file.
+
+    resp: respondent DataFrame
+    preg: pregnancy DataFrame
+    """
+    # make the map from caseid to list of pregnancy indices
+    preg_map = MakePregMap(preg)
+
+    # iterate through the respondent pregnum series
+    for index, pregnum in resp.pregnum.iteritems():
+        caseid = resp.caseid[index]
+        indices = preg_map[caseid]
+
+        # check that pregnum from the respondent file equals
+        # the number of records in the pregnancy file
+        if len(indices) != pregnum:
+            print(caseid, len(indices), pregnum)
+            return False
+
+    return True
+
+
+def MakePregMap(df):
+    """Make a map from caseid to list of preg indices.
+
+    df: DataFrame
+
+    returns: dict that maps from caseid to list of indices into `preg`
+    """
+    d = defaultdict(list)
+    for index, caseid in df.caseid.iteritems():
+        d[caseid].append(index)
+    return d
+
+
+def main():
+    """Tests the functions in this module.
+
+    script: string script name
+    """
+    # read and validate the respondent file
+    resp = ReadFemResp()
+
+    assert(len(resp) == 7643)
+    assert(resp.pregnum.value_counts()[1] == 1267)
+
+    # read and validate the pregnancy file
+    preg = ReadFemPreg()
+    print(preg.shape)
+
+    assert len(preg) == 13593
+    assert preg.caseid[13592] == 12571
+    assert preg.pregordr.value_counts()[1] == 5033
+    assert preg.nbrnaliv.value_counts()[1] == 8981
+    assert preg.babysex.value_counts()[1] == 4641
+    assert preg.birthwgt_lb.value_counts()[7] == 3049
+    assert preg.birthwgt_oz.value_counts()[0] == 1037
+    assert preg.prglngth.value_counts()[39] == 4744
+    assert preg.outcome.value_counts()[1] == 9148
+    assert preg.birthord.value_counts()[1] == 4413
+    assert preg.agepreg.value_counts()[22.75] == 100
+    assert preg.totalwgt_lb.value_counts()[7.5] == 302
+
+    weights = preg.finalwgt.value_counts()
+    key = max(weights.keys())
+    assert preg.finalwgt.value_counts()[key] == 6
+
+    # validate that the pregnum column in `resp` matches the number
+    # of entries in `preg`
+    assert(ValidatePregnum(resp, preg))
+
+
+    print('All tests passed.')
+
+
+if __name__ == '__main__':
+    main()