diff --git a/solutions/first.py b/solutions/first.py new file mode 100644 index 000000000..4c5fccf19 --- /dev/null +++ b/solutions/first.py @@ -0,0 +1,160 @@ +"""This file contains code used in "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2014 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function + +import math +import numpy as np + +import nsfg +import thinkstats2 +import thinkplot + + +def MakeFrames(): + """Reads pregnancy data and partitions first babies and others. + + returns: DataFrames (all live births, first babies, others) + """ + preg = nsfg.ReadFemPreg() + + live = preg[preg.outcome == 1] + firsts = live[live.birthord == 1] + others = live[live.birthord != 1] + + assert len(live) == 9148 + assert len(firsts) == 4413 + assert len(others) == 4735 + + return live, firsts, others + + +def Summarize(live, firsts, others): + """Print various summary statistics.""" + + mean = live.prglngth.mean() + var = live.prglngth.var() + std = live.prglngth.std() + + print('Live mean', mean) + print('Live variance', var) + print('Live std', std) + + mean1 = firsts.prglngth.mean() + mean2 = others.prglngth.mean() + + var1 = firsts.prglngth.var() + var2 = others.prglngth.var() + + print('Mean') + print('First babies', mean1) + print('Others', mean2) + + print('Variance') + print('First babies', var1) + print('Others', var2) + + print('Difference in weeks', mean1 - mean2) + print('Difference in hours', (mean1 - mean2) * 7 * 24) + + print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100) + + d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth) + print('Cohen d', d) + + +def PrintExtremes(live): + """Plots the histogram of pregnancy lengths and prints the extremes. + + live: DataFrame of live births + """ + hist = thinkstats2.Hist(live.prglngth) + thinkplot.Hist(hist, label='live births') + + thinkplot.Save(root='first_nsfg_hist_live', + title='Histogram', + xlabel='weeks', + ylabel='frequency') + + print('Shortest lengths:') + for weeks, freq in hist.Smallest(10): + print(weeks, freq) + + print('Longest lengths:') + for weeks, freq in hist.Largest(10): + print(weeks, freq) + + +def MakeHists(live): + """Plot Hists for live births + + live: DataFrame + others: DataFrame + """ + hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb') + thinkplot.Hist(hist) + thinkplot.Save(root='first_wgt_lb_hist', + xlabel='pounds', + ylabel='frequency', + axis=[-1, 14, 0, 3200]) + + hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz') + thinkplot.Hist(hist) + thinkplot.Save(root='first_wgt_oz_hist', + xlabel='ounces', + ylabel='frequency', + axis=[-1, 16, 0, 1200]) + + hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg') + thinkplot.Hist(hist) + thinkplot.Save(root='first_agepreg_hist', + xlabel='years', + ylabel='frequency') + + hist = thinkstats2.Hist(live.prglngth, label='prglngth') + thinkplot.Hist(hist) + thinkplot.Save(root='first_prglngth_hist', + xlabel='weeks', + ylabel='frequency', + axis=[-1, 53, 0, 5000]) + + +def MakeComparison(firsts, others): + """Plots histograms of pregnancy length for first babies and others. + + firsts: DataFrame + others: DataFrame + """ + first_hist = thinkstats2.Hist(firsts.prglngth, label='first') + other_hist = thinkstats2.Hist(others.prglngth, label='other') + + width = 0.45 + thinkplot.PrePlot(2) + thinkplot.Hist(first_hist, align='right', width=width) + thinkplot.Hist(other_hist, align='left', width=width) + + thinkplot.Save(root='first_nsfg_hist', + title='Histogram', + xlabel='weeks', + ylabel='frequency', + axis=[27, 46, 0, 2700]) + + +def main(script): + live, firsts, others = MakeFrames() + + MakeHists(live) + PrintExtremes(live) + MakeComparison(firsts, others) + Summarize(live, firsts, others) + + +if __name__ == '__main__': + import sys + main(*sys.argv) + + diff --git a/solutions/nsfg.py b/solutions/nsfg.py new file mode 100644 index 000000000..3cb91587c --- /dev/null +++ b/solutions/nsfg.py @@ -0,0 +1,165 @@ +"""This file contains code for use with "Think Stats", +by Allen B. Downey, available from greenteapress.com + +Copyright 2010 Allen B. Downey +License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html +""" + +from __future__ import print_function, division + +import sys +import numpy as np +import thinkstats2 + +from collections import defaultdict + + +def ReadFemResp(dct_file='2002FemResp.dct', + dat_file='2002FemResp.dat.gz', + nrows=None): + """Reads the NSFG respondent data. + + dct_file: string file name + dat_file: string file name + + returns: DataFrame + """ + dct = thinkstats2.ReadStataDct(dct_file) + df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows) + CleanFemResp(df) + return df + + +def CleanFemResp(df): + """Recodes variables from the respondent frame. + + df: DataFrame + """ + pass + + +def ReadFemPreg(dct_file='2002FemPreg.dct', + dat_file='2002FemPreg.dat.gz'): + """Reads the NSFG pregnancy data. + + dct_file: string file name + dat_file: string file name + + returns: DataFrame + """ + dct = thinkstats2.ReadStataDct(dct_file) + df = dct.ReadFixedWidth(dat_file, compression='gzip') + CleanFemPreg(df) + return df + + +def CleanFemPreg(df): + """Recodes variables from the pregnancy frame. + + df: DataFrame + """ + # mother's age is encoded in centiyears; convert to years + df.agepreg /= 100.0 + + # birthwgt_lb contains at least one bogus value (51 lbs) + # replace with NaN + df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan + + # replace 'not ascertained', 'refused', 'don't know' with NaN + na_vals = [97, 98, 99] + df.birthwgt_lb.replace(na_vals, np.nan, inplace=True) + df.birthwgt_oz.replace(na_vals, np.nan, inplace=True) + df.hpagelb.replace(na_vals, np.nan, inplace=True) + + df.babysex.replace([7, 9], np.nan, inplace=True) + df.nbrnaliv.replace([9], np.nan, inplace=True) + + # birthweight is stored in two columns, lbs and oz. + # convert to a single column in lb + # NOTE: creating a new column requires dictionary syntax, + # not attribute assignment (like df.totalwgt_lb) + df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0 + + # due to a bug in ReadStataDct, the last variable gets clipped; + # so for now set it to NaN + df.cmintvw = np.nan + + +def ValidatePregnum(resp, preg): + """Validate pregnum in the respondent file. + + resp: respondent DataFrame + preg: pregnancy DataFrame + """ + # make the map from caseid to list of pregnancy indices + preg_map = MakePregMap(preg) + + # iterate through the respondent pregnum series + for index, pregnum in resp.pregnum.iteritems(): + caseid = resp.caseid[index] + indices = preg_map[caseid] + + # check that pregnum from the respondent file equals + # the number of records in the pregnancy file + if len(indices) != pregnum: + print(caseid, len(indices), pregnum) + return False + + return True + + +def MakePregMap(df): + """Make a map from caseid to list of preg indices. + + df: DataFrame + + returns: dict that maps from caseid to list of indices into `preg` + """ + d = defaultdict(list) + for index, caseid in df.caseid.iteritems(): + d[caseid].append(index) + return d + + +def main(): + """Tests the functions in this module. + + script: string script name + """ + # read and validate the respondent file + resp = ReadFemResp() + + assert(len(resp) == 7643) + assert(resp.pregnum.value_counts()[1] == 1267) + + # read and validate the pregnancy file + preg = ReadFemPreg() + print(preg.shape) + + assert len(preg) == 13593 + assert preg.caseid[13592] == 12571 + assert preg.pregordr.value_counts()[1] == 5033 + assert preg.nbrnaliv.value_counts()[1] == 8981 + assert preg.babysex.value_counts()[1] == 4641 + assert preg.birthwgt_lb.value_counts()[7] == 3049 + assert preg.birthwgt_oz.value_counts()[0] == 1037 + assert preg.prglngth.value_counts()[39] == 4744 + assert preg.outcome.value_counts()[1] == 9148 + assert preg.birthord.value_counts()[1] == 4413 + assert preg.agepreg.value_counts()[22.75] == 100 + assert preg.totalwgt_lb.value_counts()[7.5] == 302 + + weights = preg.finalwgt.value_counts() + key = max(weights.keys()) + assert preg.finalwgt.value_counts()[key] == 6 + + # validate that the pregnum column in `resp` matches the number + # of entries in `preg` + assert(ValidatePregnum(resp, preg)) + + + print('All tests passed.') + + +if __name__ == '__main__': + main()