Skip to content

Commit

Permalink
Adding nsfg files
Browse files Browse the repository at this point in the history
  • Loading branch information
AllenDowney committed Feb 28, 2019
1 parent b527993 commit 633185b
Show file tree
Hide file tree
Showing 2 changed files with 325 additions and 0 deletions.
160 changes: 160 additions & 0 deletions solutions/first.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
"""This file contains code used in "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2014 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function

import math
import numpy as np

import nsfg
import thinkstats2
import thinkplot


def MakeFrames():
"""Reads pregnancy data and partitions first babies and others.
returns: DataFrames (all live births, first babies, others)
"""
preg = nsfg.ReadFemPreg()

live = preg[preg.outcome == 1]
firsts = live[live.birthord == 1]
others = live[live.birthord != 1]

assert len(live) == 9148
assert len(firsts) == 4413
assert len(others) == 4735

return live, firsts, others


def Summarize(live, firsts, others):
"""Print various summary statistics."""

mean = live.prglngth.mean()
var = live.prglngth.var()
std = live.prglngth.std()

print('Live mean', mean)
print('Live variance', var)
print('Live std', std)

mean1 = firsts.prglngth.mean()
mean2 = others.prglngth.mean()

var1 = firsts.prglngth.var()
var2 = others.prglngth.var()

print('Mean')
print('First babies', mean1)
print('Others', mean2)

print('Variance')
print('First babies', var1)
print('Others', var2)

print('Difference in weeks', mean1 - mean2)
print('Difference in hours', (mean1 - mean2) * 7 * 24)

print('Difference relative to 39 weeks', (mean1 - mean2) / 39 * 100)

d = thinkstats2.CohenEffectSize(firsts.prglngth, others.prglngth)
print('Cohen d', d)


def PrintExtremes(live):
"""Plots the histogram of pregnancy lengths and prints the extremes.
live: DataFrame of live births
"""
hist = thinkstats2.Hist(live.prglngth)
thinkplot.Hist(hist, label='live births')

thinkplot.Save(root='first_nsfg_hist_live',
title='Histogram',
xlabel='weeks',
ylabel='frequency')

print('Shortest lengths:')
for weeks, freq in hist.Smallest(10):
print(weeks, freq)

print('Longest lengths:')
for weeks, freq in hist.Largest(10):
print(weeks, freq)


def MakeHists(live):
"""Plot Hists for live births
live: DataFrame
others: DataFrame
"""
hist = thinkstats2.Hist(live.birthwgt_lb, label='birthwgt_lb')
thinkplot.Hist(hist)
thinkplot.Save(root='first_wgt_lb_hist',
xlabel='pounds',
ylabel='frequency',
axis=[-1, 14, 0, 3200])

hist = thinkstats2.Hist(live.birthwgt_oz, label='birthwgt_oz')
thinkplot.Hist(hist)
thinkplot.Save(root='first_wgt_oz_hist',
xlabel='ounces',
ylabel='frequency',
axis=[-1, 16, 0, 1200])

hist = thinkstats2.Hist(np.floor(live.agepreg), label='agepreg')
thinkplot.Hist(hist)
thinkplot.Save(root='first_agepreg_hist',
xlabel='years',
ylabel='frequency')

hist = thinkstats2.Hist(live.prglngth, label='prglngth')
thinkplot.Hist(hist)
thinkplot.Save(root='first_prglngth_hist',
xlabel='weeks',
ylabel='frequency',
axis=[-1, 53, 0, 5000])


def MakeComparison(firsts, others):
"""Plots histograms of pregnancy length for first babies and others.
firsts: DataFrame
others: DataFrame
"""
first_hist = thinkstats2.Hist(firsts.prglngth, label='first')
other_hist = thinkstats2.Hist(others.prglngth, label='other')

width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(first_hist, align='right', width=width)
thinkplot.Hist(other_hist, align='left', width=width)

thinkplot.Save(root='first_nsfg_hist',
title='Histogram',
xlabel='weeks',
ylabel='frequency',
axis=[27, 46, 0, 2700])


def main(script):
live, firsts, others = MakeFrames()

MakeHists(live)
PrintExtremes(live)
MakeComparison(firsts, others)
Summarize(live, firsts, others)


if __name__ == '__main__':
import sys
main(*sys.argv)


165 changes: 165 additions & 0 deletions solutions/nsfg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
"""This file contains code for use with "Think Stats",
by Allen B. Downey, available from greenteapress.com
Copyright 2010 Allen B. Downey
License: GNU GPLv3 http://www.gnu.org/licenses/gpl.html
"""

from __future__ import print_function, division

import sys
import numpy as np
import thinkstats2

from collections import defaultdict


def ReadFemResp(dct_file='2002FemResp.dct',
dat_file='2002FemResp.dat.gz',
nrows=None):
"""Reads the NSFG respondent data.
dct_file: string file name
dat_file: string file name
returns: DataFrame
"""
dct = thinkstats2.ReadStataDct(dct_file)
df = dct.ReadFixedWidth(dat_file, compression='gzip', nrows=nrows)
CleanFemResp(df)
return df


def CleanFemResp(df):
"""Recodes variables from the respondent frame.
df: DataFrame
"""
pass


def ReadFemPreg(dct_file='2002FemPreg.dct',
dat_file='2002FemPreg.dat.gz'):
"""Reads the NSFG pregnancy data.
dct_file: string file name
dat_file: string file name
returns: DataFrame
"""
dct = thinkstats2.ReadStataDct(dct_file)
df = dct.ReadFixedWidth(dat_file, compression='gzip')
CleanFemPreg(df)
return df


def CleanFemPreg(df):
"""Recodes variables from the pregnancy frame.
df: DataFrame
"""
# mother's age is encoded in centiyears; convert to years
df.agepreg /= 100.0

# birthwgt_lb contains at least one bogus value (51 lbs)
# replace with NaN
df.loc[df.birthwgt_lb > 20, 'birthwgt_lb'] = np.nan

# replace 'not ascertained', 'refused', 'don't know' with NaN
na_vals = [97, 98, 99]
df.birthwgt_lb.replace(na_vals, np.nan, inplace=True)
df.birthwgt_oz.replace(na_vals, np.nan, inplace=True)
df.hpagelb.replace(na_vals, np.nan, inplace=True)

df.babysex.replace([7, 9], np.nan, inplace=True)
df.nbrnaliv.replace([9], np.nan, inplace=True)

# birthweight is stored in two columns, lbs and oz.
# convert to a single column in lb
# NOTE: creating a new column requires dictionary syntax,
# not attribute assignment (like df.totalwgt_lb)
df['totalwgt_lb'] = df.birthwgt_lb + df.birthwgt_oz / 16.0

# due to a bug in ReadStataDct, the last variable gets clipped;
# so for now set it to NaN
df.cmintvw = np.nan


def ValidatePregnum(resp, preg):
"""Validate pregnum in the respondent file.
resp: respondent DataFrame
preg: pregnancy DataFrame
"""
# make the map from caseid to list of pregnancy indices
preg_map = MakePregMap(preg)

# iterate through the respondent pregnum series
for index, pregnum in resp.pregnum.iteritems():
caseid = resp.caseid[index]
indices = preg_map[caseid]

# check that pregnum from the respondent file equals
# the number of records in the pregnancy file
if len(indices) != pregnum:
print(caseid, len(indices), pregnum)
return False

return True


def MakePregMap(df):
"""Make a map from caseid to list of preg indices.
df: DataFrame
returns: dict that maps from caseid to list of indices into `preg`
"""
d = defaultdict(list)
for index, caseid in df.caseid.iteritems():
d[caseid].append(index)
return d


def main():
"""Tests the functions in this module.
script: string script name
"""
# read and validate the respondent file
resp = ReadFemResp()

assert(len(resp) == 7643)
assert(resp.pregnum.value_counts()[1] == 1267)

# read and validate the pregnancy file
preg = ReadFemPreg()
print(preg.shape)

assert len(preg) == 13593
assert preg.caseid[13592] == 12571
assert preg.pregordr.value_counts()[1] == 5033
assert preg.nbrnaliv.value_counts()[1] == 8981
assert preg.babysex.value_counts()[1] == 4641
assert preg.birthwgt_lb.value_counts()[7] == 3049
assert preg.birthwgt_oz.value_counts()[0] == 1037
assert preg.prglngth.value_counts()[39] == 4744
assert preg.outcome.value_counts()[1] == 9148
assert preg.birthord.value_counts()[1] == 4413
assert preg.agepreg.value_counts()[22.75] == 100
assert preg.totalwgt_lb.value_counts()[7.5] == 302

weights = preg.finalwgt.value_counts()
key = max(weights.keys())
assert preg.finalwgt.value_counts()[key] == 6

# validate that the pregnum column in `resp` matches the number
# of entries in `preg`
assert(ValidatePregnum(resp, preg))


print('All tests passed.')


if __name__ == '__main__':
main()

0 comments on commit 633185b

Please sign in to comment.