From 5a2d7e3d7173d68bf9716b5b05f1fcec6c29156b Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Tue, 13 Aug 2019 12:30:40 +0100 Subject: [PATCH 01/15] Added practice join date. --- savsnet/prev_ts_GP.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/savsnet/prev_ts_GP.py b/savsnet/prev_ts_GP.py index f8e705f..4ff896b 100644 --- a/savsnet/prev_ts_GP.py +++ b/savsnet/prev_ts_GP.py @@ -95,11 +95,18 @@ def extractData(dat,species,condition): N -- total number of cases seen weekly for species """ - dat = dat[dat.Species==species] - dates = pd.DatetimeIndex(dat.Date) - minDate = np.min(dates) - week = (dates-minDate).days // 7 - + dat = dat.copy()[dat.Species==species] + dat.Date = pd.DatetimeIndex(dat.Date) + + # Throw out recently added practices + byPractice = dat.groupby(['Practice_ID']) + firstAppear = byPractice['Date'].agg([['mindate','min']]) + dat = pd.merge(dat, firstAppear, how='left', on='Practice_ID') + dat = dat[dat.mindate<'2018-06-01'] + + # Aggregate by week + minDate = np.min(dat.Date) + week = (pd.DatetimeIndex(dat.Date)-minDate).days // 7 dat = dat.groupby(week) aggTable = dat['Consult_reason'].agg([['cases',lambda x: np.sum(x==condition)], ['N',len]]) @@ -165,7 +172,7 @@ def predProb(y, ystar): mcmc_iter=args.iterations[0]) filename = "%s_%s.pkl" % (species, condition) if args.prefix is not None: - filename = "%s_%s" % (args.prefix,filename) + filename = "%s%s" % (args.prefix,filename) print ("Saving '%s'" % filename) with open(filename, 'wb') as f: pickle.dump(res,f,pickle.HIGHEST_PROTOCOL) From cfc4b26ab77fbe8b61638d3be782ad9cd69a5b35 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Thu, 15 Aug 2019 12:42:06 +0100 Subject: [PATCH 02/15] Cleaned code. Added tests. 
--- requirements.txt | 3 + savsnet/prev_ts_GP.py | 112 ++++++++++++++++++---------------- tests/test_data_extraction.py | 18 ++++++ tests/test_prediction.py | 1 + 4 files changed, 83 insertions(+), 51 deletions(-) create mode 100644 requirements.txt create mode 100644 tests/test_data_extraction.py create mode 100644 tests/test_prediction.py diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c49b4b9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +pandas +theano +pymc3 diff --git a/savsnet/prev_ts_GP.py b/savsnet/prev_ts_GP.py index 4ff896b..a6d3864 100644 --- a/savsnet/prev_ts_GP.py +++ b/savsnet/prev_ts_GP.py @@ -1,6 +1,7 @@ #!/usr/bin/python -o # SAVSNet model +import sys import pandas as pd import numpy as np import pymc3 as pm @@ -9,7 +10,7 @@ import theano.tensor.slinalg as tla -def BinomGP(y, N, X, X_star,mcmc_iter,start={}): +def BinomGP(y, N, X, prac_idx, X_star, mcmc_iter, start={}): """Fits a logistic Gaussian process regression timeseries model. Details @@ -31,6 +32,7 @@ def BinomGP(y, N, X, X_star,mcmc_iter,start={}): y -- a vector of cases N -- a vector of number tested X -- a vector of times at which y is observed + T -- a vector of time since recruitment X_star -- a vector of times at which predictons are to be made start -- a dictionary of starting values for the MCMC. @@ -38,32 +40,32 @@ def BinomGP(y, N, X, X_star,mcmc_iter,start={}): ======= A tuple of (model,trace,pred) where model is a PyMC3 Model object, trace is a PyMC3 Multitrace object, and pred is a 5000 x X_star.shape[0] matrix of draws from the posterior predictive distribution of $\pi_t$. """ - - X = np.array(X)[:,None] # Inputs must be arranged as column vector - X_star = X_star[:,None] + prac_idx = np.array(prac_idx) + n_prac = np.unique(prac_idx).shape[0] + X = np.array(X)[:, None] # Inputs must be arranged as column vector + X_star = X_star[:, None] model = pm.Model() with model: - - alpha = pm.Normal('alpha',0, 1000, testval=0.) 
- beta = pm.Normal('beta',0, 100, testval=0.) - sigmasq_s = pm.Gamma('sigmasq_s',.1,.1,testval=0.1) - phi_s = pm.Gamma('phi_s', 1., 1., testval=0.5) - tau2 = pm.Gamma('tau2',.1,.1,testval=0.1) + alpha = pm.Normal('alpha', 0, 1000, testval=0.) + beta = pm.Normal('beta', 0, 100, testval=0.) + sigmasq_s = pm.Gamma('sigmasq_s', .1, .1, testval=0.1) + phi_s = pm.Gamma('phi_s', 1., 1., testval=0.5) + tau2 = pm.Gamma('tau2', .1, .1, testval=0.1) # Construct GPs - cov_s = sigmasq_s * pm.gp.cov.Periodic(1,365.,phi_s) + cov_s = sigmasq_s * pm.gp.cov.Periodic(1, 365., phi_s) mean_f = pm.gp.mean.Linear(coeffs=beta, intercept=alpha) - gp_s = pm.gp.Latent(mean_func=mean_f,cov_func=cov_s) - + gp_s = pm.gp.Latent(mean_func=mean_f, cov_func=cov_s) + cov_nugget = pm.gp.cov.WhiteNoise(tau2) nugget = pm.gp.Latent(cov_func=cov_nugget) gp = gp_s + nugget model.gp = gp - s = gp.prior('s',X=X) - + s = gp.prior('s', X=X) + Y_obs = pm.Binomial('y_obs', N, pm.invlogit(s), observed=y) # Sample @@ -72,12 +74,12 @@ def BinomGP(y, N, X, X_star,mcmc_iter,start={}): start=start) # Predictions s_star = gp.conditional('s_star', X_star) - pred = pm.sample_ppc(trace, vars=[s_star,Y_obs]) + pred = pm.sample_ppc(trace, vars=[s_star, Y_obs]) - return (model,trace,pred) + return model, trace, pred -def extractData(dat,species,condition): +def extractData(dat, species, condition): """Extracts weekly records for 'condition' in 'species'. 
Parameters @@ -95,27 +97,34 @@ def extractData(dat,species,condition): N -- total number of cases seen weekly for species """ - dat = dat.copy()[dat.Species==species] + dat = dat.copy()[dat['Species'] == species] dat.Date = pd.DatetimeIndex(dat.Date) - # Throw out recently added practices - byPractice = dat.groupby(['Practice_ID']) - firstAppear = byPractice['Date'].agg([['mindate','min']]) - dat = pd.merge(dat, firstAppear, how='left', on='Practice_ID') - dat = dat[dat.mindate<'2018-06-01'] + # # Throw out recently added practices + # byPractice = dat.groupby(['Practice_ID']) + # firstAppear = byPractice['Date'].agg([['mindate','min']]) + # dat = pd.merge(dat, firstAppear, how='left', on='Practice_ID') + # dat = dat[dat.mindate<'2018-06-01'] - # Aggregate by week + # Turn date into days, and quantize into weeks minDate = np.min(dat.Date) - week = (pd.DatetimeIndex(dat.Date)-minDate).days // 7 - dat = dat.groupby(week) - aggTable = dat['Consult_reason'].agg([['cases',lambda x: np.sum(x==condition)], - ['N',len]]) + day = ((pd.DatetimeIndex(dat.Date) - minDate).days // 7) * 7 + day.name = 'Day' + dat['day'] = day - aggTable['day'] = aggTable.index * 7 - aggTable['date'] = [minDate + pd.Timedelta(dt*7, unit='D') for dt in aggTable.index] - - return (aggTable) + # Calculate the earliest day for each practice + day0 = dat.groupby(dat['Practice_ID'])['day'].agg([['day0', np.min]]) + # Now group by Practice/day combination + dat = dat.groupby(by=[dat['Practice_ID'], day]) + aggTable = dat['Consult_reason'].agg([['cases', lambda x: np.sum(x == condition)], + ['N', len]]) + aggTable = aggTable.reset_index() + aggTable['date'] = [minDate + pd.Timedelta(dt, unit='D') for dt in aggTable['Day']] + aggTable = pd.merge(aggTable, day0, how='left', on='Practice_ID') + aggTable['time_since_recruitment'] = aggTable['Day'] - aggTable['day0'] + aggTable['Practice_ID'] = aggTable['Practice_ID'].astype('category') + return aggTable def predProb(y, ystar): @@ -135,28 +144,23 @@ def 
predProb(y, ystar): return q - - - - - -if __name__=='__main__': +if __name__ == '__main__': import argparse import pickle parser = argparse.ArgumentParser(description='Fit Binomial GP timeseries model') - parser.add_argument("data", nargs=1, type=str, - help="Input data file with (at least) columns Date, Species, Consult_reason") - parser.add_argument("-o", "--prefix", dest="prefix",type=str,default=None, + parser.add_argument("data", nargs=1, type=str, + help="Input data file with (at least) columns Date, Species, Consult_reason") + parser.add_argument("-o", "--prefix", dest="prefix", type=str, default=None, help="Output file prefix [optional].") - parser.add_argument("-c", "--condition", dest="condition",nargs='+',type=str, + parser.add_argument("-c", "--condition", dest="condition", nargs='+', type=str, required=True, help="One or more space-separated conditions to analyse") - parser.add_argument("-s", "--species", dest="species",nargs='+',type=str, + parser.add_argument("-s", "--species", dest="species", nargs='+', type=str, required=True, help="One or more space-separated species to analyse") - parser.add_argument("-i", "--iterations", dest="iterations", type=int, + parser.add_argument("-i", "--iterations", dest="iterations", type=int, default=5000, nargs=1, help="Number of MCMC iterations (default 5000)") args = parser.parse_args() @@ -165,14 +169,20 @@ def predProb(y, ystar): for species in args.species: for condition in args.condition: - print ("Running GP smoother for condition '%s' in species '%s'" % (condition,species)) + print("Running GP smoother for condition '%s' in species '%s'" % (condition, species)) + sys.stdout.write("Extracting data...") + sys.stdout.flush() d = extractData(data, species, condition) - res = BinomGP(np.array(d.cases),np.array(d.N), - np.array(d.day), np.array(d.day), + sys.stdout.write("done\nCalculating...") + sys.stdout.flush() + res = BinomGP(np.array(d.cases), np.array(d.N), + np.array(d.Day), 
np.array(d.Practice_ID.cat.codes), np.array(d.Day), mcmc_iter=args.iterations[0]) + sys.stdout.write("done\n") + sys.stdout.flush() filename = "%s_%s.pkl" % (species, condition) if args.prefix is not None: - filename = "%s%s" % (args.prefix,filename) - print ("Saving '%s'" % filename) + filename = "%s%s" % (args.prefix, filename) + print("Saving '%s'" % filename) with open(filename, 'wb') as f: - pickle.dump(res,f,pickle.HIGHEST_PROTOCOL) + pickle.dump(res, f, pickle.HIGHEST_PROTOCOL) diff --git a/tests/test_data_extraction.py b/tests/test_data_extraction.py new file mode 100644 index 0000000..ab4d1bb --- /dev/null +++ b/tests/test_data_extraction.py @@ -0,0 +1,18 @@ +# Data extraction tests + +import unittest +import os +import pandas as pd + +from savsnet.prev_ts_GP import extractData + +class TestDataExtraction(unittest.TestCase): + + def setUp(self): + test_dir = os.path.dirname(os.path.realpath(__file__)) + self.data_path = os.path.join(test_dir, "../data/savsnet_2019_08_06.csv") + + def test_extract(self): + data_csv = pd.read_csv(self.data_path) + data = extractData(data_csv, 'cat', 'gastroenteric') + print(data) \ No newline at end of file diff --git a/tests/test_prediction.py b/tests/test_prediction.py new file mode 100644 index 0000000..f61c288 --- /dev/null +++ b/tests/test_prediction.py @@ -0,0 +1 @@ +from prev_ts_GP \ No newline at end of file From b36bbddb416136b71b001ebef335c33e804cf5c9 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Fri, 16 Aug 2019 14:40:59 +0100 Subject: [PATCH 03/15] Random effects model for Practice_ID working. Now need to sort out the plotting. 
--- savsnet/plot_ts_gp.py | 2 +- savsnet/prev_ts_GP.py | 45 ++++++++++++++++++++++++++++--------------- 2 files changed, 30 insertions(+), 17 deletions(-) diff --git a/savsnet/plot_ts_gp.py b/savsnet/plot_ts_gp.py index 9d61c64..7dc4548 100644 --- a/savsnet/plot_ts_gp.py +++ b/savsnet/plot_ts_gp.py @@ -88,7 +88,7 @@ def plotAllPred(data,predictions,species,condition,lag=None): for i in range(nConditions): for j in range(nSpecies): - plotPrediction(ax[i,j], data[i][j].day, + plotPrediction(ax[i,j], data[i][j].Day, data[i][j].cases,data[i][j].N, predictions[i][j]['s_star'], np.min(data[i][j].date),lag=lag,prev_mult=1000) diff --git a/savsnet/prev_ts_GP.py b/savsnet/prev_ts_GP.py index a6d3864..7537940 100644 --- a/savsnet/prev_ts_GP.py +++ b/savsnet/prev_ts_GP.py @@ -10,7 +10,15 @@ import theano.tensor.slinalg as tla -def BinomGP(y, N, X, prac_idx, X_star, mcmc_iter, start={}): +def match(x, y): + """Returns the positions of the first occurrence of values of x in y.""" + def indexof(a, b): + i = np.where(b == a)[0] + return i if i.shape[0] > 0 else [np.nan] + return np.concatenate([indexof(xx, y) for xx in x]) + + +def BinomGP(y, N, X, prac_id, X_star, mcmc_iter, start={}): """Fits a logistic Gaussian process regression timeseries model. Details @@ -40,40 +48,45 @@ def BinomGP(y, N, X, prac_idx, X_star, mcmc_iter, start={}): ======= A tuple of (model,trace,pred) where model is a PyMC3 Model object, trace is a PyMC3 Multitrace object, and pred is a 5000 x X_star.shape[0] matrix of draws from the posterior predictive distribution of $\pi_t$. """ - prac_idx = np.array(prac_idx) - n_prac = np.unique(prac_idx).shape[0] + X = np.array(X)[:, None] # Inputs must be arranged as column vector - X_star = X_star[:, None] + X = X - np.mean(X) + T = np.unique(X) + prac_unique = np.unique(prac_id) model = pm.Model() with model: alpha = pm.Normal('alpha', 0, 1000, testval=0.) beta = pm.Normal('beta', 0, 100, testval=0.) 
- sigmasq_s = pm.Gamma('sigmasq_s', .1, .1, testval=0.1) - phi_s = pm.Gamma('phi_s', 1., 1., testval=0.5) - tau2 = pm.Gamma('tau2', .1, .1, testval=0.1) + sigmasq_s = pm.HalfNormal('sigmasq_s', 5., testval=0.1) + phi_s = 0.16 #pm.HalfNormal('phi_s', 5., testval=0.5) + tau2 = pm.HalfNormal('tau2', 5., testval=0.1) # Construct GPs cov_s = sigmasq_s * pm.gp.cov.Periodic(1, 365., phi_s) mean_f = pm.gp.mean.Linear(coeffs=beta, intercept=alpha) - gp_s = pm.gp.Latent(mean_func=mean_f, cov_func=cov_s) + gp_t = pm.gp.Latent(mean_func=mean_f, cov_func=cov_s) - cov_nugget = pm.gp.cov.WhiteNoise(tau2) - nugget = pm.gp.Latent(cov_func=cov_nugget) + # cov_nugget = pm.gp.cov.WhiteNoise(tau2) + # nugget = pm.gp.Latent(cov_func=cov_nugget) - gp = gp_s + nugget - model.gp = gp - s = gp.prior('s', X=X) + s_t = gp_t.prior('gp_t', X=T[:, None]) # + nugget + + u = pm.Normal('u', mu=0., sd=1., shape=prac_unique.shape[0]) + u_i = tt.sqrt(tau2) * u + + s = s_t[match(X.flatten(), T)] + u_i[match(prac_id, prac_unique)] Y_obs = pm.Binomial('y_obs', N, pm.invlogit(s), observed=y) # Sample trace = pm.sample(mcmc_iter, chains=1, - start=start) + start=start, + tune=500) # Predictions - s_star = gp.conditional('s_star', X_star) + s_star = gp_t.conditional('s_star', T[:, None]) pred = pm.sample_ppc(trace, vars=[s_star, Y_obs]) return model, trace, pred @@ -99,7 +112,7 @@ def extractData(dat, species, condition): """ dat = dat.copy()[dat['Species'] == species] dat.Date = pd.DatetimeIndex(dat.Date) - + dat = dat[dat.Date > '2016-08-01'] # # Throw out recently added practices # byPractice = dat.groupby(['Practice_ID']) # firstAppear = byPractice['Date'].agg([['mindate','min']]) From 5be56e0fd5fd01de66e2ef4dd86d99e26e0133bd Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Thu, 13 Feb 2020 10:32:36 +0000 Subject: [PATCH 04/15] Minor changes. 
--- savsnet/prev_ts_GP.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/savsnet/prev_ts_GP.py b/savsnet/prev_ts_GP.py index a6d3864..84d0f93 100644 --- a/savsnet/prev_ts_GP.py +++ b/savsnet/prev_ts_GP.py @@ -40,8 +40,8 @@ def BinomGP(y, N, X, prac_idx, X_star, mcmc_iter, start={}): ======= A tuple of (model,trace,pred) where model is a PyMC3 Model object, trace is a PyMC3 Multitrace object, and pred is a 5000 x X_star.shape[0] matrix of draws from the posterior predictive distribution of $\pi_t$. """ - prac_idx = np.array(prac_idx) n_prac = np.unique(prac_idx).shape[0] + print(f"Received {n_prac} practices") X = np.array(X)[:, None] # Inputs must be arranged as column vector X_star = X_star[:, None] @@ -62,9 +62,12 @@ def BinomGP(y, N, X, prac_idx, X_star, mcmc_iter, start={}): cov_nugget = pm.gp.cov.WhiteNoise(tau2) nugget = pm.gp.Latent(cov_func=cov_nugget) + sigma_u = pm.HalfNormal('sigma_u', 5.) + u = pm.Normal('u', alpha, sigma_u, shape=n_prac) + gp = gp_s + nugget model.gp = gp - s = gp.prior('s', X=X) + s = gp.prior('s', X=X) + u[prac_idx] Y_obs = pm.Binomial('y_obs', N, pm.invlogit(s), observed=y) From 967f909b61e9ebced72f84524b588db11fad2880 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Sun, 16 Feb 2020 09:28:15 +0000 Subject: [PATCH 05/15] Added logistic2D --- savsnet/logistic2D.py | 81 +++++++++++++++++++++++++++++++++++++++++++ savsnet/prev_ts_GP.py | 42 ++++++++++++---------- 2 files changed, 104 insertions(+), 19 deletions(-) create mode 100644 savsnet/logistic2D.py diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py new file mode 100644 index 0000000..f6d4e63 --- /dev/null +++ b/savsnet/logistic2D.py @@ -0,0 +1,81 @@ +"""Functions to perform 2D logistic kriging""" + +import numpy as np +import theano as t +import theano.tensor as tt +from theano.tensor.nlinalg import matrix_inverse +import pymc3 as pm +from pymc3.gp.util import stabilize +from pymc3.gp.util import kmeans_inducing_points + +class Data: + def 
__init__(self, y, coords_obs, coords_knots): + self.y = np.squeeze(np.array(y)) + self.coords = np.array(coords_obs) + self.knots = np.array(coords_knots) + assert y.shape[0] == coords_obs.shape[0] + assert self.coords.shape[1] == 2 + assert self.knots.shape[1] == 2 + + def __len__(self): + return self.y.shape[0] + + + +class Logistic2D(pm.Model): + def __init__(self, y, coords, n_knots, name='', model=None): + super().__init__(name, model) + + alpha = pm.Normal('alpha', mu=0., sd=1.) + sigma_sq = pm.Gamma('sigma_sq', 1., 1.) + phi = pm.Gamma('phi', 1., 1.) + + # Inducing points + knots = kmeans_inducing_points(n_knots, coords) + spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) + spatial_gp = pm.gp.Latent(cov_func=spatial_cov) + inducing = spatial_gp.prior('k', X=knots) + + # Project inducing points onto real points + cov_kk = spatial_cov(knots) + kk_inducing = tt.dot(matrix_inverse(stabilize(cov_kk)), inducing) + cov_sk = spatial_cov(coords, knots) + s = tt.dot(cov_sk, kk_inducing) + + eta = alpha + s + y_rv = pm.Bernoulli('y', pm.invlogit(eta), observed=y) + + + def sample(self, *args, **kwargs): + with self: + return pm.sample(*args, **kwargs) + + +if __name__ == '__main__': + + import argparse + import pickle as pkl + import pandas as pd + + parser = argparse.ArgumentParser(description='Fit Binomial GP timeseries model') + parser.add_argument("data", nargs=1, type=str, + help="Input data file with (at least) columns Date, Species, Consult_reason") + args = parser.parse_args() + + data = pd.read_csv(args.data[0]) + + data = data[('2020-01-01' <= data['consult_date']) & (data['consult_date'] < '2020-01-07')] + print(data.columns) + + y = np.array(data['gi_mpc_selected']).astype(np.int) + coords = np.array(data[['Easting','Northing']], dtype='float32') + + model = Logistic2D(y, coords, n_knots=300) + trace = model.sample(5000, chains=1) + with model: + pm.traceplot(trace) + with open('vom_dog_post.pkl','wb') as f: + pkl.dump(trace, f) + + + diff --git 
a/savsnet/prev_ts_GP.py b/savsnet/prev_ts_GP.py index bffa1b4..250aae0 100644 --- a/savsnet/prev_ts_GP.py +++ b/savsnet/prev_ts_GP.py @@ -49,9 +49,6 @@ def BinomGP(y, N, X, prac_id, X_star, mcmc_iter, start={}): A tuple of (model,trace,pred) where model is a PyMC3 Model object, trace is a PyMC3 Multitrace object, and pred is a 5000 x X_star.shape[0] matrix of draws from the posterior predictive distribution of $\pi_t$. """ - n_prac = np.unique(prac_idx).shape[0] - print(f"Received {n_prac} practices") - X = np.array(X)[:, None] # Inputs must be arranged as column vector X = X - np.mean(X) # Center covariates T = np.unique(X) @@ -67,27 +64,37 @@ def BinomGP(y, N, X, prac_id, X_star, mcmc_iter, start={}): tau2 = pm.HalfNormal('tau2', 5., testval=0.1) # Construct GPs - cov_s = sigmasq_s * pm.gp.cov.Periodic(1, 365., phi_s) - mean_f = pm.gp.mean.Linear(coeffs=beta, intercept=alpha) - gp_t = pm.gp.Latent(mean_func=mean_f, cov_func=cov_s) + cov_t = sigmasq_s * pm.gp.cov.Periodic(1, 365., phi_s) + mean_t = pm.gp.mean.Linear(coeffs=beta, intercept=alpha) + gp_t = pm.gp.Latent(mean_func=mean_t, cov_func=cov_t) + + #cov_nugget = pm.gp.cov.WhiteNoise(tau2) + #nugget = pm.gp.Latent(cov_func=cov_nugget) - cov_nugget = pm.gp.cov.WhiteNoise(tau2) - nugget = pm.gp.Latent(cov_func=cov_nugget) + s_t = gp_t.prior('gp_t', X=T[:, None]) # + nugget - sigma_u = pm.HalfNormal('sigma_u', 5.) 
- u = pm.Normal('u', alpha, sigma_u, shape=n_prac) + u = pm.Normal('u', mu=0., sd=1., shape=prac_unique.shape[0]) + u_i = tt.sqrt(tau2) * u - gp = gp_s + nugget - model.gp = gp - s = gp.prior('s', X=X) + u[prac_idx] + s = s_t[match(X.flatten(), T)] + u_i[match(prac_id, prac_unique)] Y_obs = pm.Binomial('y_obs', N, pm.invlogit(s), observed=y) # Sample + step_u = pm.Metropolis([u]) + step_gp_t = pm.Metropolis([s_t]) + step_tau2 = pm.Metropolis([tau2, sigmasq_s]) + step_beta = pm.Metropolis([alpha, beta]) trace = pm.sample(mcmc_iter, chains=1, start=start, - tune=500) + tune=500, + step=[step_u, step_gp_t, step_tau2, step_beta], + #max_tree_depth=6, + #early_max_treedepth=4, + #t0=100, + #target_accept=0.99, + adapt_step_size=True) # Predictions s_star = gp_t.conditional('s_star', T[:, None]) pred = pm.sample_ppc(trace, vars=[s_star, Y_obs]) @@ -116,11 +123,8 @@ def extractData(dat, species, condition): dat = dat.copy()[dat['Species'] == species] dat.Date = pd.DatetimeIndex(dat.Date) dat = dat[dat.Date > '2016-08-01'] - # # Throw out recently added practices - # byPractice = dat.groupby(['Practice_ID']) - # firstAppear = byPractice['Date'].agg([['mindate','min']]) - # dat = pd.merge(dat, firstAppear, how='left', on='Practice_ID') - # dat = dat[dat.mindate<'2018-06-01'] + dat = dat[dat.Date <= '2019-08-01'] + # Turn date into days, and quantize into weeks minDate = np.min(dat.Date) From 8d705fe6655090cb924ba13c60f7b7f78caab91a Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Mon, 17 Feb 2020 11:21:47 +0000 Subject: [PATCH 06/15] Algorithm to do logistic kriging with raster prediction. 
--- requirements.txt | 4 ++ savsnet/gis_util.py | 80 +++++++++++++++++++++++++++++++ savsnet/logistic2D.py | 108 +++++++++++++++++++++++++++++------------- savsnet/raster.py | 18 +++++++ 4 files changed, 176 insertions(+), 34 deletions(-) create mode 100644 savsnet/gis_util.py create mode 100644 savsnet/raster.py diff --git a/requirements.txt b/requirements.txt index c49b4b9..60fd778 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,7 @@ +numpy +matplotlib +rasterio +geopandas pandas theano pymc3 diff --git a/savsnet/gis_util.py b/savsnet/gis_util.py new file mode 100644 index 0000000..ddc933c --- /dev/null +++ b/savsnet/gis_util.py @@ -0,0 +1,80 @@ +# Raster generator +import numpy as np +import rasterio +import rasterio.mask +from rasterio.io import MemoryFile +from rasterio.transform import from_origin +from rasterio.plot import show +import matplotlib.pyplot as plt + +def make_masked_raster(polygon, resolution, bands=1, all_touched=False, + nodata=-9999., crs='+init=epsg:27700', filename=None): + """Generates a raster with points outside poly set to nodata. 
Pixels + are set to be of dimension res x res""" + x = np.arange(polygon.bounds['minx'].values, polygon.bounds['maxx'].values, resolution) + y = np.arange(polygon.bounds['miny'].values, polygon.bounds['maxy'].values, resolution) + X,Y = np.meshgrid(x,y) + Z = np.ones(shape=X.shape) + transform = from_origin(x[0] - resolution/2, y[-1]+resolution/2, resolution, resolution) + + raster_args = {'driver': 'GTiff', + 'height': Z.shape[0], + 'width': Z.shape[1], + 'count': bands, + 'dtype': Z.dtype, + 'crs': crs, + 'transform': transform, + 'nodata': nodata} + + if filename is None: + memfile = MemoryFile() + raster = memfile.open(**raster_args) + else: + raster = rasterio.open(filename, 'w+', **raster_args) + + for i in range(bands): + raster.write(Z, i+1) + + mask = rasterio.mask.mask(raster, polygon.geometry, crop=False, all_touched=all_touched) + raster.write_mask(mask[0][0]) + return raster + + +def raster2coords(raster): + x, y = np.meshgrid(np.arange(raster.shape[0]), np.arange(raster.shape[1])) + coords = np.array([raster.xy(x, y) for x, y in zip(x.ravel(), y.ravel())]) + return coords[raster.read_masks(1).T.ravel().astype(np.bool), :] + + +def fill_raster(data, raster, band=1): + r = raster.read(1, masked=True).T.flatten() + r[~r.mask.flatten()] = data + r = r.reshape([raster.shape[1], raster.shape[0]]).T + raster.write(r, band) + + +def gen_raster(posterior, poly, filename=None, + summary=lambda x: np.mean(x, axis=0)): + raster = make_masked_raster(polygon=poly, resolution=5000., + bands=1, filename=filename) + + fill_raster(summary(posterior), + raster, 1) + + raster.update_tags(1, surface='s_star') + raster.close() + + return raster + + +def plot_raster(ax, raster, transform, title=None, z_range=[None, None], log=False, alpha=1.0): + + show(raster, transform=transform, vmin=z_range[0], vmax=z_range[1], ax=ax, cmap='coolwarm', alpha=alpha) + ax.set_title(title) + ax.axis('off') + cb = plt.colorbar(ax.images[1], ax=ax, shrink=0.7) + if log is True: + ticks = 
cb.get_ticks() + cb.set_ticks(ticks) + cb.set_ticklabels(np.round(np.exp(ticks), 1)) + return ax, cb diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py index f6d4e63..0e9efc7 100644 --- a/savsnet/logistic2D.py +++ b/savsnet/logistic2D.py @@ -1,12 +1,18 @@ """Functions to perform 2D logistic kriging""" +import sys + +import geopandas as gp import numpy as np +import pymc3 as pm import theano as t import theano.tensor as tt -from theano.tensor.nlinalg import matrix_inverse -import pymc3 as pm from pymc3.gp.util import stabilize -from pymc3.gp.util import kmeans_inducing_points +from theano.tensor.nlinalg import matrix_inverse + +from savsnet.gis_util import make_masked_raster, raster2coords +from savsnet.raster import make_raster + class Data: def __init__(self, y, coords_obs, coords_knots): @@ -21,61 +27,95 @@ def __len__(self): return self.y.shape[0] - -class Logistic2D(pm.Model): - def __init__(self, y, coords, n_knots, name='', model=None): - super().__init__(name, model) - +def logistic2D(y, N, coords, pred_coords): + model = pm.Model() + with model: alpha = pm.Normal('alpha', mu=0., sd=1.) sigma_sq = pm.Gamma('sigma_sq', 1., 1.) - phi = pm.Gamma('phi', 1., 1.) + phi = pm.Gamma('phi', 5., 1.) 
- # Inducing points - knots = kmeans_inducing_points(n_knots, coords) spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) spatial_gp = pm.gp.Latent(cov_func=spatial_cov) - inducing = spatial_gp.prior('k', X=knots) - - # Project inducing points onto real points - cov_kk = spatial_cov(knots) - kk_inducing = tt.dot(matrix_inverse(stabilize(cov_kk)), inducing) - cov_sk = spatial_cov(coords, knots) - s = tt.dot(cov_sk, kk_inducing) + s = spatial_gp.prior('s', X=coords) eta = alpha + s - y_rv = pm.Bernoulli('y', pm.invlogit(eta), observed=y) + y_rv = pm.Binomial('y', n=N, p=pm.invlogit(eta), observed=y) + kxx = spatial_cov(coords) + kxxtx = matrix_inverse(stabilize(kxx)) + kxxs = tt.dot(kxxtx, s) + knew = spatial_cov(pred_coords, coords) + s_star = tt.dot(knew, kxxs) + s_star_ = pm.Deterministic('s_star', s_star) - def sample(self, *args, **kwargs): - with self: - return pm.sample(*args, **kwargs) + def sample_fn(*args, **kwargs): + with model: + trace = pm.sample(*args, **kwargs) + return {'model': model, 'posterior': trace} + return sample_fn -if __name__ == '__main__': +# Prediction + + + +def build_predictor(orig_coords, pred_coords): + t.config.compute_test_value = 'off' + sigma_sq = tt.dscalar('sigma_sq', ) + phi = tt.dscalar('phi') + s = tt.dvector('s') + spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) + cov_x = spatial_cov(orig_coords) + kxx = matrix_inverse(stabilize(cov_x)) + kxxs = tt.dot(kxx, s) + kss = spatial_cov(pred_coords, orig_coords) + s_star = tt.dot(kss, kxxs) + + return t.function([sigma_sq, phi, s], [s_star]) + + + +if __name__ == '__main__': import argparse import pickle as pkl import pandas as pd parser = argparse.ArgumentParser(description='Fit Binomial GP timeseries model') parser.add_argument("data", nargs=1, type=str, - help="Input data file with (at least) columns Date, Species, Consult_reason") + help="Input data file with (at least) columns consult_date, gi_mpc_selected, Easting, Northing") + parser.add_argument("--iterations", "-i", 
type=int, default=[5000], nargs=1, + help="Number of MCMC iterations") args = parser.parse_args() data = pd.read_csv(args.data[0]) + data['consult_date'] = pd.to_datetime(data['consult_date']) + week = pd.to_timedelta(7, unit='day') - data = data[('2020-01-01' <= data['consult_date']) & (data['consult_date'] < '2020-01-07')] - print(data.columns) + start = pd.to_datetime(['2019-12-02']) - y = np.array(data['gi_mpc_selected']).astype(np.int) - coords = np.array(data[['Easting','Northing']], dtype='float32') + for i in range(9): + end = start + week + d = data[(start[0] <= data['consult_date']) & (data['consult_date'] < end[0])] - model = Logistic2D(y, coords, n_knots=300) - trace = model.sample(5000, chains=1) - with model: - pm.traceplot(trace) - with open('vom_dog_post.pkl','wb') as f: - pkl.dump(trace, f) + d = d.groupby([d['Easting'], d['Northing']]) + aggr = d['gi_mpc_selected'].agg([['cases', np.sum], ['N', len]]) + aggr.reset_index(inplace=True) + + y = np.array(aggr['cases']) + N = np.array(aggr['N']) + coords = np.array(aggr[['Easting', 'Northing']], dtype='float32') + + # GIS + poly = gp.read_file('/home/jewellcp/Documents/GIS/gadm36_GBR.gpkg').to_crs(epsg=27700) + r = make_masked_raster(poly, 5000., crs='+init=epsg:27700') + pred_points = raster2coords(r) + sampler = logistic2D(y, N, coords / 1000., pred_points / 1000.) 
+ result = sampler(args.iterations[0], chains=1) + with open(f"vom_dog_week{i}.pkl", 'wb') as f: + pkl.dump(result, f) + start = end + sys.exit(0) diff --git a/savsnet/raster.py b/savsnet/raster.py new file mode 100644 index 0000000..076c2a8 --- /dev/null +++ b/savsnet/raster.py @@ -0,0 +1,18 @@ +"""Generate rasters for logistic 2D""" + +import pickle as pkl +import geopandas as gp +from savsnet.gis_util import gen_raster + + +def make_raster(s_star, filename): + poly = gp.read_file('/home/jewellcp/Documents/GIS/gadm36_GBR.gpkg').to_crs(epsg=27700) + gen_raster(s_star, poly, filename=filename) + + +if __name__ == '__main__': + + with open('../../vom_dog_post.pkl', 'rb') as f: + trace = pkl.load(f) + + make_raster(trace['posterior']['s_star'], 'vom_dog_2020-01-01.tiff') From 795ed1250bdc272de4d0a8e66b396b4abf789541 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Sun, 23 Feb 2020 15:37:32 +0000 Subject: [PATCH 07/15] Tidied timeseries plotting. Extended logistic2D to use inducing points. --- savsnet/logistic2D.py | 84 ++++++++++++---------------- savsnet/plot_ts_gp.py | 105 +++++++++++++++++------------------ savsnet/prev_ts_GP.py | 126 +++++++++++++++++++----------------------- 3 files changed, 143 insertions(+), 172 deletions(-) diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py index 0e9efc7..d90a567 100644 --- a/savsnet/logistic2D.py +++ b/savsnet/logistic2D.py @@ -27,7 +27,15 @@ def __len__(self): return self.y.shape[0] -def logistic2D(y, N, coords, pred_coords): +def project(x_val, x_coords, x_star_coords, cov_fn): + kxx = cov_fn(x_coords) + kxxtx = matrix_inverse(stabilize(kxx)) + kxxs = tt.dot(kxxtx, x_val) + knew = cov_fn(x_star_coords, x_coords) + return tt.dot(knew, kxxs) + + +def logistic2D(y, coords, knots, pred_coords): model = pm.Model() with model: alpha = pm.Normal('alpha', mu=0., sd=1.) 
@@ -36,17 +44,12 @@ def logistic2D(y, N, coords, pred_coords): spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) spatial_gp = pm.gp.Latent(cov_func=spatial_cov) - s = spatial_gp.prior('s', X=coords) + s = spatial_gp.prior('s', X=knots) - eta = alpha + s - y_rv = pm.Binomial('y', n=N, p=pm.invlogit(eta), observed=y) + eta = alpha + project(s, knots, coords, spatial_cov) + y_rv = pm.Bernoulli('y', p=pm.invlogit(eta), observed=y) - kxx = spatial_cov(coords) - kxxtx = matrix_inverse(stabilize(kxx)) - kxxs = tt.dot(kxxtx, s) - knew = spatial_cov(pred_coords, coords) - s_star = tt.dot(knew, kxxs) - s_star_ = pm.Deterministic('s_star', s_star) + s_star_ = pm.Deterministic('s_star', project(s, knots, pred_coords, spatial_cov)) def sample_fn(*args, **kwargs): with model: @@ -56,26 +59,6 @@ def sample_fn(*args, **kwargs): return sample_fn -# Prediction - - - -def build_predictor(orig_coords, pred_coords): - t.config.compute_test_value = 'off' - sigma_sq = tt.dscalar('sigma_sq', ) - phi = tt.dscalar('phi') - s = tt.dvector('s') - spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) - cov_x = spatial_cov(orig_coords) - kxx = matrix_inverse(stabilize(cov_x)) - kxxs = tt.dot(kxx, s) - kss = spatial_cov(pred_coords, orig_coords) - s_star = tt.dot(kss, kxxs) - - return t.function([sigma_sq, phi, s], [s_star]) - - - if __name__ == '__main__': import argparse import pickle as pkl @@ -84,38 +67,39 @@ def build_predictor(orig_coords, pred_coords): parser = argparse.ArgumentParser(description='Fit Binomial GP timeseries model') parser.add_argument("data", nargs=1, type=str, help="Input data file with (at least) columns consult_date, gi_mpc_selected, Easting, Northing") + parser.add_argument("--startdate", "-s", type=np.datetime64, nargs=1, help="Start date") + parser.add_argument("--period", "-p", type=int, default=[7], nargs=1, help="Period in days") parser.add_argument("--iterations", "-i", type=int, default=[5000], nargs=1, help="Number of MCMC iterations") args = 
parser.parse_args() data = pd.read_csv(args.data[0]) data['consult_date'] = pd.to_datetime(data['consult_date']) - week = pd.to_timedelta(7, unit='day') + week = pd.to_timedelta(args.period[0], unit='day') - start = pd.to_datetime(['2019-12-02']) + start = args.startdate[0] - for i in range(9): - end = start + week - d = data[(start[0] <= data['consult_date']) & (data['consult_date'] < end[0])] + end = start + week + d = data[(start <= data['consult_date']) & (data['consult_date'] < end)] - d = d.groupby([d['Easting'], d['Northing']]) - aggr = d['gi_mpc_selected'].agg([['cases', np.sum], ['N', len]]) - aggr.reset_index(inplace=True) + d = d.groupby([d['person_easting'], d['person_northing']]) + aggr = d['gi_mpc_selected'].agg([['case', lambda x: np.sum(x) > 0]]) + aggr.reset_index(inplace=True) - y = np.array(aggr['cases']) - N = np.array(aggr['N']) - coords = np.array(aggr[['Easting', 'Northing']], dtype='float32') + y = np.array(aggr['case']) + coords = np.array(aggr[['person_easting', 'person_northing']], dtype='float32') + knots = pm.gp.util.kmeans_inducing_points(300, coords) - # GIS - poly = gp.read_file('/home/jewellcp/Documents/GIS/gadm36_GBR.gpkg').to_crs(epsg=27700) - r = make_masked_raster(poly, 5000., crs='+init=epsg:27700') - pred_points = raster2coords(r) + # GIS + poly = gp.read_file('/home/jewellcp/Documents/GIS/gadm36_GBR.gpkg').to_crs(epsg=27700) + r = make_masked_raster(poly, 5000., crs='+init=epsg:27700') + pred_points = raster2coords(r) - sampler = logistic2D(y, N, coords / 1000., pred_points / 1000.) - result = sampler(args.iterations[0], chains=1) - with open(f"vom_dog_week{i}.pkl", 'wb') as f: - pkl.dump(result, f) + sampler = logistic2D(y, coords=coords/1000., knots=knots/1000., pred_coords=pred_points/1000.) 
+ result = sampler(args.iterations[0], chains=1) + with open(f"vom_dog_spatial_week{start}.pkl", 'wb') as f: + pkl.dump(result, f) - start = end + start = end sys.exit(0) diff --git a/savsnet/plot_ts_gp.py b/savsnet/plot_ts_gp.py index 7dc4548..de677c2 100644 --- a/savsnet/plot_ts_gp.py +++ b/savsnet/plot_ts_gp.py @@ -5,12 +5,16 @@ from matplotlib.ticker import FormatStrFormatter import pandas as pd import numpy as np -import pymc3 as pm import pickle -from prev_ts_GP import extractData -def plotPrediction(ax,X,y,N,pred,mindate,lag=None,prev_mult=1,plot_gp=False): +def clip_to_range(x, datetime_range): + mintime = np.min(datetime_range) + maxtime = np.max(datetime_range) + return x[(x.index >= mintime) & (x.index <= maxtime)] + + +def plotPrediction(ax, data, prev_mult=1, plot_gp=False): """Predictive time series plot with (by default) the prediction summarised as [0.01,0.05,0.5,0.95,0.99] quantiles, and observations colour-coded by tail-probability. @@ -32,23 +36,17 @@ def plotPrediction(ax,X,y,N,pred,mindate,lag=None,prev_mult=1,plot_gp=False): Nothing. 
Just modifies ax """ - from pymc3.gp.util import plot_gp_dist + # Prediction quantiles + phat = data['pred'].transpose() * prev_mult # TxM for T times and M mcmc iterations + pctiles = np.percentile(phat, [1, 5, 50, 95, 99], axis=1) - # Time slice - ts = slice(0,X.shape[0]) - if lag is not None: - ts = slice(X.shape[0]-lag, X.shape[0]) - # Data - x = np.array([mindate + pd.Timedelta(d,unit='D') for d in X[ts]]) - pbar = np.array(y/N)[ts] * prev_mult - - # Prediction quantiles - phat = pm.invlogit(pred[:,ts]).eval() * prev_mult - pctiles = np.percentile(phat, [1,5,50,95,99], axis=0) + pbar = data['data']['cases']/data['data']['N'] * prev_mult + pbar = clip_to_range(pbar, phat.index) # Tail probabilities for observed p - prp = np.sum(pbar > phat, axis=0)/phat.shape[0] + phat_interp = phat.apply(lambda x: np.interp(pbar.index, phat.index, x)) + prp = np.sum(pbar[:, None] > phat_interp, axis=1)/phat_interp.shape[1] prp[prp > .5] = 1. - prp[prp > .5] # Risk masks @@ -58,17 +56,33 @@ def plotPrediction(ax,X,y,N,pred,mindate,lag=None,prev_mult=1,plot_gp=False): # Construct plot if plot_gp is True: - plot_gp_dist(ax,phat,x,plot_samples=False, palette="Blues") + from pymc3.gp.util import plot_gp_dist + plot_gp_dist(ax,phat,phat.columns,plot_samples=False, palette="Blues") else: - ax.fill_between(x, pctiles[4,:], pctiles[0,:], color='lightgrey',alpha=.5,label="99% credible interval") - ax.fill_between(x, pctiles[3,:], pctiles[1,:], color='lightgrey',alpha=1,label='95% credible interval') - ax.plot(x, pctiles[2,:], c='grey', ls='-', label="Predicted prevalence") - ax.scatter(x[green],pbar[green],c='green',s=8,alpha=0.5,label='0.05 '2016-08-01'] - dat = dat[dat.Date <= '2019-08-01'] + if date_range is None: + date_range = [np.min(dat['Date']), np.max(dat['Date'])] + dat = dat[dat['Species'] == species] + dat['Date'] = pd.to_datetime(dat['Date']) + dat = dat[dat['Date'] >= date_range[0]] + dat = dat[dat['Date'] < date_range[1]] # Turn date into days, and quantize into weeks - 
minDate = np.min(dat.Date) - day = ((pd.DatetimeIndex(dat.Date) - minDate).days // 7) * 7 - day.name = 'Day' - dat['day'] = day - - # Calculate the earliest day for each practice - day0 = dat.groupby(dat['Practice_ID'])['day'].agg([['day0', np.min]]) + dat['day'] = (dat['Date'] - __refdate) // np.timedelta64(1,'W') * 7.0 # Ensure this is float - # Now group by Practice/day combination - dat = dat.groupby(by=[dat['Practice_ID'], day]) - aggTable = dat['Consult_reason'].agg([['cases', lambda x: np.sum(x == condition)], + # Now group by date combination + datgrp = dat.groupby(by='day') + aggTable = datgrp['Consult_reason'].agg([['cases', lambda x: np.sum(x == condition)], ['N', len]]) - aggTable = aggTable.reset_index() - aggTable['date'] = [minDate + pd.Timedelta(dt, unit='D') for dt in aggTable['Day']] - aggTable = pd.merge(aggTable, day0, how='left', on='Practice_ID') - aggTable['time_since_recruitment'] = aggTable['Day'] - aggTable['day0'] - aggTable['Practice_ID'] = aggTable['Practice_ID'].astype('category') + aggTable['day'] = np.array(aggTable.index) + aggTable.index = __refdate + pd.to_timedelta(aggTable.index, unit='D') return aggTable -def predProb(y, ystar): - """Calculates predictive tail probability of y wrt ystar. - - Parameters - ========== - y -- a vector of length n of observed values of y - ystar -- a m x n matrix containing draws from the joint distribution of ystar. - - Returns - ======= - A vector of tail probabilities for y wrt ystar. - """ - q = np.sum(y > ystar, axis=0) / ystar.shape[0] - q[q > .5] = 1. - q[q > .5] - return q +# def predProb(y, ystar): +# """Calculates predictive tail probability of y wrt ystar. +# +# Parameters +# ========== +# y -- a vector of length n of observed values of y +# ystar -- a m x n matrix containing draws from the joint distribution of ystar. +# +# Returns +# ======= +# A vector of tail probabilities for y wrt ystar. +# """ +# q = np.sum(y > ystar, axis=0) / ystar.shape[0] +# q[q > .5] = 1. 
- q[q > .5] +# return q if __name__ == '__main__': @@ -192,12 +173,19 @@ def predProb(y, ystar): print("Running GP smoother for condition '%s' in species '%s'" % (condition, species)) sys.stdout.write("Extracting data...") sys.stdout.flush() - d = extractData(data, species, condition) + d = extractData(data, species, condition, date_range=['2016-02-02','2019-12-01']) sys.stdout.write("done\nCalculating...") sys.stdout.flush() - res = BinomGP(np.array(d.cases), np.array(d.N), - np.array(d.Day), np.array(d.Practice_ID.cat.codes), np.array(d.Day), + + pred_range = pd.date_range(start='2018-02-14', end='2020-02-14', freq='W') + day_pred = (pred_range - __refdate) / np.timedelta64(1, 'D') + + res = BinomGP(np.array(d.cases, dtype=float), np.array(d.N, dtype=float), + np.array(d.day, dtype=float), np.array(day_pred, dtype=float), mcmc_iter=args.iterations[0]) + res['data'] = extractData(data, species, condition, date_range=['2018-02-14','2020-02-14']) + res['pred'] = pd.DataFrame(res['pred']['pi_star']) + res['pred'].columns = pd.Index(pred_range) sys.stdout.write("done\n") sys.stdout.flush() filename = "%s_%s.pkl" % (species, condition) From 8dcb15d2287664e22816dfd7446c11b675e5760b Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Mon, 24 Feb 2020 10:44:09 +0000 Subject: [PATCH 08/15] Automated weekly spatial analysis. 
--- logistic_krige.py | 57 +++++++++++++++++++++++++++++++++++++++++++ savsnet/logistic2D.py | 2 +- vomiting.sge | 18 ++++++++++++++ 3 files changed, 76 insertions(+), 1 deletion(-) create mode 100644 logistic_krige.py create mode 100644 vomiting.sge diff --git a/logistic_krige.py b/logistic_krige.py new file mode 100644 index 0000000..8429d09 --- /dev/null +++ b/logistic_krige.py @@ -0,0 +1,57 @@ +""" Performs logistic Kriging """ + +import sys +import argparse +import pickle as pkl +import numpy as np +import pandas as pd +import geopandas as gp +import pymc3 as pm + +from savsnet.logistic2D import logistic2D +from savsnet.gis_util import gen_raster, make_masked_raster, raster2coords + +if __name__ == '__main__': + import argparse + import pickle as pkl + import pandas as pd + + parser = argparse.ArgumentParser(description='Fit Binomial GP timeseries model') + parser.add_argument("data", nargs=1, type=str, + help="Input data file with (at least) columns consult_date, gi_mpc_selected, Easting, Northing") + parser.add_argument("--iterations", "-i", type=int, default=[5000], nargs=1, + help="Number of MCMC iterations") + parser.add_argument("--startdate", "-s", type=np.datetime64, nargs=1, help="Start date") + parser.add_argument("--period", "-d", type=int, default=[7], nargs=1, help="Period in days") + parser.add_argument("--polygon", "-p", type=str, nargs=1, help="Polygon file") + args = parser.parse_args() + + data = pd.read_csv(args.data[0]) + data['consult_date'] = pd.to_datetime(data['consult_date']) + period = pd.to_timedelta(args.period[0], unit='day') + + start = args.startdate[0] + end = start + period + + d = data[(start <= data['consult_date']) & (data['consult_date'] < end)] + + d = d.groupby([d['person_easting'], d['person_northing']]) + aggr = d['gi_mpc_selected'].agg([['case', lambda x: np.sum(x) > 0]]) + aggr.reset_index(inplace=True) + + y = np.array(aggr['case']) + coords = np.array(aggr[['person_easting', 'person_northing']], dtype='float32') + 
knots = pm.gp.util.kmeans_inducing_points(300, coords) + + # GIS + poly = gp.read_file(args.polygon[0]).to_crs(epsg=27700) + r = make_masked_raster(poly, 5000., crs='+init=epsg:27700') + pred_points = raster2coords(r) + + sampler = logistic2D(y, coords=coords/1000., knots=knots/1000., pred_coords=pred_points/1000.) + result = sampler(args.iterations[0], chains=1) + with open(f"posterior_week{start}.pkl", 'wb') as f: + pkl.dump(result, f) + + gen_raster(result['posterior']['s_star'], poly, filename=f"raster_{start}.tiff") + sys.exit(0) diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py index d90a567..29cb9eb 100644 --- a/savsnet/logistic2D.py +++ b/savsnet/logistic2D.py @@ -40,7 +40,7 @@ def logistic2D(y, coords, knots, pred_coords): with model: alpha = pm.Normal('alpha', mu=0., sd=1.) sigma_sq = pm.Gamma('sigma_sq', 1., 1.) - phi = pm.Gamma('phi', 5., 1.) + phi = pm.Gamma('phi', 2., 0.1) spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) spatial_gp = pm.gp.Latent(cov_func=spatial_cov) diff --git a/vomiting.sge b/vomiting.sge new file mode 100644 index 0000000..87234a0 --- /dev/null +++ b/vomiting.sge @@ -0,0 +1,18 @@ +#!/bin/bash + +#$ -j y +#$ -cwd +#$ -N savs-dogvom +#$ -l gpu=1 +#$ -pe smp 8 +#$ -S /bin/bash + +. /etc/profile + +conda activate savsnet + +weeks=("2019-12-02" "2019-12-09" "2019-12-16" "2019-12-23" "2019-12-30" "2020-01-6" "2020-01-13" "2020-01-20" "2020-01-27" "2020-01-3" "2020-01-10") + +week=${weeks[$(($SGE_TASK_ID-1))]} +python logistic_krige.py -i 5000 -s $week -p ~/GIS/gadm36_GBR.gpkg data/vomdog_17feb.csv + From 398d70de96e8fe36c138f54d28123343c04f18e4 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Mon, 24 Feb 2020 20:02:18 +0000 Subject: [PATCH 09/15] Implemented alpha channel in raster. Now two-layered. 
--- logistic_krige.py | 13 ++++++++++++- savsnet/gis_util.py | 12 ++++++------ 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/logistic_krige.py b/logistic_krige.py index 8429d09..8dd974b 100644 --- a/logistic_krige.py +++ b/logistic_krige.py @@ -11,6 +11,16 @@ from savsnet.logistic2D import logistic2D from savsnet.gis_util import gen_raster, make_masked_raster, raster2coords + +def get_mean(x): + return np.mean(x, axis=0) + + +def get_exceed(x): + p = np.sum(x > 0., axis=0)/x.shape[0] + return (p < 0.05) | (p > 0.95) + + if __name__ == '__main__': import argparse import pickle as pkl @@ -53,5 +63,6 @@ with open(f"posterior_week{start}.pkl", 'wb') as f: pkl.dump(result, f) - gen_raster(result['posterior']['s_star'], poly, filename=f"raster_{start}.tiff") + gen_raster(result['posterior']['s_star'], poly, filename=f"raster_{start}.tiff", + summary=[['mean', get_mean], ['exceed', get_exceed]]) sys.exit(0) diff --git a/savsnet/gis_util.py b/savsnet/gis_util.py index ddc933c..34c9987 100644 --- a/savsnet/gis_util.py +++ b/savsnet/gis_util.py @@ -54,16 +54,16 @@ def fill_raster(data, raster, band=1): def gen_raster(posterior, poly, filename=None, - summary=lambda x: np.mean(x, axis=0)): + summary=[['s_star', lambda x: np.mean(x, axis=0)]]): raster = make_masked_raster(polygon=poly, resolution=5000., - bands=1, filename=filename) + bands=len(summary), filename=filename) - fill_raster(summary(posterior), - raster, 1) + for i, (name, f) in enumerate(summary): + fill_raster(f(posterior), + raster, i) + raster.update_tags(i, surface=name) - raster.update_tags(1, surface='s_star') raster.close() - return raster From 48920fa0727f6c3da93fffb852451797e7914324 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Tue, 25 Feb 2020 09:00:33 +0000 Subject: [PATCH 10/15] Minor changes --- savsnet/gis_util.py | 8 ++++---- savsnet/logistic2D.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/savsnet/gis_util.py b/savsnet/gis_util.py index 34c9987..1107938 
100644 --- a/savsnet/gis_util.py +++ b/savsnet/gis_util.py @@ -14,7 +14,7 @@ def make_masked_raster(polygon, resolution, bands=1, all_touched=False, x = np.arange(polygon.bounds['minx'].values, polygon.bounds['maxx'].values, resolution) y = np.arange(polygon.bounds['miny'].values, polygon.bounds['maxy'].values, resolution) X,Y = np.meshgrid(x,y) - Z = np.ones(shape=X.shape) + Z = np.full(X.shape, -9999.0) transform = from_origin(x[0] - resolution/2, y[-1]+resolution/2, resolution, resolution) raster_args = {'driver': 'GTiff', @@ -47,7 +47,7 @@ def raster2coords(raster): def fill_raster(data, raster, band=1): - r = raster.read(1, masked=True).T.flatten() + r = raster.read(band, masked=True).T.flatten() r[~r.mask.flatten()] = data r = r.reshape([raster.shape[1], raster.shape[0]]).T raster.write(r, band) @@ -60,8 +60,8 @@ def gen_raster(posterior, poly, filename=None, for i, (name, f) in enumerate(summary): fill_raster(f(posterior), - raster, i) - raster.update_tags(i, surface=name) + raster, i+1) + raster.update_tags(i+1, surface=name) raster.close() return raster diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py index 29cb9eb..cfba4e3 100644 --- a/savsnet/logistic2D.py +++ b/savsnet/logistic2D.py @@ -45,11 +45,11 @@ def logistic2D(y, coords, knots, pred_coords): spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi) spatial_gp = pm.gp.Latent(cov_func=spatial_cov) s = spatial_gp.prior('s', X=knots) + s_star_ = pm.Deterministic('s_star', project(s, knots, pred_coords, spatial_cov)) eta = alpha + project(s, knots, coords, spatial_cov) y_rv = pm.Bernoulli('y', p=pm.invlogit(eta), observed=y) - s_star_ = pm.Deterministic('s_star', project(s, knots, pred_coords, spatial_cov)) def sample_fn(*args, **kwargs): with model: From a2d10ab2722ae525b96406284dbee1194a7fdf98 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Tue, 25 Feb 2020 16:05:11 +0000 Subject: [PATCH 11/15] Fixed raster plotting issue. 
--- savsnet/gis_util.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/savsnet/gis_util.py b/savsnet/gis_util.py index 1107938..610f046 100644 --- a/savsnet/gis_util.py +++ b/savsnet/gis_util.py @@ -14,7 +14,7 @@ def make_masked_raster(polygon, resolution, bands=1, all_touched=False, x = np.arange(polygon.bounds['minx'].values, polygon.bounds['maxx'].values, resolution) y = np.arange(polygon.bounds['miny'].values, polygon.bounds['maxy'].values, resolution) X,Y = np.meshgrid(x,y) - Z = np.full(X.shape, -9999.0) + Z = np.full(X.shape, 1.) transform = from_origin(x[0] - resolution/2, y[-1]+resolution/2, resolution, resolution) raster_args = {'driver': 'GTiff', @@ -22,7 +22,7 @@ def make_masked_raster(polygon, resolution, bands=1, all_touched=False, 'width': Z.shape[1], 'count': bands, 'dtype': Z.dtype, - 'crs': crs, + 'crs': polygon.crs, 'transform': transform, 'nodata': nodata} @@ -31,24 +31,26 @@ def make_masked_raster(polygon, resolution, bands=1, all_touched=False, raster = memfile.open(**raster_args) else: raster = rasterio.open(filename, 'w+', **raster_args) - + for i in range(bands): raster.write(Z, i+1) - mask = rasterio.mask.mask(raster, polygon.geometry, crop=False, all_touched=all_touched) - raster.write_mask(mask[0][0]) + mask = rasterio.mask.mask(raster, polygon.geometry, crop=True, all_touched=all_touched, filled=True) + for i in range(bands): + raster.write(mask[0][i], i+1) + return raster def raster2coords(raster): x, y = np.meshgrid(np.arange(raster.shape[0]), np.arange(raster.shape[1])) coords = np.array([raster.xy(x, y) for x, y in zip(x.ravel(), y.ravel())]) - return coords[raster.read_masks(1).T.ravel().astype(np.bool), :] + return coords[raster.read(1).T.ravel()!=raster.nodata, :] def fill_raster(data, raster, band=1): r = raster.read(band, masked=True).T.flatten() - r[~r.mask.flatten()] = data + r[r!=raster.nodata] = data r = r.reshape([raster.shape[1], raster.shape[0]]).T raster.write(r, band) From 
9c14852cd38a36d691cb047441d2939352a95ff4 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Tue, 25 Feb 2020 23:08:45 +0000 Subject: [PATCH 12/15] Fixed raster plotting issue. --- logistic_krige.py | 2 +- savsnet/gis_util.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/logistic_krige.py b/logistic_krige.py index 8dd974b..5a97612 100644 --- a/logistic_krige.py +++ b/logistic_krige.py @@ -18,7 +18,7 @@ def get_mean(x): def get_exceed(x): p = np.sum(x > 0., axis=0)/x.shape[0] - return (p < 0.05) | (p > 0.95) + return ((p < 0.05) | (p > 0.95))*255. if __name__ == '__main__': diff --git a/savsnet/gis_util.py b/savsnet/gis_util.py index 610f046..5696677 100644 --- a/savsnet/gis_util.py +++ b/savsnet/gis_util.py @@ -7,7 +7,7 @@ from rasterio.plot import show import matplotlib.pyplot as plt -def make_masked_raster(polygon, resolution, bands=1, all_touched=False, +def make_masked_raster(polygon, resolution, bands=1, all_touched=False, nodata=-9999., crs='+init=epsg:27700', filename=None): """Generates a raster with points outside poly set to nodata. Pixels are set to be of dimension res x res""" @@ -49,7 +49,7 @@ def raster2coords(raster): def fill_raster(data, raster, band=1): - r = raster.read(band, masked=True).T.flatten() + r = raster.read(band).T.flatten() r[r!=raster.nodata] = data r = r.reshape([raster.shape[1], raster.shape[0]]).T raster.write(r, band) From cf23aef81a7e46d36b4b575986895b275b761c83 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Thu, 21 May 2020 09:34:56 +0100 Subject: [PATCH 13/15] Removed system-specific sge file --- vomiting.sge | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 vomiting.sge diff --git a/vomiting.sge b/vomiting.sge deleted file mode 100644 index 87234a0..0000000 --- a/vomiting.sge +++ /dev/null @@ -1,18 +0,0 @@ -#!/bin/bash - -#$ -j y -#$ -cwd -#$ -N savs-dogvom -#$ -l gpu=1 -#$ -pe smp 8 -#$ -S /bin/bash - -. 
/etc/profile - -conda activate savsnet - -weeks=("2019-12-02" "2019-12-09" "2019-12-16" "2019-12-23" "2019-12-30" "2020-01-6" "2020-01-13" "2020-01-20" "2020-01-27" "2020-01-3" "2020-01-10") - -week=${weeks[$(($SGE_TASK_ID-1))]} -python logistic_krige.py -i 5000 -s $week -p ~/GIS/gadm36_GBR.gpkg data/vomdog_17feb.csv - From 627e5d9d14dab6d296cd1f1dc68b54aeb7545511 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Thu, 21 May 2020 10:06:55 +0100 Subject: [PATCH 14/15] Updated docstring --- savsnet/logistic2D.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/savsnet/logistic2D.py b/savsnet/logistic2D.py index cfba4e3..62d62f3 100644 --- a/savsnet/logistic2D.py +++ b/savsnet/logistic2D.py @@ -28,6 +28,14 @@ def __len__(self): def project(x_val, x_coords, x_star_coords, cov_fn): + """Projects a Gaussian process defined by `x_val` at `x_coords` + onto a set of coordinates `x_star_coords`. + :param x_val: values of the GP at `x_coords` + :param x_coords: a set of coordinates for each `x_val` + :param x_star_coords: a set of coordinates onto which to project the GP + :param cov_fn: a covariance function returning a covariance matrix given a set of coordinates + :returns: a vector of projected values at `x_star_coords` + """ kxx = cov_fn(x_coords) kxxtx = matrix_inverse(stabilize(kxx)) kxxs = tt.dot(kxxtx, x_val) @@ -36,6 +44,37 @@ def project(x_val, x_coords, x_star_coords, cov_fn): def logistic2D(y, coords, knots, pred_coords): + """Returns an instance of a logistic geostatistical model with + Matern32 covariance. + + Let $y_i, i=1,\dots,n$ be a set of binary observations at locations $x_i, i=1,\dots,n$. + We model $y_i$ as + $$ + y_i \sim Bernoulli(p_i) + $$ + with + $$ + \mbox{logit}(p_i) = \alpha + S(x_i). 
+ $$ + + $S(x_i)$ is a latent Gaussian process defined as + $$ + S(\bm{x}) \sim \mbox{MultivariateNormal}(\bm{0}, \Sigma^2) + $$ + where + $$ + \Sigma_{ij}^2 = \sigma^2 \left(1 + \frac{\sqrt{3(x - x')^2}}{\ell}\right) + \mathrm{exp}\left[ - \frac{\sqrt{3(x - x')^2}}{\ell} \right] + $$ + + The model evaluates $S(x_i)$ approximately using a set of inducing points $x^\star_i, i=1,\dots,m$ for $m$ auxiliary locations. See [Banerjee \emph{et al.} (2009)](https://dx.doi.org/10.1111%2Fj.1467-9868.2008.00663.x) for further details. + + :param y: a vector of binary outcomes {0, 1} + :param coords: a matrix of coordinates of `y` of shape `[n, d]` for `d`-dimensions and `n` observations. + :param knots: a matrix of inducing point coordinates of shape `[m, d]`. + :param pred_coords: a matrix of coordinates at which predictions are required. + :returns: a dictionary containing the PyMC3 `model`, and the `posterior` PyMC3 `Multitrace` object. + """ model = pm.Model() with model: alpha = pm.Normal('alpha', mu=0., sd=1.) From a82be3ed0cc2530c2474b819fc5bb2fa40491504 Mon Sep 17 00:00:00 2001 From: Chris Jewell Date: Thu, 21 May 2020 10:13:28 +0100 Subject: [PATCH 15/15] Update README.md --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 8b515c9..7239917 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,22 @@ $ python savsnet/plot_ts_GP.py --help for further details. +Spatial mapping of cases +------------------------ + +The `savsnet/logistic2D` module contains functions for spatial smoothing of case occurrence at point locations, using an inducing point approximation to a logistic geostatistical model with stationary mean and Matérn ($k=3/2$) covariance function.
This is called by the `logistic_krige.py` Python script such as: +```bash +$ python logistic_krige.py -i 5000 -s '2020-03-04' -p gadm36_GBR.gpkg myData.csv +``` +where `myData.csv` is a CSV file containing at least the headings `consult_date` (ISO date format), `person_easting` and `person_northing` (in rectangular coordinates), and `case` (1 or 0 denoting positive or negative for a given condition). The script runs the logistic geostatistical model, writes the posterior to a Python pickle file and the posterior mean to a GeoTIFF file. + +See +```bash +$ python logistic_krige.py --help +``` +for information on further arguments. + + License -------