
Code including spatial analysis for prolific vomiting paper #1


Open · wants to merge 16 commits into base: master
16 changes: 16 additions & 0 deletions README.md
@@ -35,6 +35,22 @@ $ python savsnet/plot_ts_GP.py --help
for further details.


Spatial mapping of cases
------------------------

The `savsnet/logistic2D` module provides functions for spatial smoothing of case occurrence at point locations, using an inducing point approximation to a logistic geostatistical model with a stationary mean and Matérn ($k=3/2$) covariance function. It is driven by the `logistic_krige.py` script, for example:
```bash
$ python logistic_krige.py -i 5000 -s '2020-03-04' -p gadm36_GBR.gpkg myData.csv
```
where `myData.csv` is a CSV file containing at least the columns `consult_date` (ISO date format), `person_easting` and `person_northing` (rectangular coordinates in metres), and a binary outcome column (currently `gi_mpc_selected` in the script; 1 or 0 denoting positive or negative for the condition of interest). The script fits the logistic geostatistical model, writes the posterior samples to a Python pickle file, and writes the posterior mean and exceedance surfaces to a GeoTIFF raster.
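
Once the script has run, the outputs can be inspected from Python. A minimal sketch, assuming the example start date above; the filename patterns (`posterior_week<date>.pkl`, `raster_<date>.tiff`) and the band order (mean, then exceedance) follow what `logistic_krige.py` currently writes:
```python
import pickle
import numpy as np
import rasterio

# Posterior samples saved by logistic_krige.py (unpickling requires pymc3)
with open("posterior_week2020-03-04.pkl", "rb") as f:
    result = pickle.load(f)
s_star = result["posterior"]["s_star"]  # shape: (n_samples, n_prediction_points)

# Summary raster: band 1 = posterior mean, band 2 = exceedance indicator
with rasterio.open("raster_2020-03-04.tiff") as src:
    mean_surface = np.ma.masked_equal(src.read(1), src.nodata)
    exceed = np.ma.masked_equal(src.read(2), src.nodata)
print(mean_surface.mean(), (exceed == 255).sum())
```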

See
```bash
$ python logistic_krige.py --help
```
for information on further arguments.
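
The model can also be driven directly from Python rather than via the command-line script. A minimal sketch, assuming a CSV with one row per location and a binary `case` column as described above; the 300 inducing points and the kilometre rescaling mirror the pattern used in `logistic_krige.py`, and the prediction points are purely illustrative:
```python
import numpy as np
import pandas as pd
import pymc3 as pm

from savsnet.logistic2D import logistic2D

df = pd.read_csv("myData.csv")
y = np.array(df["case"])
coords = np.array(df[["person_easting", "person_northing"]], dtype="float32")

# Inducing points chosen by k-means, as in the script
knots = pm.gp.util.kmeans_inducing_points(300, coords)

# Two illustrative prediction locations (eastings/northings in metres); the
# script instead derives these by rasterising a boundary polygon.
pred_coords = np.array([[300000.0, 400000.0], [350000.0, 450000.0]])

# Coordinates are rescaled to kilometres before fitting
sampler = logistic2D(y, coords=coords / 1000.0, knots=knots / 1000.0,
                     pred_coords=pred_coords / 1000.0)
result = sampler(5000, chains=1)  # {'model': ..., 'posterior': MultiTrace}
s_star_mean = result["posterior"]["s_star"].mean(axis=0)
```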


License
-------

68 changes: 68 additions & 0 deletions logistic_krige.py
@@ -0,0 +1,68 @@
""" Performs logistic Kriging """

import sys
import argparse
import pickle as pkl
import numpy as np
import pandas as pd
import geopandas as gp
import pymc3 as pm

from savsnet.logistic2D import logistic2D
from savsnet.gis_util import gen_raster, make_masked_raster, raster2coords


def get_mean(x):
    """Posterior mean over MCMC samples."""
    return np.mean(x, axis=0)


def get_exceed(x):
    """Exceedance indicator: 255 where the posterior probability that the
    surface exceeds zero is below 0.05 or above 0.95, 0 otherwise."""
    p = np.sum(x > 0., axis=0) / x.shape[0]
    return ((p < 0.05) | (p > 0.95)) * 255.


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Fit logistic kriging model to case data')
    parser.add_argument("data", nargs=1, type=str,
                        help="Input data file with (at least) columns consult_date, gi_mpc_selected, person_easting, person_northing")
    parser.add_argument("--iterations", "-i", type=int, default=[5000], nargs=1,
                        help="Number of MCMC iterations")
    parser.add_argument("--startdate", "-s", type=np.datetime64, nargs=1, help="Start date")
    parser.add_argument("--period", "-d", type=int, default=[7], nargs=1, help="Period in days")
    parser.add_argument("--polygon", "-p", type=str, nargs=1, help="Polygon file")
    args = parser.parse_args()

    data = pd.read_csv(args.data[0])
    data['consult_date'] = pd.to_datetime(data['consult_date'])
    period = pd.to_timedelta(args.period[0], unit='day')

    start = args.startdate[0]
    end = start + period

    # Restrict to consultations within the requested time window
    d = data[(start <= data['consult_date']) & (data['consult_date'] < end)]

    # Aggregate to one binary 'case' outcome per location
    d = d.groupby([d['person_easting'], d['person_northing']])
    aggr = d['gi_mpc_selected'].agg([['case', lambda x: np.sum(x) > 0]])
    aggr.reset_index(inplace=True)

    y = np.array(aggr['case'])
    coords = np.array(aggr[['person_easting', 'person_northing']], dtype='float32')
    knots = pm.gp.util.kmeans_inducing_points(300, coords)

    # GIS: rasterise the polygon to obtain prediction coordinates
    poly = gp.read_file(args.polygon[0]).to_crs(epsg=27700)
    r = make_masked_raster(poly, 5000., crs='+init=epsg:27700')
    pred_points = raster2coords(r)

    # Fit the model (coordinates rescaled to kilometres) and save the outputs
    sampler = logistic2D(y, coords=coords / 1000., knots=knots / 1000., pred_coords=pred_points / 1000.)
    result = sampler(args.iterations[0], chains=1)
    with open(f"posterior_week{start}.pkl", 'wb') as f:
        pkl.dump(result, f)

    gen_raster(result['posterior']['s_star'], poly, filename=f"raster_{start}.tiff",
               summary=[['mean', get_mean], ['exceed', get_exceed]])
    sys.exit(0)
7 changes: 7 additions & 0 deletions requirements.txt
@@ -0,0 +1,7 @@
numpy
matplotlib
rasterio
geopandas
pandas
theano
pymc3
82 changes: 82 additions & 0 deletions savsnet/gis_util.py
@@ -0,0 +1,82 @@
# Raster generator
import numpy as np
import rasterio
import rasterio.mask
from rasterio.io import MemoryFile
from rasterio.transform import from_origin
from rasterio.plot import show
import matplotlib.pyplot as plt

def make_masked_raster(polygon, resolution, bands=1, all_touched=False,
                       nodata=-9999., crs='+init=epsg:27700', filename=None):
    """Generates a raster with points outside `polygon` set to `nodata`.
    Pixels are of dimension `resolution` x `resolution`."""
    x = np.arange(polygon.bounds['minx'].values, polygon.bounds['maxx'].values, resolution)
    y = np.arange(polygon.bounds['miny'].values, polygon.bounds['maxy'].values, resolution)
    X, Y = np.meshgrid(x, y)
    Z = np.full(X.shape, 1.)
    transform = from_origin(x[0] - resolution / 2, y[-1] + resolution / 2, resolution, resolution)

    raster_args = {'driver': 'GTiff',
                   'height': Z.shape[0],
                   'width': Z.shape[1],
                   'count': bands,
                   'dtype': Z.dtype,
                   'crs': polygon.crs,
                   'transform': transform,
                   'nodata': nodata}

    # Write to an in-memory raster unless a filename is supplied
    if filename is None:
        memfile = MemoryFile()
        raster = memfile.open(**raster_args)
    else:
        raster = rasterio.open(filename, 'w+', **raster_args)

    for i in range(bands):
        raster.write(Z, i + 1)

    # Mask out pixels falling outside the polygon
    mask = rasterio.mask.mask(raster, polygon.geometry, crop=True, all_touched=all_touched, filled=True)
    for i in range(bands):
        raster.write(mask[0][i], i + 1)

    return raster


def raster2coords(raster):
    """Returns the x, y coordinates of all non-nodata pixels in `raster`."""
    x, y = np.meshgrid(np.arange(raster.shape[0]), np.arange(raster.shape[1]))
    coords = np.array([raster.xy(i, j) for i, j in zip(x.ravel(), y.ravel())])
    return coords[raster.read(1).T.ravel() != raster.nodata, :]


def fill_raster(data, raster, band=1):
    """Writes `data` into the non-nodata pixels of the given band of `raster`."""
    r = raster.read(band).T.flatten()
    r[r != raster.nodata] = data
    r = r.reshape([raster.shape[1], raster.shape[0]]).T
    raster.write(r, band)


def gen_raster(posterior, poly, filename=None,
               summary=[['s_star', lambda x: np.mean(x, axis=0)]]):
    """Writes one raster band per (name, function) pair in `summary`,
    each function applied to `posterior`."""
    raster = make_masked_raster(polygon=poly, resolution=5000.,
                                bands=len(summary), filename=filename)

    for i, (name, f) in enumerate(summary):
        fill_raster(f(posterior), raster, i + 1)
        raster.update_tags(i + 1, surface=name)

    raster.close()
    return raster


def plot_raster(ax, raster, transform, title=None, z_range=[None, None], log=False, alpha=1.0):
    """Plots `raster` on `ax` with a colourbar; colourbar tick labels are
    exponentiated if `log` is True."""
    show(raster, transform=transform, vmin=z_range[0], vmax=z_range[1], ax=ax, cmap='coolwarm', alpha=alpha)
    ax.set_title(title)
    ax.axis('off')
    cb = plt.colorbar(ax.images[1], ax=ax, shrink=0.7)
    if log is True:
        ticks = cb.get_ticks()
        cb.set_ticks(ticks)
        cb.set_ticklabels(np.round(np.exp(ticks), 1))
    return ax, cb
144 changes: 144 additions & 0 deletions savsnet/logistic2D.py
@@ -0,0 +1,144 @@
"""Functions to perform 2D logistic kriging"""

import sys

import geopandas as gp
import numpy as np
import pymc3 as pm
import theano as t
import theano.tensor as tt
from pymc3.gp.util import stabilize
from theano.tensor.nlinalg import matrix_inverse

from savsnet.gis_util import make_masked_raster, raster2coords


class Data:
    """Container for binary observations and their coordinates."""

    def __init__(self, y, coords_obs, coords_knots):
        self.y = np.squeeze(np.array(y))
        self.coords = np.array(coords_obs)
        self.knots = np.array(coords_knots)
        assert y.shape[0] == coords_obs.shape[0]
        assert self.coords.shape[1] == 2
        assert self.knots.shape[1] == 2

    def __len__(self):
        return self.y.shape[0]


def project(x_val, x_coords, x_star_coords, cov_fn):
    """Projects a Gaussian process defined by `x_val` at `x_coords`
    onto a set of coordinates `x_star_coords`.

    :param x_val: values of the GP at `x_coords`
    :param x_coords: a set of coordinates for each `x_val`
    :param x_star_coords: a set of coordinates onto which to project the GP
    :param cov_fn: a covariance function returning a covariance matrix given a set of coordinates
    :returns: a vector of projected values at `x_star_coords`
    """
    kxx = cov_fn(x_coords)
    kxxtx = matrix_inverse(stabilize(kxx))
    kxxs = tt.dot(kxxtx, x_val)
    knew = cov_fn(x_star_coords, x_coords)
    return tt.dot(knew, kxxs)


def logistic2D(y, coords, knots, pred_coords):
    r"""Returns a sampling function for a logistic geostatistical model with
    Matern32 covariance.

    Let $y_i, i=1,\dots,n$ be a set of binary observations at locations $x_i, i=1,\dots,n$.
    We model $y_i$ as
    $$
    y_i \sim \mbox{Bernoulli}(p_i)
    $$
    with
    $$
    \mbox{logit}(p_i) = \alpha + S(x_i).
    $$

    $S(x_i)$ is a latent Gaussian process defined as
    $$
    S(\bm{x}) \sim \mbox{MultivariateNormal}(\bm{0}, \Sigma)
    $$
    where
    $$
    \Sigma_{ij} = \sigma^2 \left(1 + \frac{\sqrt{3}\,\lVert x_i - x_j \rVert}{\ell}\right)
    \exp\left[ - \frac{\sqrt{3}\,\lVert x_i - x_j \rVert}{\ell} \right].
    $$

    The model evaluates $S(x_i)$ approximately using a set of inducing points
    $x^\star_i, i=1,\dots,m$ for $m$ auxiliary locations. See
    [Banerjee et al. (2009)](https://dx.doi.org/10.1111%2Fj.1467-9868.2008.00663.x)
    for further details.

    :param y: a vector of binary outcomes {0, 1}
    :param coords: a matrix of coordinates of `y` of shape `[n, d]` for `d` dimensions and `n` observations.
    :param knots: a matrix of inducing point coordinates of shape `[m, d]`.
    :param pred_coords: a matrix of coordinates at which predictions are required.
    :returns: a sampling function which, when called, returns a dictionary containing the PyMC3 `model` and the `posterior` PyMC3 `MultiTrace` object.
    """
    model = pm.Model()
    with model:
        # Intercept and Matern32 covariance hyperparameters
        alpha = pm.Normal('alpha', mu=0., sd=1.)
        sigma_sq = pm.Gamma('sigma_sq', 1., 1.)
        phi = pm.Gamma('phi', 2., 0.1)

        # Latent GP at the inducing points, projected onto the prediction
        # and observation coordinates
        spatial_cov = sigma_sq * pm.gp.cov.Matern32(2, phi)
        spatial_gp = pm.gp.Latent(cov_func=spatial_cov)
        s = spatial_gp.prior('s', X=knots)
        s_star_ = pm.Deterministic('s_star', project(s, knots, pred_coords, spatial_cov))

        eta = alpha + project(s, knots, coords, spatial_cov)
        y_rv = pm.Bernoulli('y', p=pm.invlogit(eta), observed=y)

    def sample_fn(*args, **kwargs):
        with model:
            trace = pm.sample(*args, **kwargs)
        return {'model': model, 'posterior': trace}

    return sample_fn


if __name__ == '__main__':
    import argparse
    import pickle as pkl
    import pandas as pd

    parser = argparse.ArgumentParser(description='Fit logistic spatial model to case data')
    parser.add_argument("data", nargs=1, type=str,
                        help="Input data file with (at least) columns consult_date, gi_mpc_selected, person_easting, person_northing")
    parser.add_argument("--startdate", "-s", type=np.datetime64, nargs=1, help="Start date")
    parser.add_argument("--period", "-p", type=int, default=[7], nargs=1, help="Period in days")
    parser.add_argument("--iterations", "-i", type=int, default=[5000], nargs=1,
                        help="Number of MCMC iterations")
    args = parser.parse_args()

    data = pd.read_csv(args.data[0])
    data['consult_date'] = pd.to_datetime(data['consult_date'])
    week = pd.to_timedelta(args.period[0], unit='day')

    start = args.startdate[0]
    end = start + week

    # Restrict to the requested time window and aggregate to one binary
    # 'case' outcome per location
    d = data[(start <= data['consult_date']) & (data['consult_date'] < end)]
    d = d.groupby([d['person_easting'], d['person_northing']])
    aggr = d['gi_mpc_selected'].agg([['case', lambda x: np.sum(x) > 0]])
    aggr.reset_index(inplace=True)

    y = np.array(aggr['case'])
    coords = np.array(aggr[['person_easting', 'person_northing']], dtype='float32')
    knots = pm.gp.util.kmeans_inducing_points(300, coords)

    # GIS: rasterise the GB boundary polygon to obtain prediction coordinates
    poly = gp.read_file('/home/jewellcp/Documents/GIS/gadm36_GBR.gpkg').to_crs(epsg=27700)
    r = make_masked_raster(poly, 5000., crs='+init=epsg:27700')
    pred_points = raster2coords(r)

    # Fit the model (coordinates rescaled to kilometres) and save the posterior
    sampler = logistic2D(y, coords=coords / 1000., knots=knots / 1000., pred_coords=pred_points / 1000.)
    result = sampler(args.iterations[0], chains=1)
    with open(f"vom_dog_spatial_week{start}.pkl", 'wb') as f:
        pkl.dump(result, f)

    sys.exit(0)