Skip to content

Commit d6cb289

Browse files
committed
Allow use of residues other than STY for model training
1 parent ea3fa57 commit d6cb289

7 files changed

+45
-16
lines changed

DESCRIPTION

+1-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ Type: Package
33
Title: Predicting the impact of mutations on kinase-substrate
44
phosphorylation
55
Version: 1.2
6-
Date: 2018-02-19
6+
Date: 2018-02-20
77
Author: Omar Wagih
88
Maintainer: Omar Wagih <[email protected]>
99
Description: MIMP is a machine learning method that predicts the impact of

R/Rmimp.R

+8-4
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,9 @@ unfactor <- function(df){
124124
#' @param threshold (optional) the minimum number of scores needed for each domain to train the model
125125
#' @param min.auc (optional) the minimum AUC needed for each domain to train the model
126126
#' @param priors Named character vector containing priors of amino acids.
127+
#' @param residues_groups a vector of regular expressions used to group kinases by central residue they target;
128+
#' if a sequence does not have a central residue matching the group chosen from residues_groups by the algorithm
129+
#' (based on PWM), the sequence will be discarded.
127130
#'
128131
#' @return a GMM model
129132
#'
@@ -132,7 +135,8 @@ unfactor <- function(df){
132135
#'
133136
#' @export
134137
trainModel <- function(pos.dir, neg.dir, kinase.domain = F,
135-
cores = 2, file = NULL, threshold = 10, min.auc = 0.65, priors){
138+
cores = 2, file = NULL, threshold = 10, min.auc = 0.65, priors,
139+
residues_groups = c('S|T', 'Y')){
136140
# Get a list of files from pos.dir and neg.dir
137141
# and check if the corresponding files exists
138142
fileNames <- intersect(list.files(path = c(pos.dir), full.names = F), list.files(path = c(neg.dir), full.names = F))
@@ -189,11 +193,11 @@ trainModel <- function(pos.dir, neg.dir, kinase.domain = F,
189193
pwm <- PWM(pos.seqs, is.kinase.pwm = F, priors = priors, do.pseudocounts = T)
190194

191195
# Score the positive and negative seqs with the PWM
192-
pos.scores <- unlist(mss(pos.seqs, pwm, kinase.domain = kinase.domain))
193-
neg.scores <- unlist(mss(neg.seqs, pwm, kinase.domain = kinase.domain))
196+
pos.scores <- unlist(mss(pos.seqs, pwm, kinase.domain = kinase.domain, residues_groups = residues_groups))
197+
neg.scores <- unlist(mss(neg.seqs, pwm, kinase.domain = kinase.domain, residues_groups = residues_groups))
194198

195199
# Check if both pos and neg have more scores than THRESHOLD
196-
if (any(c(nrow(pos.scores), nrow(neg.scores)) < threshold)) {
200+
if (any(c(nrow(pos.scores), nrow(neg.scores)) < threshold) || is.null(pos.scores) || is.null(neg.scores)) {
197201
warning(binding.site, " skipped as at least ", threshold, " scores required for trainning.")
198202
return(NULL)
199203
}

R/pwm-functions.r

+25-9
Original file line numberDiff line numberDiff line change
@@ -249,28 +249,44 @@ scoreArrayRolling <- function(seqs, pwm){
249249
#' @param na_rm Remove NA scores?
250250
#' @param ignore_cent If TRUE, the central residue is ignored in scoring.
251251
#' @param kinase.domain Whether the domain to be trained is a kinase domain.
252+
#' @param residues_groups a vector of regular expressions used to group kinases by central residue they target;
253+
#' if a sequence does not have a central residue matching the group chosen from residues_groups by the algorithm
254+
#' (based on PWM), the sequence will be discarded.
252255
#'
253256
#' @keywords pwm mss match tfbs
254257
#'
255258
#' @keywords internal
256259
#' @examples
257260
#' # No Examples
258-
mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T){
261+
mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T, residues_groups = c('S|T', 'Y')){
259262
# If not kinase domain, use non-central MSS instead
260263
if (!kinase.domain) {
261264
return(.mssNonCentral(seqs, pwm))
262265
}
263266

264-
cent_ind = ceiling(ncol(pwm)/2)
267+
# Central residue index
268+
central_index = ceiling(ncol(pwm)/2)
269+
265270
# Only score sequences which have a central residue S/T or Y depending on the PWM
266-
kinase_type = names(which.max(pwm[,cent_ind]))
267-
kinase_type = ifelse(grepl('S|T', kinase_type), 'S|T', 'Y')
271+
272+
# Take the aminoacid with the heaviest weight at the central position
273+
heaviest_central_residue = names(which.max(pwm[,central_index]))
274+
275+
# Kinases are considered to be of the same type
276+
# if they modify the same group of residues.
277+
278+
# Knowing the most frequently modified residue (from PWM),
279+
# choose the kinase type (=residues group) that matches the residue.
280+
for (residue_group in residues_groups) {
281+
if (grepl(residue_group, heaviest_central_residue)) {
282+
kinase_type = residue_group
283+
}
284+
}
285+
268286
central_res = kinase_type
269287

270-
# Central residue index
271-
central_ind = NA
272-
if(ignore_cent)
273-
central_ind = ceiling(ncol(pwm)/2)
288+
if (!ignore_cent)
289+
central_index = NA
274290

275291
# Info content
276292
ic = attr(pwm, 'match.ic')
@@ -283,7 +299,7 @@ mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T){
283299
ignore_cent = ignore_cent)
284300

285301
# Get array of scores
286-
keep_scores = grepl(central_res, substr(seqs, central_ind, central_ind))
302+
keep_scores = grepl(central_res, substr(seqs, central_index, central_index))
287303

288304
# Score only ones we're keeping
289305
scores = rep(NA, length(seqs))

build/rmimp_1.2.tar.gz

574 Bytes
Binary file not shown.

build/rmimp_manual.pdf

626 Bytes
Binary file not shown.

man/mss.Rd

+6-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/trainModel.Rd

+5-1
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)