reimandlab
diff --git a/‎DESCRIPTION
+1-1 b/‎DESCRIPTION
+1-1
diff --git a/‎R/Rmimp.R
+8-4 b/‎R/Rmimp.R
+8-4
diff --git a/‎R/pwm-functions.r
+25-9 b/‎R/pwm-functions.r
+25-9
diff --git a/‎build/rmimp_1.2.tar.gz
574 Bytes b/‎build/rmimp_1.2.tar.gz
574 Bytes
diff --git a/‎build/rmimp_manual.pdf
626 Bytes b/‎build/rmimp_manual.pdf
626 Bytes
diff --git a/‎man/mss.Rd
+6-1 b/‎man/mss.Rd
+6-1
diff --git a/‎man/trainModel.Rd
+5-1 b/‎man/trainModel.Rd
+5-1
@@ -3,7 +3,7 @@ Type: Package
 Title: Predicting the impact of mutations on kinase-substrate
     phosphorylation
 Version: 1.2
-Date: 2018-02-19
+Date: 2018-02-20
 Author: Omar Wagih
 Maintainer: Omar Wagih <[email protected]>
 Description: MIMP is a machine learning method that predicts the impact of
 
@@ -124,6 +124,9 @@ unfactor <- function(df){
 #' @param threshold (optional) the minimum number of scores needed for each domain to train the model
 #' @param min.auc (optional) the minimum number of AUC needed for each domain to train the model
 #' @param priors Named character vector containing priors of amino acids.
+#' @param residues_groups a vector of regular expressions used to group kinases by the central residue they target;
+#' if a sequence does not have a central residue matching the group chosen from residues_groups by the algorithm
+#' (based on the PWM), the sequence will be discarded.
 #' 
 #' @return a GMM model
 #' 
@@ -132,7 +135,8 @@ unfactor <- function(df){
 #' 
 #' @export
 trainModel <- function(pos.dir, neg.dir, kinase.domain = F,
-                       cores = 2, file = NULL, threshold = 10, min.auc = 0.65, priors){
+                       cores = 2, file = NULL, threshold = 10, min.auc = 0.65, priors,
+                       residues_groups = c('S|T', 'Y')){
   # Get a list of files from pos.dir and neg.dir
   # and check if the corresponding files exists
   fileNames <- intersect(list.files(path = c(pos.dir), full.names = F), list.files(path = c(neg.dir), full.names = F))
@@ -189,11 +193,11 @@ trainModel <- function(pos.dir, neg.dir, kinase.domain = F,
     pwm <- PWM(pos.seqs, is.kinase.pwm = F, priors = priors, do.pseudocounts = T)
 
     # Score the positive and negative seqs with the PWM
-    pos.scores <- unlist(mss(pos.seqs, pwm, kinase.domain = kinase.domain))
-    neg.scores <- unlist(mss(neg.seqs, pwm, kinase.domain = kinase.domain))
+    pos.scores <- unlist(mss(pos.seqs, pwm, kinase.domain = kinase.domain, residues_groups = residues_groups))
+    neg.scores <- unlist(mss(neg.seqs, pwm, kinase.domain = kinase.domain, residues_groups = residues_groups))
 
     # Check if both pos and neg have more scores than THRESHOLD
-    if (any(c(nrow(pos.scores), nrow(neg.scores)) < threshold)) {
+    if (any(c(nrow(pos.scores), nrow(neg.scores)) < threshold) || is.null(pos.scores) || is.null(neg.scores)) {
       warning(binding.site, " skipped as at least ", threshold, " scores required for trainning.")
       return(NULL)
     }
 
@@ -249,28 +249,44 @@ scoreArrayRolling <- function(seqs, pwm){
 #' @param na_rm Remove NA scores?
 #' @param ignore_cent If TRUE, central residue is ignore from scoring.
 #' @param kinase.domain Whether the domain to be trained is a kinase domain.
+#' @param residues_groups a vector of regular expressions used to group kinases by the central residue they target;
+#' if a sequence does not have a central residue matching the group chosen from residues_groups by the algorithm
+#' (based on the PWM), the sequence will be discarded.
 #'  
 #' @keywords pwm mss match tfbs
 #' 
 #' @keywords internal
 #' @examples
 #' # No Examples
-mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T){
+mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T, residues_groups = c('S|T', 'Y')){
   # If not kinase domain, use non-central MSS instead
   if (!kinase.domain) {
     return(.mssNonCentral(seqs, pwm))
   }
 
-  cent_ind = ceiling(ncol(pwm)/2)
+  # Central residue index
+  central_index = ceiling(ncol(pwm)/2)
+  
   # Only score sequences which have a central residue S/T or Y depending on the PWM
-  kinase_type = names(which.max(pwm[,cent_ind]))
-  kinase_type = ifelse(grepl('S|T', kinase_type), 'S|T', 'Y')
+  
+  # Take the amino acid with the heaviest weight at the central position
+  heaviest_central_residue = names(which.max(pwm[,central_index]))
+  
+  # Kinases are considered to be of the same type
+  # if they modify the same group of residues.
+
+  # Knowing the most frequently modified residue (from PWM),
+  # choose the kinase type (=residues group) that matches the residue.
+  for (residue_group in residues_groups) {
+    if (grepl(residue_group, heaviest_central_residue)) {
+      kinase_type = residue_group
+    }
+  }
+  
   central_res = kinase_type
 
-  # Central residue index
-  central_ind = NA
-  if(ignore_cent)
-    central_ind = ceiling(ncol(pwm)/2)
+  if (!ignore_cent)
+    central_index = NA
 
   # Info content
   ic = attr(pwm, 'match.ic')
@@ -283,7 +299,7 @@ mss <- function(seqs, pwm, na_rm=F, ignore_cent=T, kinase.domain = T){
                                ignore_cent = ignore_cent)
 
   # Get array of scores
-  keep_scores = grepl(central_res, substr(seqs, central_ind, central_ind))
+  keep_scores = grepl(central_res, substr(seqs, central_index, central_index))
 
   # Score only ones we're keeping
   scores = rep(NA, length(seqs))