Skip to content

Commit

Permalink
fix unique psms ids issue
Browse files Browse the repository at this point in the history
hot-fix for non-unique input files
  • Loading branch information
gieses committed Mar 2, 2021
1 parent c75b841 commit c4d073c
Show file tree
Hide file tree
Showing 3 changed files with 6 additions and 2 deletions.
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FDR,Fasta1,Fasta2,LinkPos1,LinkPos2,PSMID,Peptide1,Peptide2,Run,fdrGroup,isTT,score,rp,scx,hsax,isTD,isDD
fdr,Fasta1,Fasta2,LinkPos1,LinkPos2,PSMID,Peptide1,Peptide2,Run,fdrGroup,isTT,score,rp,scx,hsax,isTD,isDD
0.0,SUCC_ECOLI Succinate--CoA ligase [ADP-forming] subunit beta OS=Escherichia coli (strain K12) OX=83333 GN=sucC PE=1 SV=1,SUCD_ECOLI Succinate--CoA ligase [ADP-forming] subunit alpha OS=Escherichia coli (strain K12) OX=83333 GN=sucD PE=1 SV=2,11,12,333321,VAEETPHLIHKVALDPLTGPMPYQGR,MGHAGAIIAGGKGTADEK,recal_B181130_09_HF_FW_IN_130_ECLP_DSS01_SCX22_hSAX06_rep1.mgf,Between ,True,27.165,2905.68822,22,6,False,False
0.0,SUCC_ECOLI Succinate--CoA ligase [ADP-forming] subunit beta OS=Escherichia coli (strain K12) OX=83333 GN=sucC PE=1 SV=1,SUCD_ECOLI Succinate--CoA ligase [ADP-forming] subunit alpha OS=Escherichia coli (strain K12) OX=83333 GN=sucD PE=1 SV=2,11,12,1414170,VAEETPHLIHKVALDPLTGPMPYQGR,MoxGHAGAIIAGGKGTADEK,recal_B181203_16_HF_FW_IN_130_ECLP_DSS01_SCX23_hSAX06_rep2.mgf,Between ,True,26.019,2833.43868,23,6,False,False
0.0,RS6_ECOLI 30S ribosomal protein S6 OS=Escherichia coli (strain K12) OX=83333 GN=rpsF PE=1 SV=1,RS18_ECOLI 30S ribosomal protein S18 OS=Escherichia coli (strain K12) OX=83333 GN=rpsR PE=1 SV=2,11,6,1625137,HAVTEASPMVKAK,DIATLKNYITESGK,recal_B181123_21_HF_FW_IN_130_ECLP_DSS01_SCX21_hSAX05_rep2.mgf,Between ,True,24.917,3094.394160000001,21,5,False,False
Expand Down
2 changes: 1 addition & 1 deletion xirt/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@
# They must be carefully adapted for each prediction task.
# sigmoid is recommended for fraction tasks (SCX/hSAX) when the ordinal regression method is used
hsax-activation: sigmoid
# columne where the fraction RT is in the CSV input
# column where the fraction RT is in the CSV input (e.g. "hsax_ordinal")
hsax-column: hsax_ordinal
# the number of unique / distinct values (e.g. fractions)
hsax-dimension: 10
Expand Down
4 changes: 4 additions & 0 deletions xirt/predictor.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,10 @@ def preprocess(matches_df, sequence_type="crosslink", max_length=-1, cl_residue=
# sort to keep only highest scoring peptide from duplicated entries
matches_df = matches_df.sort_values(by="score", ascending=False)

if len(matches_df["PSMID"].unique()) != matches_df.shape[0]:
logger.warning("PSMID column was not unique! Redundant PSMIDs were removed")
matches_df = matches_df.drop_duplicates("PSMID")

logger.info("Reordering peptide sequences. (mode: {})".format(sequence_type))

# generate columns to handle based on input data type
Expand Down

0 comments on commit c4d073c

Please sign in to comment.