Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support for SNP queries for Brassica napus; new endpoint for homologous protein queries #261

Merged
merged 9 commits into from
Mar 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions api/models/canola_nssnp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Optional
from api import db
# from sqlalchemy.ext.declarative import declarative_base


# CanolaBase = declarative_base()
class CanolaProteinReference(db.Model):
    """ORM model for the ``protein_reference`` table on the ``canola_nssnp`` bind.

    Each row carries a gene identifier and an optional gene name, and is the
    parent side of the relationship to ``CanolaSnpsToProtein``.
    """

    # Routes this model to the "canola_nssnp" database bind.
    __bind_key__ = "canola_nssnp"
    __tablename__ = "protein_reference"

    # Auto-incrementing primary key; the target of
    # CanolaSnpsToProtein.protein_reference_id.
    protein_reference_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
    # Required gene identifier (at most 45 characters).
    gene_identifier: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    # Optional gene name (at most 45 characters); may be NULL.
    gene_name: db.Mapped[Optional[str]] = db.mapped_column(db.String(45), nullable=True)

    # Child CanolaSnpsToProtein rows. The backref adds a ``protein`` attribute
    # on each child, and the "all, delete-orphan" cascade removes children
    # together with (or when detached from) their parent.
    proteinsJoin = db.relationship("CanolaSnpsToProtein", backref="protein", cascade="all, delete-orphan")


class CanolaSnpsToProtein(db.Model):
    """ORM model for the ``snps_to_protein`` table on the ``canola_nssnp`` bind.

    Each row describes one SNP and its effect on the protein referenced by
    ``protein_reference_id``.
    """

    # Routes this model to the "canola_nssnp" database bind.
    __bind_key__ = "canola_nssnp"
    __tablename__ = "snps_to_protein"

    # The primary key is composite: (snps_reference_id, protein_reference_id).
    snps_reference_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
    # Foreign key to protein_reference.protein_reference_id, declared with
    # ON DELETE CASCADE.
    protein_reference_id: db.Mapped[int] = db.mapped_column(
        db.Integer(), db.ForeignKey("protein_reference.protein_reference_id", ondelete="CASCADE"), primary_key=True
    )
    # Position within the transcript, and the chromosome name / position.
    # NOTE(review): units and indexing base are not visible here - confirm
    # against the data load.
    transcript_pos: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
    chromosome: db.Mapped[str] = db.mapped_column(db.String(25), nullable=False)
    chromosomal_loci: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
    # Reference base is a single character; the alternate allows up to 45.
    ref_DNA: db.Mapped[str] = db.mapped_column(db.String(1), nullable=False)
    alt_DNA: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    # Amino-acid position; presumably 1-based, since the SNP endpoint subtracts
    # 1 to produce a zero-indexed value - verify.
    aa_pos: db.Mapped[int] = db.mapped_column(db.Integer(), nullable=False)
    # Reference / alternate amino acids, each at most 3 characters.
    ref_aa: db.Mapped[str] = db.mapped_column(db.String(3), nullable=False)
    alt_aa: db.Mapped[str] = db.mapped_column(db.String(3), nullable=False)
    # NOTE(review): ``type`` shadows the builtin inside this class body; it is
    # the mapped attribute name, so renaming it would change the interface.
    type: db.Mapped[str] = db.mapped_column(db.String(50), nullable=False)
    effect_impact: db.Mapped[str] = db.mapped_column(db.String(50), nullable=False)
    transcript_biotype: db.Mapped[Optional[str]] = db.mapped_column(db.String(45), nullable=True)
    # NOTE(review): annotated as float, but this is a Numeric(10, 5) column,
    # which SQLAlchemy returns as decimal.Decimal by default; the SNP endpoint
    # wraps the value in float() - confirm the annotation is intended.
    alt_freq: db.Mapped[float] = db.mapped_column(db.Numeric(10, 5), nullable=False)
14 changes: 14 additions & 0 deletions api/models/homologs_db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from api import db


class homologs(db.Model):
    """ORM model for the ``homologs`` table on the ``homologs_db`` bind.

    Each row pairs a search protein/species with a result protein/species and
    stores the similarity figures for that pair.
    """

    # Routes this model to the "homologs_db" database bind.
    __bind_key__ = "homologs_db"
    __tablename__ = 'homologs'

    # Auto-incrementing primary key.
    homologs_id: db.Mapped[int] = db.mapped_column(db.Integer(), primary_key=True, autoincrement=True)
    # Protein and species names for the query side and the result side of the
    # pair (each at most 45 characters, all required).
    search_protein_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    result_protein_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    search_species_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    result_species_name: db.Mapped[str] = db.mapped_column(db.String(45), nullable=False)
    # NOTE(review): annotated as float, but this is a Numeric(10, 5) column,
    # which SQLAlchemy returns as decimal.Decimal by default; the homologs
    # endpoint converts it with float() - confirm the annotation is intended.
    Percent_id: db.Mapped[float] = db.mapped_column(db.Numeric(10, 5), nullable=False)
    # Stored as text (at most 10 characters); the homologs endpoint converts
    # it with float() before sorting, so it is expected to be numeric text.
    e_score: db.Mapped[str] = db.mapped_column(db.String(10), nullable=False)
150 changes: 117 additions & 33 deletions api/resources/snps.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,11 @@
SnpsReference as SoybeanSnpsReference,
SamplesLookup as SoybeanSampleNames,
)
from api.models.canola_nssnp import (
CanolaProteinReference as CanolaProteinReference,
CanolaSnpsToProtein as CanolaSnpsToProtein
)
from api.models.homologs_db import homologs as HomologsDB
from api.utils.bar_utils import BARUtils
from flask import request
import re
Expand Down Expand Up @@ -141,7 +146,7 @@ class GeneNameAlias(Resource):
def get(self, species="", gene_id=""):
"""Endpoint returns annotated SNP poplar data in order of (to match A th API format):
AA pos (zero-indexed), sample id, 'missense_variant','MODERATE', 'MISSENSE', codon/DNA base change,
AA change (DH), pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
AA change (DH), pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype (allele frequency for canola)
values with single quotes are fixed"""
results_json = []

Expand All @@ -160,43 +165,76 @@ def get(self, species="", gene_id=""):
protein_reference = SoybeanProteinReference
snps_to_protein = SoybeanSnpsToProtein
snps_reference = SoybeanSnpsReference
elif species == "canola" and BARUtils.is_canola_gene_valid(gene_id):
protein_reference = CanolaProteinReference
snps_to_protein = CanolaSnpsToProtein
else:
return BARUtils.error_exit("Invalid gene id"), 400

rows = (
db.session.execute(
db.select(protein_reference, snps_to_protein, snps_reference)
.select_from(protein_reference)
.join(snps_to_protein)
.join(snps_reference)
.where(protein_reference.gene_identifier == gene_id)
if species == "canola" and BARUtils.is_canola_gene_valid(gene_id):
rows = (
db.session.execute(
db.select(protein_reference, snps_to_protein)
.select_from(protein_reference)
.join(snps_to_protein)
.where(protein_reference.gene_identifier == gene_id)
)
.tuples()
.all()
)
for protein, snptoprotein in rows:
itm_lst = [
snptoprotein.chromosome,
snptoprotein.aa_pos - 1, # zero index-ed
None,
"missense_variant",
"MODERATE",
"MISSENSE",
str(snptoprotein.transcript_pos) + snptoprotein.ref_DNA + ">" + snptoprotein.alt_DNA,
snptoprotein.ref_aa + snptoprotein.alt_aa,
None,
gene_id,
"protein_coding",
"CODING",
protein.gene_name,
float(snptoprotein.alt_freq),
]
results_json.append(itm_lst)
else:
rows = (
db.session.execute(
db.select(protein_reference, snps_to_protein, snps_reference)
.select_from(protein_reference)
.join(snps_to_protein)
.join(snps_reference)
.where(protein_reference.gene_identifier == gene_id)
)
.tuples()
.all()
)
.tuples()
.all()
)

# BAR A Th API format is chr, AA pos (zero-indexed), sample id, 'missense_variant',
# 'MODERATE', 'MISSENSE', codon/DNA base change, AA change (DH),
# pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
for protein, snpsjoin, snpstbl in rows:
itm_lst = [
snpstbl.chromosome,
# snpstbl.chromosomal_loci,
snpsjoin.aa_pos - 1, # zero index-ed
snpstbl.sample_id,
"missense_variant",
"MODERATE",
"MISSENSE",
str(snpsjoin.transcript_pos) + snpsjoin.ref_DNA + ">" + snpsjoin.alt_DNA,
snpsjoin.ref_aa + snpsjoin.alt_aa,
None,
re.sub(r".\d$", "", protein.gene_identifier),
"protein_coding",
"CODING",
protein.gene_identifier,
None,
]
results_json.append(itm_lst)
# BAR A Th API format is chr, AA pos (zero-indexed), sample id, 'missense_variant',
# 'MODERATE', 'MISSENSE', codon/DNA base change, AA change (DH),
# pro length, gene ID, 'protein_coding', 'CODING', transcript id, biotype
for protein, snpsjoin, snpstbl in rows:
itm_lst = [
snpstbl.chromosome,
# snpstbl.chromosomal_loci,
snpsjoin.aa_pos - 1, # zero index-ed
snpstbl.sample_id,
"missense_variant",
"MODERATE",
"MISSENSE",
str(snpsjoin.transcript_pos) + snpsjoin.ref_DNA + ">" + snpsjoin.alt_DNA,
snpsjoin.ref_aa + snpsjoin.alt_aa,
None,
re.sub(r".\d$", "", protein.gene_identifier),
"protein_coding",
"CODING",
protein.gene_identifier,
None,
]
results_json.append(itm_lst)

# Return results if there are data
if len(results_json) > 0:
Expand Down Expand Up @@ -502,3 +540,49 @@ def get(self, pval="", araid="", popid=""):
pop_both_sig_idx = HotspotUtils.get_sig_index(pop_both_sig)
output = {"ara_id": araid, "pop_id": popid, "ara_hotspots": ara_both_sig_idx, "pop_hotspots": pop_both_sig_idx}
return BARUtils.success_exit(output)


@snps.route("/homologs/<string:search_species>/<string:search_gene>/<string:target_species>")
class Homologs(Resource):
    @snps.param("search_species", _in="path", default="canola")
    @snps.param("search_gene", _in="path", default="BnaA07g31480D")
    @snps.param("target_species", _in="path", default="arabidopsis")
    def get(self, search_species="", search_gene="", target_species=""):
        """Return the homologous proteins of search_gene in target_species.

        The response holds at most five homolog pairs, ordered by ascending
        e score. Each pair contains: search_species_name, search_protein_name,
        result_species_name, result_protein_name, Percent_id (percent identity
        reported by BLAST) and e_score.

        :param search_species: species of the query gene ("arabidopsis" or "canola")
        :param search_gene: gene identifier to look up
        :param target_species: species to search for homologs in
        :return: success payload with HTTP 200, or an error payload with HTTP 400
            when a species is unsupported, the gene id is malformed, or no
            homologs are stored for the query
        """
        # Escape input
        search_species = escape(search_species)
        gene_id = escape(search_gene)
        target_species = escape(target_species)

        supported = ("arabidopsis", "canola")
        if search_species not in supported or target_species not in supported:
            return BARUtils.error_exit("Species not supported"), 400

        # The gene id format is validated against the *search* species only.
        gene_is_valid = (
            search_species == "arabidopsis" and BARUtils.is_arabidopsis_gene_valid(gene_id)
        ) or (search_species == "canola" and BARUtils.is_canola_gene_valid(gene_id))
        if not gene_is_valid:
            return BARUtils.error_exit("Invalid gene id"), 400

        # Use the select()/session.execute() style already used by the other
        # endpoints in this module rather than the legacy Model.query API.
        results = (
            db.session.execute(
                db.select(HomologsDB).where(
                    HomologsDB.search_protein_name == gene_id,
                    HomologsDB.search_species_name == search_species,
                    HomologsDB.result_species_name == target_species,
                )
            )
            .scalars()
            .all()
        )
        if not results:
            return BARUtils.error_exit("No homologs found for the given query"), 400

        homologs_list = [
            {
                "search_species_name": search_species,
                "search_protein_name": gene_id,
                "result_species_name": target_species,
                "result_protein_name": homolog.result_protein_name,
                "Percent_id": float(homolog.Percent_id),
                # e_score is stored as text; convert so the sort is numeric.
                "e_score": float(homolog.e_score),
            }
            for homolog in results
        ]
        # Smallest e score first (best hits); slicing is a no-op for short lists.
        homologs_list.sort(key=lambda x: x["e_score"])
        response = {"homologs": homologs_list[:5]}
        return BARUtils.success_exit(response), 200
11 changes: 11 additions & 0 deletions api/utils/bar_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,17 @@ def is_cannabis_gene_valid(gene):
else:
return False

@staticmethod
def is_canola_gene_valid(gene):
    """This function verifies if canola gene (BnaC07g42830D) is valid

    Accepted form (case-insensitive): "Bna", a sub-genome letter A or C, two
    digits, "g", five digits and an optional trailing letter A-D.
    :param gene: gene identifier string to check
    :return: True if the whole string is a canola gene id, False otherwise
    """
    # fullmatch anchors both ends; search with "$" would also accept an id
    # followed by a trailing newline.
    return re.fullmatch(r"Bna[AC]\d{2}g\d{5}[A-D]?", gene, re.I) is not None

@staticmethod
def is_arachis_gene_valid(gene):
"""This function verifies if arachis gene is valid: Adur10000_comp0_c0_seq1
Expand Down
5 changes: 4 additions & 1 deletion config/BAR_API.cfg
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -40,5 +40,8 @@ SQLALCHEMY_BINDS = {
'striga' : 'mysql://root:root@localhost/striga',
'tomato_nssnp' : 'mysql://root:root@localhost/tomato_nssnp',
'tomato_sequence' : 'mysql://root:root@localhost/tomato_sequence',
'triphysaria' : 'mysql://root:root@localhost/triphysaria'
'triphysaria' : 'mysql://root:root@localhost/triphysaria',
'canola_nssnp' : 'mysql://root:root@localhost/canola_nssnp',
'homologs_db' : 'mysql://root:root@localhost/homologs_db'

}
Loading
Loading