Commit ec4dbbf: Initial commit (root commit)
sean-bam committed Feb 3, 2024
Showing 51 changed files with 37,169 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitattributes
@@ -0,0 +1,2 @@
# Auto detect text files and perform LF normalization
* text=auto
129 changes: 129 additions & 0 deletions .gitignore
@@ -0,0 +1,129 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
2 changes: 2 additions & 0 deletions PyIsland/Drivers/__init__.py
@@ -0,0 +1,2 @@
from . import mmseqs
from . import hhsuite
118 changes: 118 additions & 0 deletions PyIsland/Drivers/hhsuite.py
@@ -0,0 +1,118 @@
from pathlib import Path
import shutil
import subprocess


def update_hhsuite_db(old_db, new_db, datfile):
    """Removes the entries listed in `datfile` from an HHsuite database."""

    old_db_ffindex = Path(old_db).with_suffix(".ffindex")
    old_db_ffdata = Path(old_db).with_suffix(".ffdata")

    assert (
        old_db_ffindex.is_file() and old_db_ffdata.is_file()
    ), f"""
    Can't find {old_db_ffindex} and/or {old_db_ffdata}
    """

    new_db_ffindex = Path(new_db).with_suffix(".ffindex")
    new_db_ffdata = Path(new_db).with_suffix(".ffdata")

    # make a copy of the old DB
    shutil.copyfile(old_db_ffindex, new_db_ffindex)
    shutil.copyfile(old_db_ffdata, new_db_ffdata)

    # remove the entries listed in the datfile
    subprocess.run(
        f"ffindex_modify -s -u -f {datfile} {new_db_ffindex}", shell=True, check=True
    )

    # sanity check: the old count minus the removed count should match the new count
    with open(old_db_ffindex) as infile:
        total_ids = sum(1 for _ in infile)

    with open(datfile) as infile:
        num_ids_to_remove = sum(1 for _ in infile)

    with open(new_db_ffindex) as infile:
        remaining_ids = sum(1 for _ in infile)

    assert (
        total_ids - num_ids_to_remove == remaining_ids
    ), f"""
    There are {total_ids} IDs in the old HHsuite DB, and you asked to remove {num_ids_to_remove},
    which should leave {total_ids - num_ids_to_remove} IDs in the new HHsuite DB.
    However, there are {remaining_ids} in the new DB, so something is off.
    Did you check the ID extension in the datfile?
    It should probably end in .a3m or .hhm
    """


def get_hhsuite_neff(msa):
    """Returns the effective number of sequences (Neff) of an MSA, as reported by hhalign."""
    p1 = subprocess.run(
        f"hhalign -i {msa} -o /dev/null -diff inf -id 100 -cpu 12",
        shell=True,
        check=True,
        capture_output=True,
        text=True,
    )

    # Neff is parsed as the 11th whitespace-separated token of hhalign's summary output
    neff = float(p1.stdout.strip().split()[10])
    return round(neff, 2)


def align_clusters_with_hhalign(input_dir, output_dir, df_clustering):
    """
    Accepts a table of profile name, profile length, and cluster assignment.
    For each cluster, runs hhalign with the longest profile as the query
    and all remaining profiles as templates.
    Singletons are simply converted to a3m.
    """
    input_dir = Path(input_dir)
    assert input_dir.is_dir(), f"{input_dir} isn't a directory!"

    output_dir = Path(output_dir)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)

    grouped = df_clustering.groupby("cluster")

    for name, group in grouped:
        output_a3m = output_dir / Path(f"cls_{name}").with_suffix(".a3m")

        # sort by profile length, longest first
        profiles = group.sort_values(by="qlen", ascending=False).qseq.tolist()

        # the first (longest) profile is the query
        query_profile = input_dir / Path(f"{profiles[0]}.afa")

        # for singletons, just convert to a3m
        if len(group) < 2:
            subprocess.run(
                f"reformat.pl fas a3m {query_profile} {output_a3m}",
                shell=True,
                check=True,
            )
        else:
            # build a string of the remaining profiles to pass to hhalign as templates
            templates = ""
            for profile in profiles[1:]:
                subject = input_dir / Path(f"{profile}.afa")

                assert subject.is_file(), f"can't find {subject}"
                templates = templates + f"-t {subject} "

            # run hhalign
            subprocess.run(
                f"hhalign -i {query_profile} {templates} -M 50 -glob -id 100 -diff inf -oa3m {output_a3m}",
                shell=True,
                check=True,
            )
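
For context, a minimal usage sketch of these drivers follows. The file paths, database names, and clustering table shown are illustrative assumptions, not part of this commit; the code itself only requires a table with qseq, qlen, and cluster columns and a directory of .afa profiles.

import pandas as pd

from PyIsland.Drivers import hhsuite

# measure the diversity (Neff) of one alignment (hypothetical path)
neff = hhsuite.get_hhsuite_neff("profiles/aln1.afa")
print(f"Neff: {neff}")

# merge clustered profiles into one a3m per cluster (hypothetical table;
# it must contain the qseq, qlen, and cluster columns the driver expects)
df_clustering = pd.read_csv("clustering.csv")
hhsuite.align_clusters_with_hhalign("profiles/", "alignments/", df_clustering)

# drop stale entries from an ffindex/ffdata database pair; the datfile lists
# one entry name per line, typically ending in .a3m or .hhm
hhsuite.update_hhsuite_db("db/oldHHdb", "db/newHHdb", "ids_to_remove.dat")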