diff --git a/CIRIquant/circ.py b/CIRIquant/circ.py index 57f92bd..b65ad38 100644 --- a/CIRIquant/circ.py +++ b/CIRIquant/circ.py @@ -4,15 +4,16 @@ import sys import re import logging +import functools import pysam import time import subprocess from multiprocessing import Pool from collections import defaultdict -from itertools import izip_longest +from itertools import zip_longest -import utils +from CIRIquant import utils LOGGER = logging.getLogger('CIRIquant') PREFIX = re.compile(r'(.+)[/_-][12]') @@ -113,7 +114,7 @@ def load_fai(fname): content = line.rstrip().split('\t') chrom, length, start, eff_length, line_length = content shift_length = int(length) * int(line_length) / int(eff_length) - faidx[chrom] = [int(start), shift_length] + faidx[chrom] = [int(start), int(shift_length)] return faidx @@ -163,7 +164,7 @@ def generate_index(log_file, circ_info, circ_fasta): """ - from logger import ProgressBar + from CIRIquant.logger import ProgressBar fai = utils.FASTA + '.fai' if not os.path.exists(fai): @@ -242,13 +243,13 @@ def denovo_alignment(log_file, thread, reads, outdir, prefix): denovo_bam = '{}/circ/{}_denovo.bam'.format(outdir, prefix) sorted_bam = '{}/circ/{}_denovo.sorted.bam'.format(outdir, prefix) - align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index -1 {} -2 {} | {} view -bS > {}'.format( + reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}" + align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index {} | {} view -bS > {}'.format( utils.HISAT2, thread, outdir, prefix, - reads[0], - reads[1], + reads_string, utils.SAMTOOLS, denovo_bam, ) @@ -287,7 +288,7 @@ def grouper(iterable, n, fillvalue=None): """ args = [iter(iterable)] * n - return izip_longest(*args, fillvalue=None) + return zip_longest(*args, fillvalue=None) def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type): @@ -309,7 +310,7 @@ def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type): pool = Pool(thread, denovo_initializer, (bam_file, circ_info, threshold, )) jobs = [] - chunk_size = max(500, len(header) / thread + 1) + chunk_size = max(500, int(len(header) / thread) + 1) for circ_chunk in grouper(header, chunk_size): jobs.append(pool.apply_async(denovo_worker, (circ_chunk, lib_type, ))) pool.close() @@ -412,7 +413,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir, fsj reads of circRNAs, pair_id -> mate_id -> circ_id """ - import cPickle + import pickle LOGGER.info('Detecting FSJ reads from genome alignment file') sam = pysam.AlignmentFile(bam_file, 'rb') @@ -434,7 +435,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir, tmp = job.get() if tmp is None: continue - res = tmp if isinstance(tmp, dict) else cPickle.load(open(tmp, 'rb')) + res = tmp if isinstance(tmp, dict) else pickle.load(open(tmp, 'rb')) chrom_fp_bsj, chrom_fsj, chrom_cand = res['fp_bsj'], res['fsj_reads'], res['cand_to_genome'] for pair_id, mate_id in chrom_fp_bsj: fp_bsj[pair_id][mate_id] = 1 @@ -445,13 +446,13 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir, circ_bsj = defaultdict(dict) circ_fsj = defaultdict(dict) for pair_id in cand_reads: - for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].iteritems(): + for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].items(): if pair_id in fp_bsj and mate_id in fp_bsj[pair_id]: continue circ_bsj[circ_id].update({query_prefix(pair_id): 1}) for pair_id in fsj_reads: - for mate_id, circ_id in fsj_reads[pair_id].iteritems(): + for mate_id, circ_id in fsj_reads[pair_id].items(): if pair_id in cand_reads and mate_id in cand_reads[pair_id] and not (pair_id in fp_bsj and mate_id in fp_bsj[pair_id]): continue circ_fsj[circ_id].update({query_prefix(pair_id): 1}) @@ -495,7 +496,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj): fsj_reads of circRNAs, (query_name, mate_id, circ_id) """ - import cPickle + import pickle if chrom not in CIRC: return None @@ -524,7 +525,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj): fsj_reads = [] else: fsj_reads = [] - for circ_id, parser in CIRC[chrom].iteritems(): + for circ_id, parser in CIRC[chrom].items(): # FSJ across start site for read in sam.fetch(region='{0}:{1}-{1}'.format(chrom, parser.start)): if read.is_unmapped or read.is_supplementary: @@ -550,10 +551,10 @@ def genome_worker(chrom, tmp_dir, is_no_fsj): res = {'fp_bsj': fp_bsj, 'fsj_reads': fsj_reads, 'cand_to_genome': cand_to_genome} - res_to_string = cPickle.dumps(res, 0) + res_to_string = pickle.dumps(res, 0) if sys.getsizeof(res_to_string) > 1024 * 1024 * 1024: pkl_file = "{}/{}.pkl".format(tmp_dir, chrom) - cPickle.dump(res, open(pkl_file, "wb"), -1) + pickle.dump(res, open(pkl_file, "wb"), -1) return pkl_file return res @@ -633,7 +634,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre output file name """ - from utils import check_dir + from CIRIquant.utils import check_dir circ_dir = '{}/circ'.format(outdir) check_dir(circ_dir) @@ -683,7 +684,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre else: circ_exp = sample_exp - from version import __version__ + from CIRIquant.version import __version__ header += ['version: {}'.format(__version__), ] gtf_info = index_annotation(utils.GTF) format_output(circ_info, circ_exp, sample_stat, header, gtf_info, out_file) @@ -762,8 +763,8 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile): with open(outfile, 'w') as out: for h in header: out.write('##' + h + '\n') - for chrom in sorted(circ_info.keys(), key=by_chrom): - for circ_id in sorted(circ_info[chrom].keys(), cmp=by_circ, key=lambda x:circ_info[chrom][x]): + for chrom in sorted(circ_info.keys(), key=functools.cmp_to_key(by_chrom)): + for circ_id, _ in sorted(circ_info[chrom].items(), key=functools.cmp_to_key(by_circ)): if circ_id not in circ_exp or circ_exp[circ_id]['bsj'] == 0: continue parser = circ_info[chrom][circ_id] @@ -801,25 +802,28 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile): return 1 -def by_chrom(x): +def by_chrom(x: str, y: str): """ Sort by chromosomes """ - chrom = x - if x.startswith('chr'): - chrom = chrom.strip('chr') - try: - chrom = int(chrom) - except ValueError as e: - pass - return chrom + def format_chrom(value: str) -> str | int: + if value.startswith('chr'): + value = value[len('chr'):] + return int(value) if value.isnumeric() else value + + x, y = format_chrom(x), format_chrom(y) + + if type(x) == type(y): + return x.__lt__(y) + else: + return 1 if type(x) == int else -1 def by_circ(x, y): """ Sort circRNAs by the start and end position """ - return x.end - y.end if x.start == y.start else x.start - y.start + return x[1].end - y[1].end if x[1].start == y[1].start else x[1].start - y[1].start class GTFParser(object): @@ -869,8 +873,8 @@ def index_annotation(gtf): # if 'gene_type' in parser.attr and parser.attr['gene_type'] in ['lincRNA', 'pseudogene']: # continue - start_div, end_div = parser.start / 500, parser.end / 500 - for i in xrange(start_div, end_div + 1): + start_div, end_div = int(parser.start / 500), int(parser.end / 500) + for i in range(start_div, end_div + 1): gtf_index[parser.chrom].setdefault(i, []).append(parser) return gtf_index @@ -882,13 +886,13 @@ def circRNA_attr(gtf_index, circ): if circ.chrom not in gtf_index: LOGGER.warn('chrom of contig "{}" not in annotation gtf, please check'.format(circ.chrom)) return {} - start_div, end_div = circ.start / 500, circ.end / 500 + start_div, end_div = int(circ.start / 500), int(circ.end / 500) host_gene = {} start_element = defaultdict(list) end_element = defaultdict(list) - for x in xrange(start_div, end_div + 1): + for x in range(start_div, end_div + 1): if x not in gtf_index[circ.chrom]: continue for element in gtf_index[circ.chrom][x]: diff --git a/CIRIquant/de.py b/CIRIquant/de.py index 4cca2ce..e2a3bd2 100644 --- a/CIRIquant/de.py +++ b/CIRIquant/de.py @@ -12,7 +12,7 @@ import numexpr as ne ne.set_num_threads(4) -from version import __version__ +from CIRIquant.version import __version__ LOGGER = logging.getLogger('CIRI_DE') CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj') @@ -20,9 +20,9 @@ def main(): global LOGGER - from circ import grouper - from logger import get_logger - from utils import check_file, get_thread_num + from CIRIquant.circ import grouper + from CIRIquant.logger import get_logger + from CIRIquant.utils import check_file, get_thread_num # Init argparser parser = argparse.ArgumentParser(prog="CIRIquant") @@ -178,7 +178,7 @@ def correction_worker(circ_ids, factor): def load_gtf(in_file): - from circ import GTFParser + from CIRIquant.circ import GTFParser LOGGER.info('Loading CIRIquant result: {}'.format(in_file)) diff --git a/CIRIquant/main.py b/CIRIquant/main.py old mode 100644 new mode 100755 index 7b4f362..da73392 --- a/CIRIquant/main.py +++ b/CIRIquant/main.py @@ -7,12 +7,11 @@ def main(): - from version import __version__ - import circ - import pipeline - from logger import get_logger - from utils import check_file, check_dir, check_config, get_thread_num - from utils import CIRCparser, TOOLS + from CIRIquant.version import __version__ + from CIRIquant import circ, pipeline + from CIRIquant.logger import get_logger + from CIRIquant.utils import check_file, check_dir, check_config, get_thread_num + from CIRIquant.utils import CIRCparser, TOOLS # Init argparser parser = argparse.ArgumentParser(prog='CIRIquant') @@ -20,6 +19,8 @@ def main(): # required arguments parser.add_argument('--config', dest='config_file', metavar='FILE', help='Config file in YAML format', ) + parser.add_argument('-r', '--read', dest='mate1', metavar='MATE1', + help='Input reads (for single-end data)', ) parser.add_argument('-1', '--read1', dest='mate1', metavar='MATE1', help='Input mate1 reads (for paired-end data)', ) parser.add_argument('-2', '--read2', dest='mate2', metavar='MATE2', @@ -77,8 +78,11 @@ def main(): """Check required parameters""" # check input reads - if args.mate1 and args.mate2: - reads = [check_file(args.mate1), check_file(args.mate2)] + if args.mate1: + if args.mate2: + reads = [check_file(args.mate1), check_file(args.mate2)] + else: + reads = [check_file(args.mate1)] else: sys.exit('No input files specified, please see manual for detailed information') @@ -136,7 +140,7 @@ def main(): """Start Running""" os.chdir(outdir) - logger.info('Input reads: ' + ','.join([os.path.basename(args.mate1), os.path.basename(args.mate2)])) + logger.info('Input reads: ' + ','.join([os.path.basename(read) for read in reads])) if lib_type == 0: lib_name = 'unstranded' diff --git a/CIRIquant/pipeline.py b/CIRIquant/pipeline.py index e7729a3..237bfd8 100644 --- a/CIRIquant/pipeline.py +++ b/CIRIquant/pipeline.py @@ -4,7 +4,7 @@ import sys import logging import subprocess -import utils +from CIRIquant import utils LOGGER = logging.getLogger('CIRIquant') @@ -13,12 +13,13 @@ def align_genome(log_file, thread, reads, outdir, prefix): align_dir = outdir + '/align' utils.check_dir(align_dir) sorted_bam = '{}/{}.sorted.bam'.format(align_dir, prefix) - hisat_cmd = '{0} -p {1} --dta -q -x {2} -1 {3} -2 {4} -t --new-summary | {5} sort -o {6} --threads {1} -'.format( + + reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}" + hisat_cmd = '{0} -p {1} --dta -q -x {2} {3} -t --new-summary | {4} sort -o {5} --threads {1} -'.format( utils.HISAT2, thread, utils.HISAT_INDEX, - reads[0], - reads[1], + reads_string, utils.SAMTOOLS, sorted_bam ) diff --git a/CIRIquant/prep_CIRIquant.py b/CIRIquant/prep_CIRIquant.py index d5b183b..48ba507 100644 --- a/CIRIquant/prep_CIRIquant.py +++ b/CIRIquant/prep_CIRIquant.py @@ -5,7 +5,7 @@ import argparse import logging from collections import namedtuple -from version import __version__ +from CIRIquant.version import __version__ LOGGER = logging.getLogger('prep_CIRIquant') CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj') @@ -13,7 +13,7 @@ def load_gtf(in_file): - from circ import GTFParser + from CIRIquant.circ import GTFParser LOGGER.info('Loading CIRIquant result: {}'.format(in_file)) circ_data = {} @@ -47,8 +47,8 @@ def load_gtf(in_file): def main(): global LOGGER - from logger import get_logger - from utils import check_file + from CIRIquant.logger import get_logger + from CIRIquant.utils import check_file # Init argparser parser = argparse.ArgumentParser(prog="prep_CIRIquant") diff --git a/CIRIquant/replicate.py b/CIRIquant/replicate.py index 5642ff5..5c8a760 100644 --- a/CIRIquant/replicate.py +++ b/CIRIquant/replicate.py @@ -5,15 +5,15 @@ import argparse import logging import subprocess -from version import __version__ +from CIRIquant.version import __version__ LOGGER = logging.getLogger('CIRI_DE') def main(): global LOGGER - from logger import get_logger - from utils import check_file + from CIRIquant.logger import get_logger + from CIRIquant.utils import check_file # Init argparser parser = argparse.ArgumentParser(prog="CIRIquant_DE_replicate") diff --git a/CIRIquant/utils.py b/CIRIquant/utils.py index 60d02f5..9fbc862 100644 --- a/CIRIquant/utils.py +++ b/CIRIquant/utils.py @@ -1,8 +1,8 @@ #! /usr/bin/env python # -*- encoding:utf-8 -*= import os -from commands import getstatusoutput import subprocess +from packaging.version import Version import sys import logging LOGGER = logging.getLogger('CIRIquant') @@ -122,19 +122,15 @@ def check_config(config_file): def check_software(cmd): # Get software path from environment - from commands import getstatusoutput - status, ret = getstatusoutput('which {}'.format(cmd)) + status, ret = subprocess.getstatusoutput('which {}'.format(cmd)) if status == 0: return ret else: return None - def check_samtools_version(samtools): - from commands import getoutput - from distutils.version import LooseVersion - version = getoutput('{} --version'.format(samtools).split('\n')[0].split(' ')[1]) - if version and cmp(LooseVersion(version), LooseVersion('1.9')) < 0: + version = subprocess.getoutput('{} --version'.format(samtools)).split('\n')[0].split(' ')[1] + if version and Version(version) < Version('1.9'): raise ConfigError('samtools version too low, 1.9 required') return 1 diff --git a/requirements.txt b/requirements.txt index cdb3b13..24eda86 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,5 @@ -numexpr==2.6.9 -numpy==1.16.4 -pysam==0.15.2 -PyYAML==5.4 -scikit-learn==0.20.3 -scipy==1.2.2 -argparse>=1.2.1 +numpy==2.1.0 +pysam==0.22.1 +PyYAML==6.0.1 +scikit-learn==1.5.1 +scipy==1.14.1 diff --git a/setup.py b/setup.py index da4c6c4..e94ee8d 100644 --- a/setup.py +++ b/setup.py @@ -42,7 +42,7 @@ def read(infile): include_package_data=True, zip_safe=False, install_requires=[ - 'argparse>=1.2.1', 'PyYAML==5.4', 'pysam==0.15.2', 'numpy==1.16.4', - 'scipy==1.2.2', 'scikit-learn==0.20.3', 'numexpr==2.6.9', + 'PyYAML==6.0.1', 'pysam==0.22.1', 'numpy==2.1.0', + 'scipy==1.14.1', 'scikit-learn==1.5.1', ], )