frakka/plot.py at main · stroehleina/frakka · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
from utils import Logger
from utils import CountRecord
from math import log, ceil
from statistics import median
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sys
import os

# Importing messaging and error methods from FrakkaUtils
msg=Logger.msg
err=Logger.err

class Plotter:
	'''A class for creating plots from frakka output objects'''
	def __init__(self, outdir, file, other_co, drop, score, prefix):
		self.outdir = outdir
		self.file = file
		self.other_co = int(other_co)
		self.drop = int(drop)
		self.score = score
		self.prefix = prefix

class CountPlotter(Plotter):
	'''A Plotter to plot per-species confidence summaries'''
	def __init__(self, outdir, file, counts, other_co, drop, score, prefix):
		super().__init__(outdir, file, other_co, drop, score, prefix)
		self.counts = counts

	def plot(self):
		'''Reads a list of CountsRecord objects and creates a plot from each'''

		# Applying minimum read count (drop species with less than drop reads)
		msg(f'Applying minimum read count cut-off. Of {len(self.counts)} species, those with less than {self.drop} reads will not be plotted.')
		self.counts = [c for c in self.counts if c.read_count >= self.drop]
		msg(f'Applied minimum read count cut-off. {len(self.counts)} species remaining.')

		# Merging counts for low-abundance species read_counts
		msg(f'Merging species with read count below {self.other_co} into category "Other".')
		other = [c.read_count for c in self.counts if c.read_count < self.other_co]
		other_sum = sum(other)
		msg(f'Merged {len(other)} species (total reads: {other_sum}) into category "Other".')
		self.counts = [c for c in self.counts if c.read_count >= self.other_co]

		if self.other_co != 0:
			cr = CountRecord(file=self.file, truespec='Other', kspec='Other', species='Other', read_count=other_sum)
			# INFO cannot provide median of the merged "Other" category
			# INFO There is not a way to calculate the median based on medians of subsets of the whole and still be statistically accurate.
			# INFO We also cannot use the means of the subsets, given that they are not of equal size.
			self.counts += [cr]

		plot_lst = []
		# Sort the list of counts objects by read_count
		for c in self.counts:
			color='darkgrey'

			if c.kspec == '9606' or c.kspec == 'Homo sapiens':
				color='darkred'

			if c.kspec == c.truespec and c.kspec != 'Other':
				color='darkgreen'

			plot_lst.append({'name' : c.kspec, 'read_count' : int(c.read_count), 'color' : color})

		# Sort by reverse numerical 'read_count' then by 'name'
		plot_lst.sort(key=lambda e: (-e['read_count'], e['name']))

		# Adjust upper x_bound based on entry with highest read count
		max_rc = int(sorted(plot_lst, key=lambda e: e['read_count'], reverse=True)[0]['read_count'])

		# Adjust image size based on longest read name
		max_name = len(sorted(plot_lst, key=lambda e: len(e['name']), reverse=True)[0]['name'])

		plt.rcdefaults()
		# msg(f'590 pixels corresponds to:')
		# msg(f'5 + 0.1 * max_name = {5 + 0.1 * max_name}')
		# msg(f'80275 pixels corresponds to:')
		# msg(f'0.25 * len(plot_lst) = {0.25 * len(plot_lst)}')

		# maximum is 65536 pixels
		# NOTE reduced dpi?

		ax = plt.subplots(figsize=(5 + 0.1 * max_name, 0.25 * len(plot_lst)))[1]
		ax.set_xbound(lower=1, upper=max_rc)
		plt.xscale('log', base=10)

		species = [d['name'] for d in plot_lst]
		y_pos = [i+1 for i,v in enumerate(species)]
		plt_counts = [d['read_count'] for d in plot_lst]
		plt_colors = [d['color'] for d in plot_lst]

		ax.barh(y_pos, plt_counts, color=plt_colors)
		ax.set_yticks(y_pos, labels=species, fontstyle='italic')
		ax.invert_yaxis()
		ax.set_xlabel('Read counts')
		ax.set_title(f'Distribution of reads per species\nwith confidence score \u2265 {self.score}\n(file {self.file.split("/")[-1]})')
		ax.set_xmargin(0.3)

		for i, v in enumerate(plt_counts):
			ax.text(v, i+1, str(v), ha='left', va='center')

		plt.subplots_adjust(left=0.4)

		# chop file ending, replace all "." by "__" and add .pdf
		outfile = self.prefix + 'counts_' + '__'.join(self.file.split('/')[-1].split('.')[:-1]) + f'_c-o_{self.score}.pdf'

		figpath = '/'.join([self.outdir, outfile])
		msg(f'Saving filtered read counts-per-species plot for file {self.file} to {figpath}')
		plt.savefig(figpath)

class ReadPlotter(Plotter):
	'''A Plotter to plot per-file (handled in main) read confidence distributions'''
	def __init__(self, outdir, file, rcl, other_co, drop, score, prefix):
		super().__init__(outdir, file, other_co, drop, score, prefix)
		# a list of ReadRecord objects
		self.rcl = rcl

	def plot(self):
		'''Reads a list of ReadRecord objects (which have a confidence score and a species and a taxid)'''

		# Non-redundant set of all species present in the list of read records
		# species = set([rc.kspec for rc in rcl]) # don't need this we will just fill dynamically by iterating over all ReadRecords

		species = {}

		for rc in self.rcl:
			# file, truespec, kspec, read_id, score
			# TODO allow to filter for truespec only
			# TODO color addition for truespec
			try:
				species[rc.kspec]['scores'].append(rc.score)
			except KeyError:
				species[rc.kspec] = {'scores' : [rc.score]}

		for s in list(species):

			# remove species for which the length of the list of values is shorter than self.drop (--minreads)
			if len(species[s]['scores']) <= self.drop:
				msg(f'Species {s} has {len(species[s]["scores"])} reads (<= {self.drop}). Skipping record.')
				del species[s]
				continue
			else:
				try:
					species['All']['scores'] += species[s]['scores']
				except KeyError as e:
					species['All'] = {'scores' : species[s]['scores'].copy()}

				# msg(f'Adding {len(species[s]["scores"])} records of species {s} to the total ("All").')
				# msg(f'Total number of All: {len(species["All"]["scores"])}')

			# concatenate all lists of values from species for which the length of the list is shorter than other_co=0
			if len(species[s]['scores']) <= self.other_co:
				try:
					species['Other']['scores'] += species[s]['scores']
				except KeyError as e:
					species['Other'] = {'scores' : species[s]['scores'].copy()}

				# msg(f'Adding {len(species[s]["scores"])} records to "Other".')
				# msg(f'Total number of "Other": {len(species["Other"]["scores"])}')

			species[s]['median'] = round(median(species[s]['scores']), 2)


		if 'All' not in species.keys():
			err(f'No reads were added for plotting, try reducing the minimal required read count {self.drop} per species (set via --minreads).')
		species['All']['median'] = round(median(species['All']['scores']), 2)

		if 'Other' in species.keys():
			species['Other']['median'] = round(median(species['Other']['scores']), 2)

		# sort species by length of scores list
		species = dict(sorted(species.items(), key=lambda e: len(e[1]['scores']), reverse=True))

		# Create long format pandas object and feed that to displot (should automatically create the facetting we want)
		longlst = []
		for s in species:
			for c in species[s]['scores']:
				longlst.append({'species' : s, 'score' : c, 'median' : species[s]['median']})

		plot_df = pd.DataFrame(longlst)

		plot = sns.displot(plot_df, x='score', col='species', kind='kde', col_wrap=3, color='black', facet_kws={'sharey': False, 'sharex' : False}, warn_singular=False)

		title_f = self.file.split('/')[-1]
		plot.fig.subplots_adjust(top=0.95)
		plot.fig.suptitle(f'Confidence score distribution for file {title_f} (score cut-off: {self.score})')
		plot.set(xlim=(0, 1))
		plot.set_titles(col_template = '{col_name}', fontstyle='italic')
		plot.set_axis_labels('Confidence score')
		spec_median = plot_df[["species", "median"]].drop_duplicates()
		axes = plot.fig.axes

		for i,ax in enumerate(axes):
			ax.axvline(spec_median.iloc[i]['median'], color='red')

		# chop file ending, replace all "." by "__" and add .pdf
		outfile = self.prefix + 'reads_' + '__'.join(self.file.split('/')[-1].split('.')[:-1]) + f'_species_distr_{self.score}.pdf'

		figpath = '/'.join([self.outdir, outfile])
		msg(f'Saving filtered distribution plots for {len(axes)} species (including "All" and "Other" categories, if specified) for file {self.file} to {figpath}')
		plot.figure.savefig(figpath)