Skip to content

Commit be14ae9

Browse files
committed
added relative indexing with seqspec index, added splitcode indexing, updated docs
1 parent 7accfa4 commit be14ae9

7 files changed

+430
-82
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
![python versions](https://img.shields.io/pypi/pyversions/seqspec)
66
[![license](https://img.shields.io/pypi/l/seqspec)](LICENSE)
77

8-
`seqspec` is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.
8+
`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a file format that describes data generated from genomics experiments. Both the file format and `seqspec` tool [enable uniform processing](./docs/UNIFORM.md) of genomics data.
99

1010
![alt text](docs/images/simple_file_structure.png)
1111
**Figure 1**: Anatomy of a `seqspec` file.

docs/SEQSPEC_FILE.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ authors:
77

88
# Overview
99

10-
`seqspec` is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.
10+
`seqspec`, short for "sequence specification" (pronounced "seek-speck"), is a machine-readable specification for annotating sequencing libraries produced by genomics assays. Genomic library structure depends on both the assay and sequencer (and kits) used to generate and bind the assay-specific construct to the sequencing adapters to generate a sequencing library. `seqspec` is specific to both a genomics assay and sequencer and provides a standardized format for describing the structure of sequencing libraries and the resulting sequencing reads. Specifically, a `seqspec` file is a machine-readable YAML file that annotates the content of molecules in genomic libraries, the structure of reads generated from them, and how those are stored in files.
1111

1212
The primary goal of a `seqspec` file is to:
1313

docs/SEQ_PRIMER.md

+6-1
Original file line numberDiff line numberDiff line change
@@ -51,4 +51,9 @@ Paper describing nanopore sequencing: https://www.nature.com/articles/nbt.1495
5151

5252
### Strandedness
5353

54-
Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule.
54+
Unlike Illumina, nanopore sequencing produces the direct nucleotide sequence in linear order starting at the beginning of the molecule. [But the read generated from a molecule can start from either end and either strand](https://nanoporetech.com/document/genomic-dna-by-ligation-sqk-lsk114). And the strand can be either the top or bottom strand. So a read is produced in one of four "orientations"
55+
56+
1. forward, top strand
57+
2. forward, reverse strand
58+
3. backward, top strand
59+
4. backward, reverse strand

seqspec/Region.py

+122-31
Original file line numberDiff line numberDiff line change
@@ -242,34 +242,6 @@ def complement(self):
242242
self.sequence = complement_sequence(self.sequence)
243243

244244

245-
def complement_nucleotide(nucleotide):
246-
complements = {
247-
"A": "T",
248-
"T": "A",
249-
"G": "C",
250-
"C": "G",
251-
"R": "Y",
252-
"Y": "R",
253-
"S": "S",
254-
"W": "W",
255-
"K": "M",
256-
"M": "K",
257-
"B": "V",
258-
"D": "H",
259-
"V": "B",
260-
"H": "D",
261-
"N": "N",
262-
"X": "X",
263-
}
264-
return complements.get(
265-
nucleotide, "N"
266-
) # Default to 'N' if nucleotide is not recognized
267-
268-
269-
def complement_sequence(sequence):
270-
return "".join(complement_nucleotide(n) for n in sequence.upper())
271-
272-
273245
class RegionCoordinate(Region):
274246
def __init__(
275247
self,
@@ -291,15 +263,106 @@ def __init__(
291263
self.start = start
292264
self.stop = stop
293265

294-
def __repr__(self):
295-
return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
266+
def __repr__(self) -> str:
267+
d = {
268+
"region": self.region_id,
269+
"start": self.start,
270+
"stop": self.stop,
271+
}
272+
return f"{d}"
273+
274+
# def __repr__(self):
275+
# return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"
296276

297277
def __str__(self):
298-
return f"RegionCoordinate {self.name} [{self.region_type}]: ({self.start}, {self.stop})"
278+
return f"RegionCoordinate {self.name} [{self.region_type}]: [{self.start}, {self.stop})"
299279

300280
def __eq__(self, other):
301281
return self.start == other.start and self.stop == other.stop
302282

283+
def __sub__(self, other):
284+
"""
285+
Define the subtraction between two RegionCoordinate objects.
286+
rc2 - rc1 should return a new RegionCoordinate with:
287+
- start as rc2.start - rc1.stop
288+
- stop as rc2.stop - rc1.start
289+
- min_len and max_len is max(self.start, other.start) - min(self.stop, other.stop)
290+
"""
291+
if not isinstance(other, RegionCoordinate):
292+
raise TypeError(
293+
f"Unsupported operand type(s) for -: 'RegionCoordinate' and '{type(other).__name__}'"
294+
)
295+
296+
# Case 1: self is to the left of other
297+
# need to handle the case where the diff is zero but the position of other is left or right of self
298+
# self - other
299+
if self.stop <= other.start:
300+
new_start = self.stop
301+
new_stop = other.start
302+
303+
# Case 2: self is to the right of other
304+
elif other.stop <= self.start:
305+
new_start = other.stop
306+
new_stop = self.start
307+
else:
308+
# not so obious what to do with the start and the stop for the same element
309+
if self.start == other.start and self.stop == other.stop:
310+
new_start = self.start
311+
new_stop = self.stop
312+
else:
313+
raise ValueError("Subtraction is not defined.")
314+
315+
# Create a new RegionCoordinate object with the updated start and stop values
316+
new_region = RegionCoordinate(
317+
region=Region(
318+
region_id=f"{self.region_id} - {other.region_id}",
319+
region_type="difference",
320+
name=f"{self.name} - {other.name}",
321+
sequence_type="diff",
322+
sequence="X",
323+
min_len=0,
324+
max_len=0,
325+
onlist=None,
326+
regions=None,
327+
), # Assuming the region meta-data stays the same
328+
start=new_start,
329+
stop=new_stop,
330+
)
331+
332+
# Set min_len and max_len
333+
new_region.min_len = abs(new_region.stop - new_region.start)
334+
new_region.max_len = abs(new_region.stop - new_region.start)
335+
new_region.sequence = "X" * new_region.min_len
336+
337+
return new_region
338+
339+
340+
class RegionCoordinateDifference:
341+
def __init__(
342+
self,
343+
obj: RegionCoordinate,
344+
fixed: RegionCoordinate,
345+
rgncdiff: RegionCoordinate,
346+
loc: str = "",
347+
):
348+
self.obj = obj
349+
self.fixed = fixed
350+
self.rgncdiff = rgncdiff
351+
self.loc = loc
352+
if obj.stop <= fixed.start:
353+
self.loc = "-"
354+
elif obj.start >= fixed.stop:
355+
self.loc = "+"
356+
357+
def __repr__(self) -> str:
358+
d = {
359+
"obj": self.obj,
360+
"fixed": self.fixed,
361+
"rgncdiff": self.rgncdiff,
362+
"loc": self.loc,
363+
}
364+
return f"{d}"
365+
303366

304367
def project_regions_to_coordinates(
305368
regions: List[Region], rcs: List[RegionCoordinate] = []
@@ -395,3 +458,31 @@ def to_dict(self):
395458

396459
def update_file_id(self, file_id):
397460
self.file_id = file_id
461+
462+
463+
def complement_nucleotide(nucleotide):
464+
complements = {
465+
"A": "T",
466+
"T": "A",
467+
"G": "C",
468+
"C": "G",
469+
"R": "Y",
470+
"Y": "R",
471+
"S": "S",
472+
"W": "W",
473+
"K": "M",
474+
"M": "K",
475+
"B": "V",
476+
"D": "H",
477+
"V": "B",
478+
"H": "D",
479+
"N": "N",
480+
"X": "X",
481+
}
482+
return complements.get(
483+
nucleotide, "N"
484+
) # Default to 'N' if nucleotide is not recognized
485+
486+
487+
def complement_sequence(sequence):
488+
return "".join(complement_nucleotide(n) for n in sequence.upper())

0 commit comments

Comments
 (0)