@@ -286,34 +286,59 @@ def download_annotation_data() -> Tuple[str, str]:
286286 )
287287 return gencode , encode
288288
def _bgzip_ann_data(ann_fname: str) -> str:
    """
    Compress an annotation file using bgzip and verify the output.

    This function runs the bgzip utility on the specified annotation file
    (overwriting any existing compressed file via ``-f``), checks that the
    compressed file exists, and returns its path.

    Args:
        ann_fname (str): The path to the annotation file to be compressed.

    Returns:
        str: The path to the compressed annotation file (``ann_fname`` with a
            ``.gz`` suffix appended).

    Raises:
        subprocess.SubprocessError: If bgzip cannot be started, exits with a
            non-zero status, or does not produce the expected output file.
    """

    try:
        # Pass an argument list (no shell) so paths containing spaces or shell
        # metacharacters are handled verbatim; check=True turns a non-zero
        # exit status into CalledProcessError instead of ignoring it.
        subprocess.run(["bgzip", "-f", ann_fname], check=True)
    except (OSError, subprocess.SubprocessError) as e:
        # OSError covers a missing or non-executable bgzip binary.
        raise subprocess.SubprocessError(
            f"Bgzip compression failed on {ann_fname}"
        ) from e
    ann_fname_gz = f"{ann_fname}.gz"
    # Explicit check rather than `assert`, which is stripped under `python -O`.
    if not os.path.isfile(ann_fname_gz):
        raise subprocess.SubprocessError(
            f"Bgzip compression failed on {ann_fname}"
        )
    return ann_fname_gz
314+
315+
289316
def _retrieve_ann_data(annotation_dir: str, url: str, fname: str) -> str:
    """
    Fetch an annotation archive, unpack it, and bgzip the requested file.

    The archive is downloaded into the annotation directory and its checksum
    is compared against the value recorded in MD5ANNOTATION for that archive
    name. On a match the archive is untarred, and the requested file inside
    the extraction location is compressed with bgzip.

    Args:
        annotation_dir (str): The directory used for the download and the
            extraction.
        url (str): The archive location, joined onto TESTDATAURL.
        fname (str): The name of the extracted file to compress.

    Returns:
        str: The path to the bgzip-compressed annotation file.

    Raises:
        ValueError: If the checksum of the downloaded archive does not match
            the expected value.
    """

    # fetch the archive into the annotation directory
    archive_url = os.path.join(TESTDATAURL, url)
    archive_path = download(annotation_dir, http_url=archive_url)
    archive_name = os.path.basename(archive_path)
    # compare the recorded checksum with the one computed on the download
    expected_md5 = MD5ANNOTATION[archive_name]
    if expected_md5 != compute_md5(archive_path):
        raise ValueError(f"Download for {archive_name} failed")
    # unpack, then compress the requested annotation file
    extracted_location = untar(archive_path, annotation_dir)
    return _bgzip_ann_data(os.path.join(extracted_location, fname))
317342
318343
319344def ensure_pams_directory (dest : str ) -> str :
0 commit comments