-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Closes #86. Adds `isoslam.extract_transcripts()`, which pulls out transcripts from a `.bed` file. - Replaced `.strip("_introns")` due to [B005 strip-with-multi-characters](https://docs.astral.sh/ruff/rules/strip-with-multi-characters/). Questions/things to check (further down the line, once I understand more, I hope to clarify these): - Does this need to be a dictionary? - If so, would there be any benefit to making it a nested dictionary with names for `chromosome`, `start`, `end`, `transcript_id` and `bedstrand`?
- Loading branch information
Showing
2 changed files
with
79 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
"""IsoSLAM module.""" | ||
|
||
from collections import defaultdict | ||
from pathlib import Path | ||
from typing import Any | ||
|
||
from isoslam import io | ||
|
||
|
||
def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
    """
    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.

    Every data line is split on tabs. The transcript identifier is taken from the fourth column with any
    ``"_intron"`` substring removed. Blank lines and BED header lines (``#`` comments, ``track`` and ``browser``
    lines) carry no feature and are skipped rather than raising an error.

    Parameters
    ----------
    bed_file : str | Path
        Path, as string or pathlib Path, to a `.bed` file.

    Returns
    -------
    dict[Any, list[tuple[Any, int, int, Any, Any]]]
        Dictionary indexed by transcript_id. Each value is a list, in file order, of
        ``(chromosome, start, end, transcript_id, bedstrand)`` tuples. The returned object is a
        ``defaultdict(list)``, so indexing it with an unknown key inserts an empty list.

    Raises
    ------
    IndexError
        If a data line has fewer than six tab-separated columns.
    ValueError
        If the start or end column of a data line is not an integer.
    """
    coordinates: defaultdict[Any, list[tuple[Any, int, int, Any, Any]]] = defaultdict(list)
    # NOTE(review): if io.load_file() hands back an open file handle it is never closed here - confirm.
    for line in io.load_file(bed_file):
        record = line.strip()
        # A trailing newline or an empty line would otherwise fail on the column lookups below.
        if not record:
            continue
        # BED files may start with comment, "track" or "browser" header lines that are not features.
        if record.startswith("#") or record.split(maxsplit=1)[0] in ("track", "browser"):
            continue
        contents = record.split("\t")
        transcript_id = contents[3].replace("_intron", "")
        coordinates[transcript_id].append(
            (
                contents[0],
                int(contents[1]),
                int(contents[2]),
                transcript_id,
                # Column five (index 4, the BED score) is intentionally not part of the tuple.
                contents[5],
            )
        )
    return coordinates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
"""Tests for the isoslam module.""" | ||
|
||
from pathlib import Path | ||
from typing import Any | ||
|
||
import pytest # type: ignore[import-not-found] | ||
|
||
from isoslam import isoslam | ||
|
||
# Paths are resolved against the current working directory, so the test resources are only found when the
# process is started from the directory that contains "tests".
BASE_DIR = Path.cwd()
RESOURCES = BASE_DIR.joinpath("tests", "resources")
|
||
|
||
@pytest.mark.parametrize(
    ("bed_file", "expected_transcript"),
    [
        pytest.param(  # type: ignore[misc]
            RESOURCES / "bed" / "test_coding_introns.bed",
            {
                # All expected features share chromosome, transcript id and strand; only the
                # (start, end) coordinates differ, so the tuples are generated from those pairs.
                "ENST00000442898": [
                    ("9", start, end, "ENST00000442898", "-")
                    for start, end in (
                        (14940, 15080),
                        (15149, 15908),
                        (16061, 16717),
                        (16876, 16964),
                        (17166, 17343),
                        (17479, 17718),
                        (17855, 18027),
                        (18174, 18380),
                        (18492, 24850),
                        (25004, 29601),
                    )
                ]
            },
            id="bed coding introons",
        ),
    ],
)
def test_isoslam_extract_transcripts(
    bed_file: str | Path,
    expected_transcript: dict[Any, list[tuple[Any, int, int, Any, Any]]],
) -> None:
    """Test extraction of transcript data from bed file using extract_transcripts()."""
    extracted = isoslam.extract_transcripts(bed_file)
    assert extracted == expected_transcript