Skip to content

Commit

Permalink
feature: extract transcripts
Browse files Browse the repository at this point in the history
Closes #86

Adds `isoslam.extract_transcripts()` which pulls out transcripts from a `.bed` file.

- Replaced `.strip("_introns")` with `.replace("_intron", "")` due to [B005
strip-with-multiple-characters](https://docs.astral.sh/ruff/rules/strip-with-multi-characters/)

Questions/things to check

Further down the line, once I understand the code better, I hope to clarify:

- Does this need to be a dictionary?
- If so, would there be any benefit to making it a nested dictionary with names for `chromosome`, `start`, `end`,
  `transcript_id` and `bedstrand`?
  • Loading branch information
ns-rse committed Nov 29, 2024
1 parent edd7c28 commit 0737c50
Show file tree
Hide file tree
Showing 2 changed files with 79 additions and 0 deletions.
37 changes: 37 additions & 0 deletions isoslam/isoslam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
"""IsoSLAM module."""

from collections import defaultdict
from pathlib import Path
from typing import Any

from isoslam import io


def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
    """
    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.

    Each data line is split on tabs. The fourth field, with any ``_intron`` substring removed, is used as the
    transcript_id. Blank lines and BED header lines (starting with ``#``, ``track `` or ``browser ``) are skipped.

    Parameters
    ----------
    bed_file : str | Path
        Path, as string or pathlib Path, to a `.bed` file.

    Returns
    -------
    dict[Any, list[tuple[Any, int, int, Any, Any]]]
        Dictionary indexed by transcript_id. Each value is a list of
        ``(chromosome, start, end, transcript_id, bedstrand)`` tuples, one per data line, in file order. The object
        returned is a ``defaultdict(list)``, so looking up a transcript_id that is absent returns (and inserts) an
        empty list rather than raising ``KeyError``.

    Raises
    ------
    IndexError
        If a data line has fewer than six tab-separated fields.
    ValueError
        If the start or end field of a data line is not an integer.
    """
    coordinates: defaultdict[Any, list[tuple[Any, int, int, Any, Any]]] = defaultdict(list)
    for line in io.load_file(bed_file):
        stripped = line.strip()
        # Blank lines and header/comment lines carry no feature; indexing into them would raise.
        # Header keywords are matched with a trailing space so a tab-delimited data line is never skipped.
        if not stripped or stripped.startswith(("#", "track ", "browser ")):
            continue
        contents = stripped.split("\t")
        # `.replace()` rather than `.strip("_introns")`, which would strip any of those characters (ruff B005).
        transcript_id = contents[3].replace("_intron", "")
        coordinates[transcript_id].append(
            (
                contents[0],
                int(contents[1]),
                int(contents[2]),
                transcript_id,
                contents[5],
            )
        )
    return coordinates
42 changes: 42 additions & 0 deletions tests/test_isoslam.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
"""Tests for the isoslam module."""

from pathlib import Path
from typing import Any

import pytest # type: ignore[import-not-found]

from isoslam import isoslam

# Anchor paths to this file (tests/test_isoslam.py) rather than the current working directory, so the
# resources are found regardless of the directory pytest is invoked from.
BASE_DIR = Path(__file__).resolve().parents[1]
RESOURCES = BASE_DIR / "tests" / "resources"


@pytest.mark.parametrize(
    ("bed_file", "expected_transcript"),
    [
        pytest.param(  # type: ignore[misc]
            RESOURCES / "bed" / "test_coding_introns.bed",
            {
                "ENST00000442898": [
                    ("9", 14940, 15080, "ENST00000442898", "-"),
                    ("9", 15149, 15908, "ENST00000442898", "-"),
                    ("9", 16061, 16717, "ENST00000442898", "-"),
                    ("9", 16876, 16964, "ENST00000442898", "-"),
                    ("9", 17166, 17343, "ENST00000442898", "-"),
                    ("9", 17479, 17718, "ENST00000442898", "-"),
                    ("9", 17855, 18027, "ENST00000442898", "-"),
                    ("9", 18174, 18380, "ENST00000442898", "-"),
                    ("9", 18492, 24850, "ENST00000442898", "-"),
                    ("9", 25004, 29601, "ENST00000442898", "-"),
                ]
            },
            id="bed coding introns",
        ),
    ],
)
def test_isoslam_extract_transcripts(
    bed_file: str | Path,
    expected_transcript: dict[Any, list[tuple[Any, int, int, Any, Any]]],
) -> None:
    """
    Test extraction of transcript data from bed file using extract_transcripts().

    The whole returned mapping is compared with the expected one, so the transcript_id key, the number of
    entries, their order and every field of each tuple must match.
    """
    assert isoslam.extract_transcripts(bed_file) == expected_transcript

0 comments on commit 0737c50

Please sign in to comment.