feature: extract transcripts

Closes #86. Adds `isoslam.extract_transcripts()`, which pulls out transcripts from a `.bed` file. - Replaced `.strip("_introns")` due to [B005 strip-with-multi-characters](https://docs.astral.sh/ruff/rules/strip-with-multi-characters/). Questions/things to check (I hope to clarify these further down the line, once I understand the code better): - Does this need to be a dictionary? - If so, would there be any benefit to making it a nested dictionary with names for `chromosome`, `start`, `end`, `transcript_id` and `bedstrand`?
sudlab · Nov 29, 2024 · 0737c50 · 0737c50
1 parent edd7c28
commit 0737c50
Show file tree

Hide file tree

Showing 2 changed files with 79 additions and 0 deletions.
diff --git a/isoslam/isoslam.py b/isoslam/isoslam.py
@@ -0,0 +1,37 @@
+"""IsoSLAM module."""
+
+from collections import defaultdict
+from pathlib import Path
+from typing import Any
+
+from isoslam import io
+
+
+def extract_transcripts(bed_file: str | Path) -> dict[Any, list[tuple[Any, int, int, Any, Any]]]:
+    """
+    Extract features from `.bed` file and return as a dictionary indexed by transcript_id.
+
+    Parameters
+    ----------
+    bed_file : str | Path
+        Path, as string or pathlib Path, to a `.bed` file.
+
+    Returns
+    -------
+    dict[Any, list[tuple[Any, int, int, Any, Any]]]
+        Lists of (chromosome, start, end, transcript_id, bedstrand) tuples, indexed by transcript_id.
+    """
+    coordinates = defaultdict(list)  # groups rows by transcript_id without pre-creating keys
+    for line in io.load_file(bed_file):
+        contents = line.strip().split("\t")  # trim surrounding whitespace, then split into tab-separated fields
+        transcript_id = contents[3].replace("_intron", "")  # fourth field with every "_intron" removed
+        coordinates[transcript_id].append(
+            (
+                contents[0],
+                int(contents[1]),
+                int(contents[2]),
+                transcript_id,
+                contents[5],  # sixth field; the fifth field (index 4) is not used
+            )
+        )
+    return coordinates  # still a defaultdict: looking up a missing key inserts an empty list
diff --git a/tests/test_isoslam.py b/tests/test_isoslam.py
@@ -0,0 +1,42 @@
+"""Tests for the isoslam module."""
+
+from pathlib import Path
+from typing import Any
+
+import pytest  # type: ignore[import-not-found]
+
+from isoslam import isoslam
+
+BASE_DIR = Path.cwd()  # current working directory at import time, not this file's location
+RESOURCES = BASE_DIR / "tests" / "resources"  # resolves only when run from the directory containing tests/
+
+
+@pytest.mark.parametrize(
+    ("bed_file", "expected_transcript"),
+    [
+        pytest.param(  # type: ignore[misc]
+            RESOURCES / "bed" / "test_coding_introns.bed",
+            {  # expected mapping: transcript_id -> list of (chromosome, start, end, transcript_id, bedstrand)
+                "ENST00000442898": [
+                    ("9", 14940, 15080, "ENST00000442898", "-"),
+                    ("9", 15149, 15908, "ENST00000442898", "-"),
+                    ("9", 16061, 16717, "ENST00000442898", "-"),
+                    ("9", 16876, 16964, "ENST00000442898", "-"),
+                    ("9", 17166, 17343, "ENST00000442898", "-"),
+                    ("9", 17479, 17718, "ENST00000442898", "-"),
+                    ("9", 17855, 18027, "ENST00000442898", "-"),
+                    ("9", 18174, 18380, "ENST00000442898", "-"),
+                    ("9", 18492, 24850, "ENST00000442898", "-"),
+                    ("9", 25004, 29601, "ENST00000442898", "-"),
+                ]
+            },
+            id="bed coding introons",  # NOTE(review): "introons" looks like a typo for "introns"
+        ),
+    ],
+)
+def test_isoslam_extract_transcripts(
+    bed_file: str | Path,
+    expected_transcript: dict[Any, list[tuple[Any, int, int, Any, Any]]],
+) -> None:
+    """Test extraction of transcript data from bed file using extract_transcripts()."""
+    assert isoslam.extract_transcripts(bed_file) == expected_transcript