From 517bc8d788a42543032e0d3ab8e73353d27164bd Mon Sep 17 00:00:00 2001 From: Daniel Ji Date: Thu, 15 Aug 2024 14:03:07 -0700 Subject: [PATCH] Read mdocfile data from a string (#28) --- README.md | 15 +++++++++++++-- docs/index.md | 21 ++++++++++++++++++--- src/mdocfile/__init__.py | 2 +- src/mdocfile/data_models.py | 33 ++++++++++++++++++++++++++++++++- src/mdocfile/functions.py | 17 +---------------- tests/conftest.py | 7 ++++++- tests/test_functions.py | 6 ++++++ 7 files changed, 77 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index ec2c00b..58996fd 100644 --- a/README.md +++ b/README.md @@ -31,8 +31,6 @@ df = mdocfile.read('my_mdoc_file.mdoc') For writing valid mdoc files, please see [writing mdoc files](https://teamtomo.org/mdocfile/writing/). - - # Installation pip: @@ -40,3 +38,16 @@ pip: ```shell pip install mdocfile ``` + +# Parsing from text + +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). + +```python +from mdocfile.data_models import Mdoc + +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +``` diff --git a/docs/index.md b/docs/index.md index 4f872f1..fefda90 100644 --- a/docs/index.md +++ b/docs/index.md @@ -29,14 +29,29 @@ import mdocfile df = mdocfile.read('my_mdoc_file.mdoc') ``` ---- - For writing valid mdoc files, please see [writing mdoc files](./writing.md). +--- + # Installation pip: ```shell pip install mdocfile -``` \ No newline at end of file +``` + +--- + +# Parsing from text + +`Mdoc.from_string().as_dataframe()` will return the contents of string mdoc data as a pandas dataframe. +This is useful for mdoc data that is not stored in a file (e.g. from a database or a web request). + +```python +from mdocfile.data_models import Mdoc + +mdoc_data = ... + +mdoc = Mdoc.from_string(mdoc_data).as_dataframe() +``` diff --git a/src/mdocfile/__init__.py b/src/mdocfile/__init__.py index 8acbf99..1cb2442 100644 --- a/src/mdocfile/__init__.py +++ b/src/mdocfile/__init__.py @@ -1 +1 @@ -from .functions import read +from .functions import read \ No newline at end of file diff --git a/src/mdocfile/data_models.py b/src/mdocfile/data_models.py index 0bd5b84..2e30eac 100644 --- a/src/mdocfile/data_models.py +++ b/src/mdocfile/data_models.py @@ -1,3 +1,4 @@ +import pandas as pd from pydantic import field_validator, BaseModel from pathlib import Path, PureWindowsPath from typing import List, Optional, Tuple, Union, Sequence @@ -170,7 +171,17 @@ class Mdoc(BaseModel): @classmethod def from_file(cls, filename: str): with open(filename) as file: - lines = [line.strip() for line in file.readlines()] + return cls.from_lines(file.readlines()) + + @classmethod + def from_string(cls, string: str): + lines = string.split('\n') + + return cls.from_lines(lines) + + @classmethod + def from_lines(cls, file_lines: List[str]) -> 'Mdoc': + lines = [line.strip() for line in file_lines] split_idxs = find_section_entries(lines) split_idxs.append(len(lines)) @@ -185,6 +196,26 @@ def from_file(cls, filename: str): in zip(split_idxs, split_idxs[1:]) ] return cls(titles=titles, global_data=global_data, section_data=section_data) + + def as_dataframe(self) -> pd.DataFrame: + """ + Convert an Mdoc object to a pandas DataFrame + """ + global_data = self.global_data.model_dump() + section_data = { + k: [section.model_dump()[k] for section in self.section_data] + for k + in self.section_data[0].model_dump().keys() + } + df = pd.DataFrame(data=section_data) + + # add duplicate copies of global data and mdoc file titles to each row of + # the dataframe - tidy data is easier to analyse + for k, v in global_data.items(): + df[k] = [v] * len(df) + df['titles'] = [self.titles] * len(df) + df = df.dropna(axis='columns', how='all') + return df def to_string(self): """ diff --git a/src/mdocfile/functions.py b/src/mdocfile/functions.py index 78036c7..3ec1e3e 100644 --- a/src/mdocfile/functions.py +++ b/src/mdocfile/functions.py @@ -18,19 +18,4 @@ def read(filename: PathLike) -> pd.DataFrame: df : pd.DataFrame dataframe containing info from mdoc file """ - mdoc = Mdoc.from_file(filename) - global_data = mdoc.global_data.model_dump() - section_data = { - k: [section.model_dump()[k] for section in mdoc.section_data] - for k - in mdoc.section_data[0].model_dump().keys() - } - df = pd.DataFrame(data=section_data) - - # add duplicate copies of global data and mdoc file titles to each row of - # the dataframe - tidy data is easier to analyse - for k, v in global_data.items(): - df[k] = [v] * len(df) - df['titles'] = [mdoc.titles] * len(df) - df = df.dropna(axis='columns', how='all') - return df + return Mdoc.from_file(filename).as_dataframe() diff --git a/tests/conftest.py b/tests/conftest.py index da19cee..4308e0f 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,16 @@ -import pytest from pathlib import Path +import pytest + @pytest.fixture def tilt_series_mdoc_file(): return Path(__file__).parent / 'test_data' / 'tilt_series.mdoc' +@pytest.fixture +def tilt_series_mdoc_string(): + with open(Path(__file__).parent / 'test_data' / 'tilt_series.mdoc') as f: + return f.read() @pytest.fixture def montage_section_mdoc_file(): diff --git a/tests/test_functions.py b/tests/test_functions.py index ec2b4f0..ced4c2c 100644 --- a/tests/test_functions.py +++ b/tests/test_functions.py @@ -1,6 +1,7 @@ import pandas as pd from mdocfile import read +from mdocfile.data_models import Mdoc def test_read_tilt_series_mdoc(tilt_series_mdoc_file): @@ -9,6 +10,11 @@ def test_read_tilt_series_mdoc(tilt_series_mdoc_file): assert df.shape == (41, 26) assert 'TiltAngle' in df.columns +def test_read_tilt_series_mdoc_string(tilt_series_mdoc_string): + df = Mdoc.from_string(tilt_series_mdoc_string).as_dataframe() + assert isinstance(df, pd.DataFrame) + assert df.shape == (41, 26) + assert 'TiltAngle' in df.columns def test_read_montage_section_mdoc(montage_section_mdoc_file): df = read(montage_section_mdoc_file)