-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpandascharm.py
executable file
·175 lines (150 loc) · 5.54 KB
/
pandascharm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pandas
__author__ = 'Markus Englund'
__license__ = 'MIT'
__version__ = '0.3.0'
def frame_as_categorical(frame, include_categories=None):
    """
    Return a pandas DataFrame with each column treated as a
    categorical with unordered categories. The same categories
    are applied to all columns.

    The shared categories are the non-null values found in `frame`
    (in order of first appearance, scanning row by row) followed by
    any entries of `include_categories` that are not already present.

    Parameters
    ----------
    frame : pandas.DataFrame
    include_categories : iterable (default: None)
        Categories to add unless they are already present
        in `frame`.

    Returns
    -------
    pandas.DataFrame
        A new frame in which every column is categorical and shares
        one set of categories.
    """
    # Compare against None explicitly: truth-testing an array-like
    # argument (numpy.ndarray, pandas.Index) raises ValueError.
    if include_categories is None:
        include_categories = []
    observed = pandas.unique(frame.values.ravel())
    # `pandas.unique` keeps order of appearance, so building the
    # category list from it (instead of from a set) makes the resulting
    # category order reproducible between interpreter runs.
    categories = list(observed[pandas.notnull(observed)])
    seen = set(categories)
    for category in include_categories:
        if category not in seen:
            seen.add(category)
            categories.append(category)
    return frame.apply(
        lambda column: column.astype('category').cat.set_categories(
            categories))
def frame_as_object(frame):
    """
    Return a copy of `frame` in which every column has been cast to
    NumPy `dtype` ``object``.

    Handy for turning a categorical frame back into plain values.
    """
    def _as_object(column):
        # Cast one column at a time, exactly as `DataFrame.apply` hands
        # them over.
        return column.astype('object')

    return frame.apply(_as_object)
def from_bioalignment(bioalignment, categorical=True):
    """
    Convert a BioPython alignment to a pandas DataFrame.

    Each record becomes one column, named after the record's ``id``
    and holding the characters of the record's ``seq``.

    Parameters
    ----------
    bioalignment : Bio.Align.MultipleSeqAlignment
    categorical : bool (default: True)
        If True, every column is created with dtype ``'category'``;
        otherwise with dtype ``'object'``.

    Returns
    -------
    pandas.DataFrame
        One column per record; an empty frame if `bioalignment`
        contains no records.
    """
    dtype = 'category' if categorical else 'object'
    columns = [
        pandas.Series(list(record.seq), name=record.id, dtype=dtype)
        for record in bioalignment]
    # `pandas.concat` rejects an empty list, so handle that case here.
    if not columns:
        return pandas.DataFrame()
    # Concatenate once: growing the frame inside the loop recopies all
    # previously added columns for every record.
    return pandas.concat(columns, axis=1)
def from_charmatrix(charmatrix, categorical=True):
    """
    Convert a DendroPy CharacterMatrix to a pandas DataFrame.

    Each (taxon, sequence) pair of the matrix becomes one column,
    named after the taxon label and holding the sequence symbols.

    Parameters
    ----------
    charmatrix : dendropy.CharacterMatrix
    categorical : bool (default: True)
        If True, the result will be returned as a categorical frame
        whose categories also include the symbols of the first state
        alphabet of `charmatrix`.

    Returns
    -------
    pandas.DataFrame
        One column per taxon.
    """
    columns = [
        pandas.Series(seq.symbols_as_list(), name=taxon.label)
        for taxon, seq in charmatrix.items()]
    # Concatenate once instead of growing the frame inside the loop,
    # which recopies all previous columns for every taxon.
    # `pandas.concat` rejects an empty list, hence the fallback.
    frame = pandas.concat(columns, axis=1) if columns else pandas.DataFrame()
    if not categorical:
        return frame
    state_alphabet = charmatrix.state_alphabets[0].symbols
    return frame_as_categorical(frame, include_categories=state_alphabet)
def from_sequence_dict(d, categorical=True):
    """
    Build a pandas DataFrame from a dict that maps names to sequence
    strings.

    Every key becomes a column and every character of the associated
    string becomes one row value of that column.

    Parameters
    ----------
    d : dict
    categorical : bool (default: True)
        If True, the result will be returned as a categorical frame.
    """
    columns = {}
    for name, sequence in d.items():
        # Split each sequence into its individual characters.
        columns[name] = list(sequence)
    frame = pandas.DataFrame(columns)
    if not categorical:
        return frame
    return frame_as_categorical(frame)
def to_bioalignment(frame, alphabet='generic_alphabet'):
    """
    Convert a pandas DataFrame to a BioPython MultipleSeqAlignment.

    Each column becomes one record: the column label is used as the
    record id and the column values are joined into the sequence.

    Parameters
    ----------
    frame : pandas.DataFrame
    alphabet : str, default: 'generic_alphabet'
        BioPython alphabet to use: 'generic_alphabet', 'generic_dna',
        'generic_nucleotide', 'generic_protein' or 'generic_rna'.
        The name is always validated, but it only affects the result
        when the installed BioPython still provides ``Bio.Alphabet``.

    Returns
    -------
    Bio.Align.MultipleSeqAlignment

    Raises
    ------
    ValueError
        If `alphabet` is not one of the names listed above.
    ImportError
        If BioPython cannot be imported.
    """
    valid_alphabets = (
        'generic_alphabet', 'generic_dna',
        'generic_nucleotide', 'generic_protein', 'generic_rna')
    if alphabet not in valid_alphabets:
        raise ValueError(
            'Invalid BioPython alphabet: {}'
            .format(alphabet))
    try:
        from Bio.Align import MultipleSeqAlignment
        from Bio.Seq import Seq
        from Bio.SeqRecord import SeqRecord
    except ImportError as ex:
        raise ImportError(
            '\'to_bioalignment\' requires BioPython.\n{ex}'.format(ex=str(ex)))
    # `Bio.Alphabet` was removed in Biopython 1.78 and importing it
    # there raises ImportError. Treat it as optional so the conversion
    # keeps working with current releases.
    try:
        import Bio.Alphabet
    except ImportError:
        bio_alphabet = None
    else:
        bio_alphabet = getattr(Bio.Alphabet, alphabet)
    alignment = MultipleSeqAlignment([])
    # `DataFrame.items` replaces `iteritems`, which pandas 2.0 removed.
    for name, seq_series in frame.items():
        sequence = ''.join(seq_series)
        if bio_alphabet is None:
            seq = Seq(sequence)
        else:
            seq = Seq(sequence, alphabet=bio_alphabet)
        alignment.append(SeqRecord(seq, id=name))
    return alignment
def to_charmatrix(frame, data_type):
    """
    Convert a pandas DataFrame to a DendroPy CharacterMatrix.

    Each column is joined into one sequence string, keyed by the
    column label, and the resulting dict is handed to the ``from_dict``
    constructor of the selected DendroPy matrix class.

    Parameters
    ----------
    frame : pandas.DataFrame
    data_type : str
        Type of CharacterMatrix to create: 'dna', 'rna',
        'protein' or 'standard'.

    Returns
    -------
    dendropy character matrix of the requested type

    Raises
    ------
    ValueError
        If `data_type` is not one of the supported names.
    ImportError
        If DendroPy cannot be imported.
    """
    matrix_class_names = {
        'standard': 'StandardCharacterMatrix',
        'dna': 'DnaCharacterMatrix',
        'rna': 'RnaCharacterMatrix',
        'protein': 'ProteinCharacterMatrix',
    }
    # Validate the argument before importing DendroPy or joining any
    # sequences, so a bad `data_type` is always reported as ValueError
    # (TypeError covers unhashable values).
    try:
        matrix_class_name = matrix_class_names[data_type]
    except (KeyError, TypeError):
        raise ValueError(
            '{} is not a valid data type'.format(repr(data_type)))
    try:
        import dendropy
    except ImportError as ex:
        raise ImportError(
            '\'to_charmatrix\' requires DendroPy.\n{ex}'.format(ex=str(ex)))
    d = frame.apply(lambda x: ''.join(x), axis=0).to_dict()
    charmatrix = getattr(dendropy, matrix_class_name).from_dict(d)
    # Preserve taxon name sort order
    taxon_names = list(frame.columns)
    charmatrix.taxon_namespace.sort(key=lambda x: taxon_names.index(x.label))
    return charmatrix
def to_sequence_dict(frame, into=dict):
    """
    Convert a pandas DataFrame to a mapping with sequences as strings.

    The values of each column are joined into a single string, keyed
    by the column label. `into` is forwarded to ``to_dict`` and selects
    the mapping type of the result.
    """
    def _join_column(column):
        return ''.join(column)

    joined = frame.apply(_join_column)
    return joined.to_dict(into=into)