Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add CLePAPS structural alphabet #681

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion doc/apidoc.json
Original file line number Diff line number Diff line change
Expand Up @@ -356,7 +356,6 @@
"set_component",
"list_assemblies",
"get_assembly"

],
"CIF format" : [
"CIFFile",
Expand All @@ -382,5 +381,17 @@
"StringArrayEncoding",
"TypeCode"
]
},
"biotite.structure.alphabet" : {
"Structural alphabets": [
"I3DSequence",
"ProteinBlocksAlphabet",
"ClepapsAlphabet"
],
"Conversion Function": [
"to_3di",
"to_protein_blocks",
"to_clepaps"
]
}
}
31 changes: 30 additions & 1 deletion doc/references.bib
Original file line number Diff line number Diff line change
Expand Up @@ -742,7 +742,7 @@ @article{Steele2021
eprint = {2001.05304},
primaryclass = {cs},
doi = {10.48550/arXiv.2001.05304},
archiveprefix = {arxiv}
archiveprefix = {arXiv}
}

@article{Steinegger2017,
Expand Down Expand Up @@ -838,6 +838,35 @@ @article{VanHerk1992
doi = {10.1016/0167-8655(92)90069-C}
}

@article{VanKempen2024,
title = {Fast and Accurate Protein Structure Search with {{Foldseek}}},
author = {{van Kempen}, Michel and Kim, Stephanie S. and Tumescheit, Charlotte and Mirdita, Milot and Lee, Jeongjae and Gilchrist, Cameron L. M. and Söding, Johannes and Steinegger, Martin},
year = {2024},
month = feb,
journal = {Nature Biotechnology},
volume = {42},
number = {2},
pages = {243--246},
publisher = {Nature Publishing Group},
issn = {1546-1696},
doi = {10.1038/s41587-023-01773-0}
}

@article{Wang2008,
title = {{{CLePAPS}}: {{FAST PAIR ALIGNMENT OF PROTEIN STRUCTURES BASED ON CONFORMATIONAL LETTERS}}},
shorttitle = {{{CLePAPS}}},
author = {Wang, Sheng and Zheng, Wei-Mou},
year = {2008},
month = apr,
journal = {Journal of Bioinformatics and Computational Biology},
volume = {06},
number = {02},
pages = {347--366},
publisher = {World Scientific Publishing Co.},
issn = {0219-7200},
doi = {10.1142/S0219720008003461}
}

@article{Westbrook2015,
title = {The Chemical Component Dictionary: Complete Descriptions of Constituent Molecules in Experimentally Determined {{3D}} Macromolecules in the {{Protein Data Bank}}},
shorttitle = {The Chemical Component Dictionary},
Expand Down
138 changes: 120 additions & 18 deletions src/biotite/sequence/align/matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

__all__ = ["SubstitutionMatrix"]
__name__ = "biotite.sequence.align"
__author__ = "Patrick Kunzmann"

import os
import functools
from pathlib import Path
import numpy as np
from biotite.sequence.seqtypes import NucleotideSequence, ProteinSequence

__all__ = ["SubstitutionMatrix"]
# Directory of matrix files
_DB_DIR = Path(__file__).parent / "matrix_data"


class SubstitutionMatrix(object):
Expand Down Expand Up @@ -59,6 +62,12 @@ class SubstitutionMatrix(object):
- **RBLOSUM<n>_<BLOCKS>**
- **CorBLOSUM<n>_<BLOCKS>**

- Structural alphabet substitution matrices

- **3Di** - For 3Di alphabet from ``foldseek`` :footcite:`VanKempen2024`
- **PB** - For Protein Blocks alphabet from *PBxplore* :footcite:`Barnoud2017`
- **CLESUM** - For CLePAPS alphabet :footcite:`Wang2008`

A list of all available matrix names is returned by
:meth:`list_db()`.

Expand Down Expand Up @@ -124,9 +133,6 @@ class SubstitutionMatrix(object):
>>> matrix = SubstitutionMatrix(alph, alph, "BLOSUM50")
"""

# Directory of matrix files
_db_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "matrix_data")

def __init__(self, alphabet1, alphabet2, score_matrix):
self._alph1 = alphabet1
self._alph2 = alphabet2
Expand Down Expand Up @@ -350,7 +356,7 @@ def dict_from_db(matrix_name):
matrix_dict : dict
A dictionary representing the substitution matrix.
"""
filename = SubstitutionMatrix._db_dir + os.sep + matrix_name + ".mat"
filename = _DB_DIR / f"{matrix_name}.mat"
with open(filename, "r") as f:
return SubstitutionMatrix.dict_from_str(f.read())

Expand All @@ -364,11 +370,10 @@ def list_db():
db_list : list
List of matrix names in the internal database.
"""
files = os.listdir(SubstitutionMatrix._db_dir)
# Remove '.mat' from files
return [file[:-4] for file in sorted(files)]
return [path.stem for path in _DB_DIR.glob("*.mat")]

@staticmethod
@functools.cache
def std_protein_matrix():
"""
Get the default :class:`SubstitutionMatrix` for protein sequence
Expand All @@ -379,9 +384,12 @@ def std_protein_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_blosum62
return SubstitutionMatrix(
ProteinSequence.alphabet, ProteinSequence.alphabet, "BLOSUM62"
)

@staticmethod
@functools.cache
def std_nucleotide_matrix():
"""
Get the default :class:`SubstitutionMatrix` for DNA sequence
Expand All @@ -392,13 +400,107 @@ def std_nucleotide_matrix():
matrix : SubstitutionMatrix
Default matrix.
"""
return _matrix_nuc
return SubstitutionMatrix(
NucleotideSequence.alphabet_amb, NucleotideSequence.alphabet_amb, "NUC"
)

@staticmethod
@functools.cache
def std_3di_matrix():
    """
    Get the default :class:`SubstitutionMatrix` for 3Di sequence
    alignments :footcite:`VanKempen2024`.

    The matrix is built over the 3Di alphabet for both sequences, using
    the matrix named ``3Di``.
    The result is memoized, so repeated calls return the same object.

    Returns
    -------
    matrix : SubstitutionMatrix
        Default matrix.

    References
    ----------

    .. footbibliography::

    """
    # Import inside function to avoid circular import
    from biotite.structure.alphabet.i3d import I3DSequence

    alphabet = I3DSequence.alphabet
    return SubstitutionMatrix(alphabet, alphabet, "3Di")

@staticmethod
@functools.cache
def std_protein_blocks_matrix(unknown_match=200, unkown_mismatch=-200):
    """
    Get the default :class:`SubstitutionMatrix` for Protein Blocks
    sequences.

    The matrix is adapted from *PBxplore* :footcite:`Barnoud2017`.
    The result is memoized per combination of argument values.

    Parameters
    ----------
    unknown_match, unkown_mismatch : int, optional
        The match and mismatch score for undefined symbols.
        The default values were chosen arbitrarily.

    Returns
    -------
    matrix : SubstitutionMatrix
        Default matrix.

    References
    ----------

    .. footbibliography::

    """
    # Imported lazily, presumably to avoid a circular import
    from biotite.structure.alphabet.pb import ProteinBlocksSequence

    pb_alphabet = ProteinBlocksSequence.alphabet
    undefined = ProteinBlocksSequence.unknown_symbol
    scores = SubstitutionMatrix.dict_from_db("PB")

    # Add match/mismatch scores for the undefined symbol:
    # pairing it with any defined symbol (in either order) is a mismatch,
    # pairing it with itself is a match
    defined_symbols = [sym for sym in pb_alphabet if sym != undefined]
    for sym in defined_symbols:
        scores[sym, undefined] = unkown_mismatch
        scores[undefined, sym] = unkown_mismatch
    scores[undefined, undefined] = unknown_match

    return SubstitutionMatrix(pb_alphabet, pb_alphabet, scores)

@staticmethod
@functools.cache
def std_clepaps_matrix(unknown_match=200, unkown_mismatch=-200):
    """
    Get the default :class:`SubstitutionMatrix` for *CLePAPS* sequences
    :footcite:`Wang2008`.

    The scores are taken from the ``CLESUM`` matrix and extended by
    scores for the undefined symbol.
    The result is memoized per combination of argument values.

    Parameters
    ----------
    unknown_match, unkown_mismatch : int, optional
        The match and mismatch score for undefined symbols.
        The default values were chosen arbitrarily.

    Returns
    -------
    matrix : SubstitutionMatrix
        Default matrix.

    References
    ----------

    .. footbibliography::

    """
    # NOTE(review): the alphabet and undefined symbol are taken from the
    # Protein Blocks sequence class, not from a CLePAPS-specific class.
    # This looks like a copy-paste from `std_protein_blocks_matrix()`;
    # presumably the sequence class from
    # `biotite.structure.alphabet.clepaps` is intended - confirm and fix.
    from biotite.structure.alphabet.pb import ProteinBlocksSequence

    seq_alphabet = ProteinBlocksSequence.alphabet
    undefined = ProteinBlocksSequence.unknown_symbol
    scores = SubstitutionMatrix.dict_from_db("CLESUM")

    # Add match/mismatch scores for undefined symbols
    defined_symbols = [sym for sym in seq_alphabet if sym != undefined]
    for sym in defined_symbols:
        scores[sym, undefined] = unkown_mismatch
        scores[undefined, sym] = unkown_mismatch
    scores[undefined, undefined] = unknown_match

    return SubstitutionMatrix(seq_alphabet, seq_alphabet, scores)
25 changes: 25 additions & 0 deletions src/biotite/sequence/align/matrix_data/3Di.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# 3Di bit/2
# Background (precomputed optional): 0.0489372 0.0306991 0.101049 0.0329671 0.0276149 0.0416262 0.0452521 0.030876 0.0297251 0.0607036 0.0150238 0.0215826 0.0783843 0.0512926 0.0264886 0.0610702 0.0201311 0.215998 0.0310265 0.0295417 0.00001
# Lambda (precomputed optional): 0.351568
A C D E F G H I K L M N P Q R S T V W Y X
A 6 -3 1 2 3 -2 -2 -7 -3 -3 -10 -5 -1 1 -4 -7 -5 -6 0 -2 0
C -3 6 -2 -8 -5 -4 -4 -12 -13 1 -14 0 0 1 -1 0 -8 1 -7 -9 0
D 1 -2 4 -3 0 1 1 -3 -5 -4 -5 -2 1 -1 -1 -4 -2 -3 -2 -2 0
E 2 -8 -3 9 -2 -7 -4 -12 -10 -7 -17 -8 -6 -3 -8 -10 -10 -13 -6 -3 0
F 3 -5 0 -2 7 -3 -3 -5 1 -3 -9 -5 -2 2 -5 -8 -3 -7 4 -4 0
G -2 -4 1 -7 -3 6 3 0 -7 -7 -1 -2 -2 -4 3 -3 4 -6 -4 -2 0
H -2 -4 1 -4 -3 3 6 -4 -7 -6 -6 0 -1 -3 1 -3 -1 -5 -5 3 0
I -7 -12 -3 -12 -5 0 -4 8 -5 -11 7 -7 -6 -6 -3 -9 6 -12 -5 -8 0
K -3 -13 -5 -10 1 -7 -7 -5 9 -11 -8 -12 -6 -5 -9 -14 -5 -15 5 -8 0
L -3 1 -4 -7 -3 -7 -6 -11 -11 6 -16 -3 -2 2 -4 -4 -9 0 -8 -9 0
M -10 -14 -5 -17 -9 -1 -6 7 -8 -16 10 -9 -9 -10 -5 -10 3 -16 -6 -9 0
N -5 0 -2 -8 -5 -2 0 -7 -12 -3 -9 7 0 -2 2 3 -4 0 -8 -5 0
P -1 0 1 -6 -2 -2 -1 -6 -6 -2 -9 0 4 0 0 -2 -4 0 -4 -5 0
Q 1 1 -1 -3 2 -4 -3 -6 -5 2 -10 -2 0 5 -2 -4 -5 -1 -2 -5 0
R -4 -1 -1 -8 -5 3 1 -3 -9 -4 -5 2 0 -2 6 2 0 -1 -6 -3 0
S -7 0 -4 -10 -8 -3 -3 -9 -14 -4 -10 3 -2 -4 2 6 -6 0 -11 -9 0
T -5 -8 -2 -10 -3 4 -1 6 -5 -9 3 -4 -4 -5 0 -6 8 -9 -5 -5 0
V -6 1 -3 -13 -7 -6 -5 -12 -15 0 -16 0 0 -1 -1 0 -9 3 -10 -11 0
W 0 -7 -2 -6 4 -4 -5 -5 5 -8 -6 -8 -4 -2 -6 -11 -5 -10 8 -6 0
Y -2 -9 -2 -3 -4 -2 3 -8 -8 -9 -9 -5 -5 -5 -3 -9 -5 -11 -6 9 0
X 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
21 changes: 21 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.license
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
The MIT License (MIT)

Copyright (c) 2013 Poulain, A. G. de Brevern

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
18 changes: 18 additions & 0 deletions src/biotite/sequence/align/matrix_data/PB.mat
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# PB substitution matrix, adapted from PBxplore
a b c d e f g h i j k l m n o p
a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
14 changes: 14 additions & 0 deletions src/biotite/structure/alphabet/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# This source code is part of the Biotite package and is distributed
# under the 3-Clause BSD License. Please see 'LICENSE.rst' for further
# information.

"""
A subpackage for converting structures to structural alphabet sequences.
"""

__name__ = "biotite.structure.alphabet"
__author__ = "Martin Larralde, Patrick Kunzmann"

from .clepaps import *
from .i3d import *
from .pb import *
Loading
Loading