Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Issue 3659 fixed loading for MOL2 and PDB edge cases #3662

Open
wants to merge 11 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions package/AUTHORS
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,7 @@ Chronological list of authors
- Kavya Bisht
- Mark Verma
- Marcelo D. Poleto
- Zachary Smith
- Ricky Sexton
- Rafael R. Pappalardo
- Tengyu Xie
Expand Down
2 changes: 2 additions & 0 deletions package/CHANGELOG
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@ Fixes
RMSD of None
* Fixes hydrogenbonds tutorial path to point to hbonds (Issue #4285, PR #4286)
* Fix atom charge reading in PDBQT parser (Issue #4282, PR #4283)
* Universe keyword to correctly parse repeated resnames in PDB and MOL2
(Issue #3659, PR #3662)

Enhancements
* Add faster nucleic acid Major and Minor pair distance calculators using
Expand Down
23 changes: 19 additions & 4 deletions package/MDAnalysis/topology/MOL2Parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@

from . import guessers
from ..lib.util import openany
from .base import TopologyReaderBase, squash_by
from .base import TopologyReaderBase, squash_by, squash_by_attributes
from ..core.topologyattrs import (
Atomids,
Atomnames,
Expand Down Expand Up @@ -132,9 +132,15 @@ class MOL2Parser(TopologyReaderBase):
"""
format = 'MOL2'

def parse(self, **kwargs):
def parse(self, repeated_resids: bool = False, **kwargs):
"""Parse MOL2 file *filename* and return the dict `structure`.

Parameters
----------
repeated_resids: bool
Whether the same resid is used for multiple residues.
This option will perceive residues using the resid/resname pair.

Returns
-------
A MDAnalysis Topology object
Expand Down Expand Up @@ -252,8 +258,17 @@ def parse(self, **kwargs):
resnames = np.array(resnames, dtype=object)

if np.all(resnames):
residx, resids, (resnames,) = squash_by(
resids, resnames)
if repeated_resids:
(
residx,
(
resids,
resnames,
),
_,
) = squash_by_attributes((resids, resnames))
else:
residx, resids, (resnames,) = squash_by(resids, resnames)
n_residues = len(resids)
attrs.append(Resids(resids))
attrs.append(Resnums(resids.copy()))
Expand Down
57 changes: 48 additions & 9 deletions package/MDAnalysis/topology/PDBParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
from .guessers import guess_masses, guess_types
from .tables import SYMB2Z
from ..lib import util
from .base import TopologyReaderBase, change_squash
from .base import TopologyReaderBase, change_squash, squash_by_attributes
from ..core.topology import Topology
from ..core.topologyattrs import (
Atomnames,
Expand Down Expand Up @@ -200,14 +200,23 @@ class PDBParser(TopologyReaderBase):
"""
format = ['PDB', 'ENT']

def parse(self, **kwargs):
def parse(self, contiguous_resids: bool = True, **kwargs):
"""Parse atom information from PDB file

Parameters
----------
contiguous_resids : bool
Whether members of a residue are assumed to be adjacent atoms.
This assumption can be broken when atoms are added at the end
of a file, for example when adding hydrogens to the PDB.
When False, unique combinations of residue properties such as
resid, resname, and segname will be used to identify resudes.

Returns
-------
MDAnalysis Topology object
"""
top = self._parseatoms()
top = self._parseatoms(contiguous_resids)

try:
bonds = self._parsebonds(top.ids.values)
Expand All @@ -224,8 +233,19 @@ def parse(self, **kwargs):

return top

def _parseatoms(self):
"""Create the initial Topology object"""
def _parseatoms(self, contiguous_resids: bool):
"""Create the initial Topology object

Parameters
----------
contiguous_resids : bool
Whether members of a residue are assumed to be adjacent atoms.
This assumption can be broken when atoms are added at the end
of a file, for example when adding hydrogens to the PDB.
When False, unique combinations of residue properties such as
resid, resname, and segname will be used to identify resudes.
"""

resid_prev = 0 # resid looping hack

record_types = []
Expand Down Expand Up @@ -385,9 +405,25 @@ def _parseatoms(self):
icodes = np.array(icodes, dtype=object)
resnums = resids.copy()
segids = np.array(segids, dtype=object)

residx, (resids, resnames, icodes, resnums, segids) = change_squash(
(resids, resnames, icodes, segids), (resids, resnames, icodes, resnums, segids))
if contiguous_resids:
residx, (
resids,
resnames,
icodes,
resnums,
segids,
) = change_squash(
(resids, resnames, icodes, segids),
(resids, resnames, icodes, resnums, segids),
)
else:
(
residx,
(resids, resnames, icodes, segids),
(resnums,),
) = squash_by_attributes(
(resids, resnames, icodes, segids), resnums
)
n_residues = len(resids)
attrs.append(Resnums(resnums))
attrs.append(Resids(resids))
Expand All @@ -396,7 +432,10 @@ def _parseatoms(self):
attrs.append(Resnames(resnames))

if any(segids) and not any(val is None for val in segids):
segidx, (segids,) = change_squash((segids,), (segids,))
if contiguous_resids:
segidx, (segids,) = change_squash((segids,), (segids,))
else:
segidx, (segids,), _ = squash_by_attributes((segids,))
n_segments = len(segids)
attrs.append(Segids(segids))
else:
Expand Down
43 changes: 43 additions & 0 deletions package/MDAnalysis/topology/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

"""
from functools import reduce
from typing import List

import itertools
import numpy as np
Expand Down Expand Up @@ -142,6 +143,48 @@ def squash_by(child_parent_ids, *attributes):
return atom_idx, unique_resids, [attr[sort_mask] for attr in attributes]


def squash_by_attributes(
squash_attributes: List[np.ndarray], *other_attributes: np.ndarray
):
"""Squash a child-parent relationship using combinations of attributes
parents will retain their order of appearance

Parameters
----------
squash_attributes: list of numpy ndarray
list of attribute arrays (attributes used to identify the parent)
Their unique combinations will set child-parent relationships
*other_attributes: numpy ndarrays
other arrays that need to follow the sorting of ids

Returns
-------
sorted_atom_idx: numpy ndarray
an array of len(child) which points to the index of the parent
squashed_attrs: list of numpy ndarray
List of re-ordered squash_attributes
*other_attrs: list of numpy ndarray
List of re-ordered other_attributes
"""
squash_array = np.column_stack(squash_attributes).astype(str)
unique_combos, sort_mask, atom_idx = np.unique(
squash_array, return_index=True, return_inverse=True, axis=0
)

appearance_order = np.argsort(sort_mask)
new_index = np.arange(0, len(appearance_order))
resort_pairs = np.column_stack((appearance_order, new_index))
atom_idx_mapping = dict(resort_pairs)

sorted_atom_idx = np.vectorize(atom_idx_mapping.get)(atom_idx).astype(int)
sorted_mask = np.sort(sort_mask)

squashed_attrs = [attr[sorted_mask] for attr in squash_attributes]
other_attrs = [attr[sorted_mask] for attr in other_attributes]

return sorted_atom_idx, squashed_attrs, other_attrs


def change_squash(criteria, to_squash):
"""Squash per atom data to per residue according to changes in resid

Expand Down
30 changes: 30 additions & 0 deletions testsuite/MDAnalysisTests/topology/test_mol2.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,21 @@
"""


mol2_repeat_resid = """\
@<TRIPOS>MOLECULE
mol2_repeat_resid
3 0 0 0 0
SMALL
GASTEIGER

@<TRIPOS>ATOM
1 CA 65.9780 40.8660 -0.1830 C.3 1 LYS 0.0532
2 CA 64.3240 40.0060 3.1640 C.3 2 SER 0.1227
3 CA 64.0000 39.2360 8.6480 C.3 1 LEU 0.0996
4 N 66.1350 42.0990 -1.0070 N.4 1 LYS 0.2273
"""


class TestMOL2Base(ParserBase):
parser = mda.topology.MOL2Parser.MOL2Parser
expected_attrs = [
Expand Down Expand Up @@ -301,3 +316,18 @@ def test_unformat():
with pytest.raises(ValueError,
match='Some atoms in the mol2 file'):
u = mda.Universe(StringIO(mol2_resname_unformat), format='MOL2')


def test_repeat_resid():
"""
Test parsing re-used resids with different resnames
"""
u = mda.Universe(
StringIO(mol2_repeat_resid), format="MOL2", repeated_resids=True
)

expected_resnames = np.array(["LYS", "SER", "LEU"], dtype=object)
assert_equal(u.residues.resnames, expected_resnames)

expected_resids = np.array([1, 2, 1])
assert_equal(u.residues.resids, expected_resids)
23 changes: 23 additions & 0 deletions testsuite/MDAnalysisTests/topology/test_pdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,26 @@ def test_PDB_bad_charges(infile, entry):
with pytest.warns(UserWarning, match=wmsg):
u = mda.Universe(StringIO(infile), format='PDB')
assert not hasattr(u, 'formalcharges')


pdb_repeat_resid = """\
ATOM 1 CA LYS A 1 65.978 40.866 -0.183 1.00 0.00 C
ATOM 2 CA SER A 2 64.324 40.006 3.164 1.00 0.00 C
ATOM 3 CA LEU A 1 64.000 39.236 8.648 1.00 0.00 C
ATOM 4 N LYS A 1 66.135 42.099 -1.007 1.00 0.00 N
"""


def test_repeat_resid():
"""
Test parsing re-used resids with differeent resnames
"""
u = mda.Universe(
StringIO(pdb_repeat_resid), format="PDB", contiguous_resids=False
)

expected_resnames = np.array(["LYS", "SER", "LEU"], dtype=object)
assert_equal(u.residues.resnames, expected_resnames)

expected_resids = np.array([1, 2, 1])
assert_equal(u.residues.resids, expected_resids)
Loading