MDAnalysis · zwsmith200 · May 7, 2022 · May 7, 2022 · May 9, 2022 · May 9, 2022
diff --git a/package/AUTHORS b/package/AUTHORS
@@ -193,6 +193,7 @@ Chronological list of authors
   - Kavya Bisht
   - Mark Verma
   - Marcelo D. Poleto
+  - Zachary Smith
   - Ricky Sexton
   - Rafael R. Pappalardo
   - Tengyu Xie

diff --git a/package/CHANGELOG b/package/CHANGELOG
@@ -28,6 +28,8 @@ Fixes
     RMSD of None
   * Fixes hydrogenbonds tutorial path to point to hbonds (Issue #4285, PR #4286)
   * Fix atom charge reading in PDBQT parser (Issue #4282, PR #4283)
+  * Universe keyword to correctly parse repeated resnames in PDB and MOL2
+    (Issue #3659, PR #3662)
 
 Enhancements
   * Add faster nucleic acid Major and Minor pair distance calculators using

diff --git a/package/MDAnalysis/topology/MOL2Parser.py b/package/MDAnalysis/topology/MOL2Parser.py
@@ -46,7 +46,7 @@
 
 from . import guessers
 from ..lib.util import openany
-from .base import TopologyReaderBase, squash_by
+from .base import TopologyReaderBase, squash_by, squash_by_attributes
 from ..core.topologyattrs import (
     Atomids,
     Atomnames,
@@ -132,9 +132,15 @@ class MOL2Parser(TopologyReaderBase):
     """
     format = 'MOL2'
 
-    def parse(self, **kwargs):
+    def parse(self, repeated_resids: bool = False, **kwargs):
         """Parse MOL2 file *filename* and return the dict `structure`.
 
+        Parameters
+        ----------
+        repeated_resids: bool
+          Whether the same resid is used for multiple residues.
+          This option will perceive residues using the resid/resname pair.
+
         Returns
         -------
         A MDAnalysis Topology object
@@ -252,8 +258,17 @@ def parse(self, **kwargs):
         resnames = np.array(resnames, dtype=object)
 
         if np.all(resnames):
-            residx, resids, (resnames,) = squash_by(
-                resids, resnames)
+            if repeated_resids:
+                (
+                    residx,
+                    (
+                        resids,
+                        resnames,
+                    ),
+                    _,
+                ) = squash_by_attributes((resids, resnames))
+            else:
+                residx, resids, (resnames,) = squash_by(resids, resnames)
             n_residues = len(resids)
             attrs.append(Resids(resids))
             attrs.append(Resnums(resids.copy()))

diff --git a/package/MDAnalysis/topology/PDBParser.py b/package/MDAnalysis/topology/PDBParser.py
@@ -64,7 +64,7 @@
 from .guessers import guess_masses, guess_types
 from .tables import SYMB2Z
 from ..lib import util
-from .base import TopologyReaderBase, change_squash
+from .base import TopologyReaderBase, change_squash, squash_by_attributes
 from ..core.topology import Topology
 from ..core.topologyattrs import (
     Atomnames,
@@ -200,14 +200,23 @@ class PDBParser(TopologyReaderBase):
     """
     format = ['PDB', 'ENT']
 
-    def parse(self, **kwargs):
+    def parse(self, contiguous_resids: bool = True, **kwargs):
         """Parse atom information from PDB file
 
+        Parameters
+        ----------
+        contiguous_resids : bool
+          Whether members of a residue are assumed to be adjacent atoms.
+          This assumption can be broken when atoms are added at the end
+          of a file, for example when adding hydrogens to the PDB.
+          When False, unique combinations of residue properties such as
+          resid, resname, and segname will be used to identify resudes.
+
         Returns
         -------
         MDAnalysis Topology object
         """
-        top = self._parseatoms()
+        top = self._parseatoms(contiguous_resids)
 
         try:
             bonds = self._parsebonds(top.ids.values)
@@ -224,8 +233,19 @@ def parse(self, **kwargs):
 
         return top
 
-    def _parseatoms(self):
-        """Create the initial Topology object"""
+    def _parseatoms(self, contiguous_resids: bool):
+        """Create the initial Topology object
+
+        Parameters
+        ----------
+        contiguous_resids : bool
+          Whether members of a residue are assumed to be adjacent atoms.
+          This assumption can be broken when atoms are added at the end
+          of a file, for example when adding hydrogens to the PDB.
+          When False, unique combinations of residue properties such as
+          resid, resname, and segname will be used to identify resudes.
+        """
+
         resid_prev = 0  # resid looping hack
 
         record_types = []
@@ -385,9 +405,25 @@ def _parseatoms(self):
         icodes = np.array(icodes, dtype=object)
         resnums = resids.copy()
         segids = np.array(segids, dtype=object)
-
-        residx, (resids, resnames, icodes, resnums, segids) = change_squash(
-            (resids, resnames, icodes, segids), (resids, resnames, icodes, resnums, segids))
+        if contiguous_resids:
+            residx, (
+                resids,
+                resnames,
+                icodes,
+                resnums,
+                segids,
+            ) = change_squash(
+                (resids, resnames, icodes, segids),
+                (resids, resnames, icodes, resnums, segids),
+            )
+        else:
+            (
+                residx,
+                (resids, resnames, icodes, segids),
+                (resnums,),
+            ) = squash_by_attributes(
+                (resids, resnames, icodes, segids), resnums
+            )
         n_residues = len(resids)
         attrs.append(Resnums(resnums))
         attrs.append(Resids(resids))
@@ -396,7 +432,10 @@ def _parseatoms(self):
         attrs.append(Resnames(resnames))
 
         if any(segids) and not any(val is None for val in segids):
-            segidx, (segids,) = change_squash((segids,), (segids,))
+            if contiguous_resids:
+                segidx, (segids,) = change_squash((segids,), (segids,))
+            else:
+                segidx, (segids,), _ = squash_by_attributes((segids,))
             n_segments = len(segids)
             attrs.append(Segids(segids))
         else:

diff --git a/package/MDAnalysis/topology/base.py b/package/MDAnalysis/topology/base.py
@@ -37,6 +37,7 @@
 
 """
 from functools import reduce
+from typing import List
 
 import itertools
 import numpy as np
@@ -142,6 +143,48 @@ def squash_by(child_parent_ids, *attributes):
     return atom_idx, unique_resids, [attr[sort_mask] for attr in attributes]
 
 
+def squash_by_attributes(
+    squash_attributes: List[np.ndarray], *other_attributes: np.ndarray
+):
+    """Squash a child-parent relationship using combinations of attributes
+    parents will retain their order of appearance
+
+    Parameters
+    ----------
+    squash_attributes: list of numpy ndarray
+      list of attribute arrays (attributes used to identify the parent)
+      Their unique combinations will set child-parent relationships
+    *other_attributes: numpy ndarrays
+      other arrays that need to follow the sorting of ids
+
+    Returns
+    -------
+    sorted_atom_idx: numpy ndarray
+      an array of len(child) which points to the index of the parent
+    squashed_attrs: list of numpy ndarray
+      List of re-ordered squash_attributes
+    *other_attrs: list of numpy ndarray
+      List of re-ordered other_attributes
+    """
+    squash_array = np.column_stack(squash_attributes).astype(str)
+    unique_combos, sort_mask, atom_idx = np.unique(
+        squash_array, return_index=True, return_inverse=True, axis=0
+    )
+
+    appearance_order = np.argsort(sort_mask)
+    new_index = np.arange(0, len(appearance_order))
+    resort_pairs = np.column_stack((appearance_order, new_index))
+    atom_idx_mapping = dict(resort_pairs)
+
+    sorted_atom_idx = np.vectorize(atom_idx_mapping.get)(atom_idx).astype(int)
+    sorted_mask = np.sort(sort_mask)
+
+    squashed_attrs = [attr[sorted_mask] for attr in squash_attributes]
+    other_attrs = [attr[sorted_mask] for attr in other_attributes]
+
+    return sorted_atom_idx, squashed_attrs, other_attrs
+
+
 def change_squash(criteria, to_squash):
     """Squash per atom data to per residue according to changes in resid
 

diff --git a/testsuite/MDAnalysisTests/topology/test_mol2.py b/testsuite/MDAnalysisTests/topology/test_mol2.py
@@ -172,6 +172,21 @@
 """
 
 
+mol2_repeat_resid = """\
+@<TRIPOS>MOLECULE
+mol2_repeat_resid
+ 3 0 0 0 0
+SMALL
+GASTEIGER
+
+@<TRIPOS>ATOM
+    1  CA        65.9780   40.8660   -0.1830 C.3     1  LYS        0.0532
+    2  CA        64.3240   40.0060    3.1640 C.3     2  SER        0.1227
+    3  CA        64.0000   39.2360    8.6480 C.3     1  LEU        0.0996
+    4  N         66.1350   42.0990   -1.0070 N.4     1  LYS        0.2273
+"""
+
+
 class TestMOL2Base(ParserBase):
     parser = mda.topology.MOL2Parser.MOL2Parser
     expected_attrs = [
@@ -301,3 +316,18 @@ def test_unformat():
     with pytest.raises(ValueError,
                        match='Some atoms in the mol2 file'):
         u = mda.Universe(StringIO(mol2_resname_unformat), format='MOL2')
+
+
+def test_repeat_resid():
+    """
+    Test parsing re-used resids with different resnames
+    """
+    u = mda.Universe(
+        StringIO(mol2_repeat_resid), format="MOL2", repeated_resids=True
+    )
+
+    expected_resnames = np.array(["LYS", "SER", "LEU"], dtype=object)
+    assert_equal(u.residues.resnames, expected_resnames)
+
+    expected_resids = np.array([1, 2, 1])
+    assert_equal(u.residues.resids, expected_resids)
diff --git a/testsuite/MDAnalysisTests/topology/test_pdb.py b/testsuite/MDAnalysisTests/topology/test_pdb.py
@@ -371,3 +371,26 @@ def test_PDB_bad_charges(infile, entry):
     with pytest.warns(UserWarning, match=wmsg):
         u = mda.Universe(StringIO(infile), format='PDB')
         assert not hasattr(u, 'formalcharges')
+
+
+pdb_repeat_resid = """\
+ATOM      1  CA  LYS A   1      65.978  40.866  -0.183  1.00  0.00           C
+ATOM      2  CA  SER A   2      64.324  40.006   3.164  1.00  0.00           C
+ATOM      3  CA  LEU A   1      64.000  39.236   8.648  1.00  0.00           C
+ATOM      4  N   LYS A   1      66.135  42.099  -1.007  1.00  0.00           N
+"""
+
+
+def test_repeat_resid():
+    """
+    Test parsing re-used resids with differeent resnames
+    """
+    u = mda.Universe(
+        StringIO(pdb_repeat_resid), format="PDB", contiguous_resids=False
+    )
+
+    expected_resnames = np.array(["LYS", "SER", "LEU"], dtype=object)
+    assert_equal(u.residues.resnames, expected_resnames)
+
+    expected_resids = np.array([1, 2, 1])
+    assert_equal(u.residues.resids, expected_resids)