Source code for domhmm.analysis.base

"""
DomHMM --- :mod:`domhmm.analysis.LeafletAnalysisBase`
===========================================================

This module contains the :class:`LeafletAnalysisBase` class.

"""

# ----PYTHON---- #
from typing import Union, Dict, Any

import numpy as np
import sklearn
from MDAnalysis.analysis import distances
# ----MDANALYSIS---- #
from MDAnalysis.analysis.base import AnalysisBase
from MDAnalysis.analysis.leaflet import LeafletFinder
# if TYPE_CHECKING:
from MDAnalysis.core.universe import Universe, AtomGroup



[docs]
class LeafletAnalysisBase(AnalysisBase):
    """
    LeafletAnalysisBase class.

    This class marks the start point for each analysis method. 

    It connects to the MDAnalysis core library, setups the MDAnalysis universe, and provides coordinates for each
    membrane leaflet.

    Parameters
    ----------
    universe_or_atomgroup: :class:`~MDAnalysis.core.universe.Universe` or :class:`~MDAnalysis.core.groups.AtomGroup`
        Universe or group of atoms to apply this analysis to.
        If a trajectory is associated with the atoms,
        then the computation iterates over the trajectory.
    membrane_select: str
        Membrane selection query for analysis of simulation trajectories.
    gmm_kwargs: Optional[Dict]
        Optional parameter for Gaussian mixture model function.
    hmm_kwargs: Optional[Dict]
        Optional parameter for Hidden Markov model function.
    leaflet_kwargs: Optional[dict]
        dictionary containing additional arguments for the MDAnalysis LeafletFinder
    leaflet_select: Union["auto", List[AtomGroup], List[str]]
        Leaflet selection options for lipids which can be automatic by finding Leafletfinder, atomgroup, string query or
         list
    heads: Dict[str, Any]
        dictionary containing residue name and atom selection for lipid head groups
    tails: Dict[str, Any]
        dictionary containing residue name and atom selection for lipid tail groups
    sterol_heads: Dict[str, Any]
        dictionary containing residue name and atom selection for sterol head groups
    sterol_tails: Dict[str, Any]
         dictionary containing residue name and atom selection for sterol tail groups
         (head as first, tail beginning as second)
    tmd_protein_list: Union["auto", List[AtomGroup], List[str]]
         Transmembrane domain protein list to include area per lipid calculation
    frac: float
        fraction of box length in x and y outside the unit cell considered for Voronoi calculation
    p_value: float
        p_value for z_score calculation
    lipid_leaflet_rate: Union[None, int]
        Frame rate for checking lipids leaflet assignments via LeafletFinder
    sterol_leaflet_rate: int
        Frame rate for checking sterols leaflet assignments via LeafletFinder
    asymmetric_membrane: bool
        Asymmetric membrane option to train models by separated data w.r.t. leaflets
    verbose: bool
        Debug option to print step progress, warnings and errors
    result_plots: bool
        Plotting intermediate result option
    save_plots : bool
        Option for saving intermediate plots in pdf format
    trained_hmms: dict
        User-specific HMM (e.g., pre-trained on another simulation)
    do_clustering: bool
        Perform the hierarchical clustering for each frame
    n_init_hmm: int
        Number of repeats for HMM model trainings

    Attributes
    ----------
    universe_or_atomgroup: :class:`~MDAnalysis.core.universe.Universe`
        The universe to which this analysis is applied
    atomgroup: :class:`~MDAnalysis.core.groups.AtomGroup`
        The atoms to which this analysis is applied
    results: :class:`~MDAnalysis.analysis.base.Results`
        results of calculation are stored here, after calling
        :meth:`.run`
    start: Optional[int]
        The first frame of the trajectory used to compute the analysis
    stop: Optional[int]
        The frame to stop at for the analysis
    step: Optional[int]
        Number of frames to skip between each analyzed frame
    n_frames: int
        Number of frames analysed in the trajectory
    times: numpy.ndarray
        array of Timestep times. Only exists after calling
        :meth:`.run`
    frames: numpy.ndarray
        array of Timestep frame indices. Only exists after calling
        :meth:`.run`
    """

    def __init__(
            self,
            universe_or_atomgroup: Union["Universe", "AtomGroup"],
            membrane_select: str = "all",
            gmm_kwargs: Union[None, dict] = None,
            hmm_kwargs: Union[None, dict] = None,
            leaflet_kwargs: Dict[str, Any] = {},
            leaflet_select: Union[None, "AtomGroup", str, list] = None,
            heads: Dict[str, Any] = {},
            tails: Dict[str, Any] = {},
            sterol_heads: Dict[str, Any] = {},
            sterol_tails: Dict[str, Any] = {},
            tmd_protein_list: Union[None, "AtomGroup", str, list] = None,
            frac: float = 0.5,
            p_value: float = 0.05,
            lipid_leaflet_rate: Union[None, int] = None,
            sterol_leaflet_rate: int = 1,
            asymmetric_membrane: bool = False,
            verbose: bool = False,
            result_plots: bool = False,
            trained_hmms: Dict[str, Any] = {},
            n_init_hmm: int = 2,
            save_plots: bool = False,
            do_clustering: bool = True,
            **kwargs
    ):
        # the below line must be kept to initialize the AnalysisBase class!
        super().__init__(universe_or_atomgroup.trajectory, **kwargs)
        # after this you will be able to access `self.results`
        # `self.results` is a dictionary-like object
        # that can be used to store and retrieve results
        # See more at the MDAnalysis documentation:
        # https://docs.mdanalysis.org/stable/documentation_pages/analysis/base.html?highlight=results#MDAnalysis.analysis.base.Results
        self.resid_tails_selection = {}
        self.universe = universe_or_atomgroup.universe
        self.membrane = universe_or_atomgroup.select_atoms(membrane_select)
        self.membrane_unique_resids = np.unique(self.membrane.resids)
        self.resids_index_map = {resid: idx for idx, resid in enumerate(self.membrane_unique_resids)}
        self.index_resid_map = {idx: resid for idx, resid in enumerate(self.membrane_unique_resids)}
        self.heads = heads
        self.tails = tails
        self.sterol_heads = sterol_heads
        self.sterol_tails = sterol_tails
        self.lipid_leaflet_rate = lipid_leaflet_rate
        self.sterol_leaflet_rate = sterol_leaflet_rate
        self.frac = frac
        self.p_value = p_value
        self.asymmetric_membrane = asymmetric_membrane
        self.verbose = verbose
        self.result_plots = result_plots
        self.tmd_protein = None
        self.n_init_hmm = n_init_hmm
        self.save_plots = save_plots
        self.do_clustering = do_clustering

        assert heads.keys() == tails.keys(), "Heads and tails don't contain same residue names"

        self.leaflet_kwargs = leaflet_kwargs
        self.n_leaflets = 0

        if gmm_kwargs is None:
            self.gmm_kwargs = {"tol": 1E-4, "init_params": 'random_from_data', "verbose": 0,
                               "max_iter": 10000, "n_init": 20,
                               "warm_start": False, "covariance_type": "full"}
        else:
            self.gmm_kwargs = gmm_kwargs

        if hmm_kwargs is None:
            self.hmm_kwargs = {"verbose": False, "tol": 1E-4, "n_iter": 2000,
                               "algorithm": "viterbi", "covariance_type": "full",
                               "init_params": "st", "params": "stmc"}
        else:
            self.hmm_kwargs = hmm_kwargs

        # -----------------------------------------------------------------LEAFLETS----------------------------------- #

        if leaflet_select is None:
            # No information about leaflet assignment is provided. Raise an error and exit.
            raise ValueError(
                "No leaflet assigned! Please provide a list containing either two MDAnalysis.AtomGroup objects, "
                "two valid MDAnalysis selection strings, or 'auto' to trigger automatic leaflet assignment.")

        elif isinstance(leaflet_select, list):
            # If the argument leaflet_select is a list, the user specified their own leaflet selection. The code
            # checks if there are exactly two groups and raises an assertion if not. It then stores the provided
            # AtomGroup in a dictionary, or creates the AtomGroup with a provided selection string and stores it in
            # the dictionary.

            assert len(leaflet_select) == 2, (
                f"A bilayer is required. {len(leaflet_select)} entries found in leaflet_select list.")

            # Initialize empty dictionary to store AtomGroups
            self.leaflet_selection_no_sterol = {}

            # Iterate over both entries
            for i in range(2):
                # MDAnalysis.AtomGroup was provided as input
                if isinstance(leaflet_select[i], AtomGroup):
                    self.leaflet_selection_no_sterol[str(i)] = leaflet_select[i]
                # Character string was provided as input, assume it contains a selection for an MDAnalysis.AtomGroup
                elif isinstance(leaflet_select[i], str):
                    # Try to create a MDAnalysis.AtomGroup, raise a ValueError if not selection group could be provided
                    try:
                        self.leaflet_selection_no_sterol[str(i)] = self.universe.select_atoms(leaflet_select[i])
                    except Exception as e:
                        raise ValueError("Please provide a valid MDAnalysis selection string!") from e

                    # TODO: Check for atom number in upper and lower leaflet and raise a Warning
                else:
                    raise ValueError("Please provide an MDAnalysis.AtomGroup or a valid MDAnalysis selection string!")

                # Iterate over sterol compounds and check if it is part of the phospholipid leaflet assignment
                for rsn, atoms in self.sterol_heads.items():
                    if rsn in self.leaflet_selection_no_sterol[str(i)].residues.resnames:
                        raise ValueError(
                            f"Sterol {rsn} should not be part of the initial leaflet identification! Sterols will be "
                            f"assigned automatically.")
                    else:
                        pass

        elif leaflet_select.lower() == "auto":
            # 'auto' should trigger an automated leaflet assignment pipeline
            # (e.g., LeafletFinder provided by MDAnalysis)
            self.leaflet_selection_no_sterol = self.get_leaflets()

        else:
            # An unknown argument is provided for leaflet_select
            raise ValueError(
                "No leaflet assigned! Please provide a list containing either two MDAnalysis.AtomGroup objects, "
                "two valid MDAnalysis selection strings, or 'auto' to trigger automatic leaflet assignment.")

        # -----------------------------------------------------------------Transmembrane Domains --------------------- #
        if isinstance(tmd_protein_list, list):
            # Initialize empty dictionary to store AtomGroups
            self.tmd_protein = {"0": [], "1": []}
            for each in tmd_protein_list:
                if not isinstance(each, dict):
                    raise ValueError(
                        "Entry for each TDM protein should be a dictionary in the format {'0': ..., '1': ...} "
                        "where 0 for lower leaflet and 1 for upper leaflet.")
                for leaflet, query in each.items():
                    if leaflet not in ["0", "1"]:
                        raise ValueError(
                            "Entry for each TDM protein should be a dictionary in the format {'0': ..., '1': ...} "
                            "where 0 for lower leaflet and 1 for upper leaflet.")
                    if isinstance(query, AtomGroup):
                        # Take center of geometry of three positions
                        cog = np.mean(query.positions, axis=0)
                        self.tmd_protein[leaflet].append(cog)
                    # Character string was provided as input, assume it contains a selection for an MDAnalysis.AtomGroup
                    elif isinstance(query, str):
                        # Try to create a MDAnalysis.AtomGroup, raise a ValueError if not selection group could be
                        # provided
                        try:
                            cog = np.mean(self.universe.select_atoms(query).positions, axis=0)
                            self.tmd_protein[leaflet].append(cog)
                        except Exception as e:
                            raise ValueError("Please provide a valid MDAnalysis selection string!") from e
                    else:
                        raise ValueError(
                            "TDM Protein list should contain AtomGroup from MDAnalysis universe or a string "
                            "query for MDAnalysis selection.")
            self.tmd_protein["0"] = np.array(self.tmd_protein["0"])
            self.tmd_protein["1"] = np.array(self.tmd_protein["1"])
        elif tmd_protein_list is not None:
            # An unknown argument is provided for tdm_protein_list
            raise ValueError(
                "Please provide tdm_protein_list in list format such as [{'0': upper leaflet related 3 atom, "
                "'1': lower leaflet related 3 atom }, {'0': ..., '1': ...}]. Every dictionary stands for an "
                "individual transmembrane protein.")
        # -----------------------------------------------------------------HMMs--------------------------------------- #

        # Check for user-specified trained HMM
        if not any(trained_hmms):
            # Carry on if there is no trained HMM provided -> Will train HMM(s) later on
            self.trained_hmms = trained_hmms

        else:
            # User-specified trained HMM provided, check for consistency with expected format

            # Check for assymmetric membrane
            if self.asymmetric_membrane:
                # Asymmetric membrane functionality was triggered! Assuming same/different lipid types per leaflet
                # Structure of the expected dictionary: {ResnameA: {0: HMM0A, 1: HMM1A}, ResnameB: {0: HMM0B,
                # 1: HMM1B}, ResnameC: {0: None, 1: HMM1B}, ...}

                # Iterate over each entry and check for validity of input
                for lipid, hmms in zip(trained_hmms.keys(), trained_hmms.values()):

                    # Expected format of object "hmms" right now: {0: HMM0X, 1: HMM1X}

                    assert lipid in self.membrane.residues.resnames, f"{lipid} not found in membrane. Maybe a typo?"

                    # Check if correct number of HMMs per lipid is given
                    assert len(
                        hmms.keys()) == 2, (f'Too many/less HMMs provided for lipid type {lipid}.'
                                            + "\nPlease provide exactly two ({0:...,1:...}) or do not trigger "
                                              "'asymmetric_membrane'!\nIf a lipid is not present in one leaflet, "
                                              "use 'None'.")

                    # Check for correct labelling of leaflets
                    assert (0 in hmms.keys()) and (
                            1 in hmms.keys()), "Provide suitable keys (i.e., 0, 1) for the leaflets!"

                    # Iterate over leaflets
                    for leaflet in range(2):

                        # Expected format of object "hmms[leaflet]" right now: HMMYX

                        # If a lipid is not present in one of the leaflets a None is expected instead of a trained HMM
                        if hmms[leaflet] is None:
                            # If no HMM was provided then this lipid should be not part of this leaflet. Sterols are
                            # expected to flip, therefore always two HMM should be provided for sterol types...
                            assert (lipid not in self.leaflet_selection_no_sterol[
                                str(leaflet)].residues.resnames and lipid not in self.sterol_heads.keys()), \
                                (f"Found lipid {lipid} in leaflet {leaflet}, but no HMM was found!\nPlease provide a "
                                 f"valid HMM for this lipid in this leaflet!\nNote, for sterols always two HMMs are "
                                 f"expected!")

                            # If everything is fine carry on with next leaflet/lipid
                            continue

                        # Check if lipid is in this leaflet or if lipid is a sterol
                        assert lipid in self.leaflet_selection_no_sterol[
                            str(leaflet)].residues.resnames or lipid in self.sterol_heads.keys(), \
                            (f"Could not find lipid/sterol {lipid} in leaflet {leaflet}. Provide only HMMs for lipids "
                             f"that are present in this leaflet!")

                        # If all checks passed, we can assume that "something" should be and is present. Now,
                        # check the validity of the provided object
                        try:
                            # Try to sample something from HMM to check if it is fitted
                            hmms[leaflet].sample(n_samples=1)
                        except Exception as hmmerror:
                            raise ValueError(
                                f"HMM check failed with {hmmerror}! Could not sample a single point from the provided "
                                f"HMM for lipid {lipid} in leaflet {leaflet}. Check your model!")

                # If everthing works until here, it is assumed that all provided HMMs are valid and can be used later on
                self.trained_hmms = trained_hmms

            else:
                # Symmetric membrane is assumed!
                # Assuming same lipid types per leaflet
                # Structure of the expected dictionary: {ResnameA: HMMA, ResnameB: HMMB, ResnameC: HMMC, ...}

                # Iterate over each entry and check for validity of input
                for lipid, hmms in zip(trained_hmms.keys(), trained_hmms.values()):

                    assert lipid in self.membrane.residues.resnames, f"{lipid} not found in membrane. Maybe a typo?"

                    # If all checks passed, we can assume that "something" should be and is present. Now, check
                    # the validity of the provided object
                    try:
                        # Try to sample something from HMM to check if it is fitted
                        hmms.sample(n_samples=1)
                    except sklearn.exceptions.NotFittedError as hmmerror:
                        raise ValueError(
                            f"HMM check failed with {hmmerror}! Could not sample a single point from the provided HMM "
                            f"for lipid {lipid} in leaflet {leaflet}. Check your model!")

                # If everthing works until here, it is assumed that all provided HMMs are valid and can be used later on
                self.trained_hmms = trained_hmms

        # --------------------------------------HOUSE KEEPING-------------------------------------- #

        # Save unique residue names
        _, idx = np.unique(self.membrane.resnames, return_index=True)
        self.unique_resnames = self.membrane.resnames[np.sort(idx)]

        # Get residues ids in upper and lower leaflet -> self.leaflet_resids
        self.residue_ids = self.get_resids()

        # Get dictionary with selection of head- and tailgroups in upper and lower leaflets
        self.resid_tails_selection = self.get_lipid_tails()

        self.sterol_tails_selection = self.get_sterol_tails()

        self.all_heads = self.get_heads()


[docs]
    def get_leaflets(self):

        """
        Automatically assign non-sterol compounds to the upper and lower leaflet of a bilayer using the
        MDAnalysis.LeafletFinder
        """

        # Call LeafletFinder to get upper and lower leaflets
        leafletfinder = LeafletFinder(self.universe, **self.leaflet_kwargs)

        # Check for two leaflets
        self.n_leaflets = len(leafletfinder.groups())
        assert self.n_leaflets == 2, f"Bilayer is required. {self.n_leaflets} are found."

        # Init empty dict to store AtomGroups -> That would be not necessary, but I want the same variables for the
        # user specified case or the automatic case
        leaflet_selection = {}

        # Iterate over found groups
        for idx, leafgroup in enumerate(leafletfinder.groups_iter()):
            leaflet_selection[str(idx)] = leafgroup

        return leaflet_selection



[docs]
    def get_leaflets_sterol(self):

        """
        Assign sterol compounds to the upper and lower leaflet of a bilayer using a distance cut-off.
        """

        # Copy dict for leaflet selection without sterols, only the AtomGroups in the copied dict should be updated
        leaflet_selection = self.leaflet_selection_no_sterol.copy()

        # Iterate over each type of sterol in the membrane
        for rsn, head in self.sterol_heads.items():
            sterol = self.universe.select_atoms(f"resname {rsn} and name {head}")
            upper_sterol = distances.distance_array(reference=sterol,
                                                    configuration=self.leaflet_selection_no_sterol['0'],
                                                    box=self.universe.trajectory.ts.dimensions)
            lower_sterol = distances.distance_array(reference=sterol,
                                                    configuration=self.leaflet_selection_no_sterol['1'],
                                                    box=self.universe.trajectory.ts.dimensions)

            # ...determining the minimum distance to each leaflet for each cholesterol,...
            upper_sterol = np.min(upper_sterol, axis=1)
            lower_sterol = np.min(lower_sterol, axis=1)

            # ...the assignment is finished by checking for which leaflet the minimum distance is smallest.
            upper_sterol = sterol[upper_sterol < lower_sterol]
            lower_sterol = sterol.difference(upper_sterol)

            # Merge the atom selections for the phospholipids and cholesterol. "+" just adds the second selection on
            # top of the former one.
            leaflet_selection['0'] += upper_sterol
            leaflet_selection['1'] += lower_sterol

        leaflet_selection['0'] = leaflet_selection['0'][np.argsort(leaflet_selection['0'].resids)]
        leaflet_selection['1'] = leaflet_selection['1'][np.argsort(leaflet_selection['1'].resids)]

        return leaflet_selection



[docs]
    def get_resids(self):
        """
        Retrieve the residue indices for each residue and store it in a new dictionary.

        Returns
        ---------- 
        residue_ids: dict
            dictionary for each residue containing residue ids of it.
        """

        # Init empty dict to store atom selection of resids
        residue_ids = {}

        # Iterate over found leaflets
        for resname, head in self.heads.items():
            query_str = f"name {head} and resname {resname}"
            residue_ids[resname] = self.universe.select_atoms(query_str).resids
        for resname, head in self.sterol_heads.items():
            query_str = f"name {head} and resname {resname}"
            residue_ids[resname] = self.universe.select_atoms(query_str).resids
        return residue_ids



[docs]
    def get_heads(self):
        """
        Make an atomgroup containing all head groups (lipids + sterols).

        Returns
        ----------
        all_heads: MDAnalysis.AtomGroup
            AtomGroup containing headgroups of all lipids and sterols
        """

        # Init empty dict to store atom selection of resids
        # TODO Cause => UserWarning: Empty string to select atoms, empty group returned.
        #  warnings.warn("Empty string to select atoms, empty group returned.",
        all_heads = self.universe.select_atoms('')

        # Iterate over found leaflets
        for resname, head in self.heads.items():
            query_str = f"name {head} and resname {resname}"
            all_heads = all_heads | self.universe.select_atoms(query_str)
        for resname, head in self.sterol_heads.items():
            query_str = f"name {head} and resname {resname}"
            all_heads = all_heads | self.universe.select_atoms(query_str)
        return all_heads



[docs]
    def get_sterol_tails(self):
        """
        Make atomgroups for sterols

        Attributes
        ----------
        sterols_tail: dict
            dictionary containing the tail atomgroups for each sterol

        """

        # Init empty dicts
        sterols_tail = {}

        # Iterate over sterols -> user input
        for sterol, tail in self.sterol_tails.items():
            # Make atom group for sterol tail selection
            tail_sele_str = 'name ' + ' or name '.join(tail)
            tail_sele_str = f'resname {sterol} and ({tail_sele_str})'
            sterols_tail[sterol] = self.universe.select_atoms(tail_sele_str)
        return sterols_tail



[docs]
    def get_lipid_tails(self):

        """
        Make atomgroups for tailgroups for each chain of residues

        Attributes
        ----------
        self.resid_tails_selection: dict
            dictionary containing the tail atomgroups for all residues each tail
        """
        # Temporary dictionary for query collection of tails
        resid_tails_selection = {}
        tail_select_list = {}
        # Iterate over lipid types
        for resname in self.tails.keys():
            # Make for every type an extra dictionary
            # Iterate over tails (e.g. for standard phospholipids that 2)
            for i, tail in enumerate(self.tails[resname]):
                # Prepare a MDAnalysis selection string (I.e. ['C22', 'H2R'] -> 'name C22 or name H2R')
                tail_sele_str = 'name ' + ' or name '.join(tail)
                # Select for correct lipid type
                # (I.e. I.e. ['C22', 'H2R'] -> 'name C22 or name H2R' -> 'resname POPC and ('name C22 or name H2R')')
                tail_sele_str = f'(resname {resname} and ({tail_sele_str}))'
                tail_select_list.setdefault(i, []).append(tail_sele_str)
        # Create tail selection dictionary for each chain
        for i, query_list in tail_select_list.items():
            query = " or ".join(query_list)
            resid_tails_selection[i] = self.universe.select_atoms(query)
        return resid_tails_selection