Source code for domhmm.analysis.domhmm

"""
PropertyCalculation --- :mod:`domhmm.analysis.PropertyCalculation`
===========================================================

This module contains the :class:`PropertyCalculation` class.

"""

import logging as log
import sys

import matplotlib.pyplot as plt
import numpy as np
from hmmlearn.hmm import GaussianHMM
from scipy.sparse import csr_array
from scipy.spatial import Voronoi, ConvexHull
from sklearn import mixture
from tqdm import tqdm

from .base import LeafletAnalysisBase



[docs]
class PropertyCalculation(LeafletAnalysisBase):
    """
    The DirectorOrder class calculates an order parameter for each selected lipid according to the formula:

        P2 = 0.5 * (3 * cos(a)^2 - 1), (1)

    where an is the angle between a pre-defined director in the lipid (e.g. C-H or CC) and a reference axis.

    """

    def _prepare(self):
        """
        Prepare results array before the calculation of the height spectrum begins

        Attributes
        ----------
        """
        if self.verbose:
            log.basicConfig(level=log.INFO, stream=sys.stdout)
        log.info("Preparation Step")
        # Initialize result storage dictionaries
        self.results.train_data_per_type = {}
        self.results.GMM = {}
        self.results.HMM = {}
        self.results.HMM_Pred = {}
        self.results.Getis_Ord = {}
        self.max_tail_len = -1
        for _, each in self.tails.items():
            if len(each) > self.max_tail_len:
                self.max_tail_len = len(each)
        # Total number of parameters are area per lipid + order parameter for each tail = max_tail_len + 1
        self.results.train = np.zeros(
            (len(self.membrane_unique_resids), self.n_frames, 1 + self.max_tail_len + len(self.sterol_heads)),
            dtype=np.float32)
        # Initalize weight matrix storage for each leaflet.
        setattr(self.results, "upper_weight_all", [])
        setattr(self.results, "lower_weight_all", [])

        # Initialized leaflet assignment array for each frame
        self.leaflet_assignment = np.zeros((len(self.membrane_unique_resids), self.n_frames), dtype=np.int32)


[docs]
    @staticmethod
    def calc_order_parameter(chain):

        """
        Calculate average Scc order parameters per acyl chain according to the equation:

        S_cc = (3 * cos( theta )^2 - 1) / 2,

        where theta describes the angle between the z-axis of the system and the vector between two subsequent tail
        beads.

        Parameters
        ----------
        chain : Selection
            MDAnalysis Selection object

        Returns
        -------
        s_cc : numpy.ndarray
            Mean S_cc parameter for the selected chain of each residue
        """

        # Separate the coordinates according to their residue index
        ridx = np.where(np.diff(chain.resids) > 0)[0] + 1

        pos = np.split(chain.positions, ridx)

        # Calculate the normalized orientation vector between two subsequent tail beads
        vec = [np.diff(pos_i, axis=0) for pos_i in pos]

        vec_norm = [np.sqrt((vec_i ** 2).sum(axis=-1)) for vec_i in vec]

        vec = [vec_i / vec_norm_i.reshape(-1, 1) for vec_i, vec_norm_i in zip(vec, vec_norm)]
        # TODO z axis multiplication inside. It needs to be changed in future work
        # Choose the z-axis as membrane normal and take care of the machine precision
        dot_prod = [np.clip(vec_i[:, 2], -1., 1.) for vec_i in vec]

        # Calculate the order parameter
        s_cc = np.array([np.mean(0.5 * (3 * dot ** 2 - 1)) for dot in dot_prod])

        return s_cc



[docs]
    def order_parameter(self):
        """
        Calculation of scc order parameter for each chain of each residue.
        """
        # Iterate over each tail with chain id
        for chain, tail in self.resid_tails_selection.items():
            # SCC calculation
            s_cc = self.calc_order_parameter(tail)
            idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids))
            self.results.train[idx, self.index, 1 + chain] = s_cc
        for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
            s_cc = self.calc_order_parameter(tail)
            idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids))
            self.results.train[idx, self.index, 1 + self.max_tail_len + i] = s_cc



[docs]
    @staticmethod
    def area_per_lipid_vor(coor_xy, boxdim, frac):

        """
        Calculation of the area per lipid employing Voronoi tessellation on coordinates mapped to the xy plane.
        The function takes also the periodic boundary conditions of the box into account.

        Parameters
        ----------
        coor_xy : numpy.ndarray
            Coordinates of upper/lower leaflet
        boxdim : array
            Length of box vectors in all directions
        frac : float
            Fraction of box length in x and y outside the unit cell considered for Voronoi calculation 

        Returns
        -------
        vor : Voronoi Tesselation
            Scipy's Voronoi Diagram object
        apl : Area per lipid
            Area per lipid based on Scipy's Voronoi Diagram
        pbc_idx : Indices
            Unit cell indices for periodic image coordinates
        """

        # Number of points in the plane
        ncoor = coor_xy.shape[0]
        bx = boxdim[0]
        by = boxdim[1]
        # Create periodic images of the coordinates
        # to take periodic boundary conditions into account

        # Store coordinates of periodic images
        pbc = np.zeros((9 * ncoor, 2), dtype=np.float32)
        # Store unit cell indices for coordinates of periodic images
        pbc_idx = np.arange(9 * ncoor, dtype=np.int64) % ncoor

        # Iterate over all possible periodic images
        k = 0
        for i in [0, -1, 1]:
            for j in [0, -1, 1]:
                # Multiply the coordinates in a direction
                pbc[k * ncoor: (k + 1) * ncoor, 0] = coor_xy[:, 0] % bx + i * bx
                pbc[k * ncoor: (k + 1) * ncoor, 1] = coor_xy[:, 1] % by + j * by

                k += 1

        # Create a boolean mask for positions within the unit cell and a smaller fraction of positions outside the
        # unit cell
        # Check along x-axis
        f0 = pbc[:, 0] >= -frac * bx
        f1 = pbc[:, 0] <= bx + frac * bx

        # Check along y-axis
        f2 = pbc[:, 1] >= -frac * by
        f3 = pbc[:, 1] <= by + frac * by

        # Merge the four masks together
        mask = f0 * f1 * f2 * f3

        # Filter positions and unit cell indices of all periodic images
        pbc = pbc[mask]
        pbc_idx = pbc_idx[mask]

        # Call scipy's Voronoi implementation
        # There is the (rare!) possibility that two points have the exact same xy positions,
        # to prevent issues at further calculation steps, the qhull_option "QJ" was employed to introduce small random
        # displacement of the points to resolve these issue.

        # IMPORTANT: The use of "QJ" makes the resulting Voronoi diagram depending on frac. Values for ridge lengths
        # can vary!
        vor = Voronoi(pbc, qhull_options="QJ")

        # Iterate over all members of the unit cell and calculate their occupied area
        apl = np.array([ConvexHull(vor.vertices[vor.regions[vor.point_region[i]]]).volume for i in range(ncoor)])

        return vor, apl, pbc_idx



[docs]
    @staticmethod
    def weight_matrix(vor, pbc_idx, coor_xy):

        """
        Calculate the weight factors between neighbored lipid pairs based on a Voronoi tessellation.

        Parameters
        ----------
        vor : Voronoi Tesselation
            Scipy's Voronoi Diagram object
        pbc_idx : Indices
            Unit cell indices of periodic image coordinates
        coor_xy : numpy.ndarray
            Coordinates of upper/lower leaflet

        Returns
        -------
        weight_matrix : numpy.ndarray
            Weight factors wij between neighbored lipid pairs.
            Is 0 if lipids are not directly neighbored.
        """

        # Number of points in the plane
        ncoor = coor_xy.shape[0]

        # Calculate the distance for all pairs of points between which a ridge exists
        dij = vor.points[vor.ridge_points[:, 0]] - vor.points[vor.ridge_points[:, 1]]
        dij = np.sqrt(np.sum(dij ** 2, axis=1))

        # There is the (rare!) possibility that two points have the exact same xy positions,
        # to prevent issues at further calculation steps, their distance is set to a very small
        # distance of 4.7 Angstrom (2 times the VdW radius of a regular bead in MARTINI3)
        dij[dij < 1E-5] = 4.7

        # Calculate the distance for all pairs of vertices connected via a ridge
        vert_idx = np.array(vor.ridge_vertices)
        bij = vor.vertices[vert_idx[:, 0]] - vor.vertices[vert_idx[:, 1]]
        bij = np.sqrt(np.sum(bij ** 2, axis=1))

        # INFO: vor.ridge_points and vor.ridge_vertices should be sorted -> Check vor.ridge_dict

        # Calculate weight factor
        wij = bij / dij

        # Set up an empty array to store the weight factors for each lipid
        weight_matrix = np.zeros((ncoor, ncoor))

        # Select all indices of ridges that contain members of the unit cell
        mask_unit_cell = np.logical_or(vor.ridge_points[:, 0] < ncoor, vor.ridge_points[:, 1] < ncoor)

        # Apply the modulus operator since some of the indices in "unit_cell_point" will point to coordinates outside
        # the unit cell. Applying the modulus operator "%" will allow an indexing of the "weight_matrix". However, some
        # of the indices in "unit_cell_point" will be doubled that shouldn't be an issue since the same weight factor is
        # then just put several times in the same entry of the array (no summing or something similar!)
        # Previous: unit_cell_point = vor.ridge_points[mask_unit_cell] % ncoor

        # Transform the indices in vor.ridge_points back to unit cell indices -> Filter than for all indices of
        # ridges that contain members of the unit cell
        unit_cell_point = pbc_idx[vor.ridge_points][mask_unit_cell]

        weight_matrix[unit_cell_point[:, 0], unit_cell_point[:, 1]] = wij[mask_unit_cell]
        weight_matrix[unit_cell_point[:, 1], unit_cell_point[:, 0]] = wij[mask_unit_cell]
        return weight_matrix


    def _single_frame(self):
        """
        Calculate data from a single frame of the trajectory.
        """

        # Get number of frame from trajectory
        self.frame = self.universe.trajectory.ts.frame
        # Calculate correct index if skipping step not equals 1 or start point not equals 0
        self.index = ( self.frame - self.start ) // self.step

        # Update leaflet assignment (if leaflet_frame_rate is None, leaflets will never get updated during analysis)
        if self.leaflet_frame_rate is not None and not self.index % self.leaflet_frame_rate:
            # Call leaflet assignment functions for non-sterol and sterol compounds
            self.leaflet_selection_no_sterol = self.get_leaflets()
            self.leaflet_selection = self.get_leaflets_sterol()

            # Write assignments to array
            assignment_index = int(self.index / self.leaflet_frame_rate)
            start_index = assignment_index * self.leaflet_frame_rate
            end_index = (assignment_index + 1) * self.leaflet_frame_rate
            if end_index > self.leaflet_assignment.shape[1]:
                end_index = self.leaflet_assignment.shape[1]

            self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids)
            self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids)

            self.leaflet_assignment[self.lidx, start_index:end_index] = 1
            self.leaflet_assignment[self.uidx, start_index:end_index] = 0
            self.leaflet_assignment[self.lidx, start_index:end_index] = 1

        # Update sterol assignment. Don't do the update if it was already done in the if-statement before
        if not self.index % self.sterol_frame_rate and (
                self.leaflet_frame_rate is None or self.index % self.leaflet_frame_rate):
            # Call leaflet assignment function for sterol compounds
            self.leaflet_selection = self.get_leaflets_sterol()

            # Write assignments to array
            assignment_index = int(self.index / self.sterol_frame_rate)
            start_index = assignment_index * self.sterol_frame_rate
            end_index = (assignment_index + 1) * self.sterol_frame_rate
            if end_index > self.leaflet_assignment.shape[1]:
                end_index = self.leaflet_assignment.shape[1]

            self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids)
            self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids)

            self.leaflet_assignment[self.uidx, start_index:end_index] = 0
            self.leaflet_assignment[self.lidx, start_index:end_index] = 1

        if self.leaflet_selection["0"].select_atoms("group leaf1", leaf1=self.leaflet_selection["1"]):
            raise ValueError("Atoms in both leaflets !")

        # ------------------------------ Local Normals/Area per Lipid ------------------------------------------------ #
        boxdim = self.universe.trajectory.ts.dimensions[0:3]
        upper_coor_xy = self.leaflet_selection[str(0)].positions
        lower_coor_xy = self.leaflet_selection[str(1)].positions
        # Check Transmembrane domain existance
        if self.tmd_protein is not None:
            tmd_upper_coor_xy = self.tmd_protein["0"]
            # Check if dimension of coordinates is same in both array
            if tmd_upper_coor_xy.shape[1] == upper_coor_xy.shape[1]:
                upper_coor_xy = np.append(upper_coor_xy, tmd_upper_coor_xy, axis=0)
            tmd_lower_coor_xy = self.tmd_protein["1"]
            # Check if dimension of coordinates is same in both array
            if tmd_lower_coor_xy.shape[1] == lower_coor_xy.shape[1]:
                lower_coor_xy = np.append(lower_coor_xy, tmd_lower_coor_xy, axis=0)
        upper_vor, upper_apl, upper_pbc_idx = self.area_per_lipid_vor(coor_xy=upper_coor_xy, boxdim=boxdim,
                                                                      frac=self.frac)
        lower_vor, lower_apl, lower_pbc_idx = self.area_per_lipid_vor(coor_xy=lower_coor_xy, boxdim=boxdim,
                                                                      frac=self.frac)
        if self.tmd_protein is not None:
            # Remove TMD Protein numbers from training data
            upper_apl = np.array(upper_apl[:-len(tmd_upper_coor_xy)])
            lower_apl = np.array(lower_apl[:-len(tmd_lower_coor_xy)])
        self.results.train[self.uidx, self.index, 0] = upper_apl
        self.results.train[self.lidx, self.index, 0] = lower_apl
        # TODO Local normal calculation

        # ------------------------------ Order parameter ------------------------------------------------------------- #
        self.order_parameter()
        # ------------------------------ Weight Matrix --------------------------------------------------------------- #
        upper_weight_matrix = self.weight_matrix(upper_vor, pbc_idx=upper_pbc_idx, coor_xy=upper_coor_xy)
        lower_weight_matrix = self.weight_matrix(lower_vor, pbc_idx=lower_pbc_idx, coor_xy=lower_coor_xy)
        if self.tmd_protein is not None:
            # Remove TMD Protein numbers from weight matrix
            upper_weight_matrix = upper_weight_matrix[:-len(tmd_upper_coor_xy),:-len(tmd_upper_coor_xy)]
            lower_weight_matrix = lower_weight_matrix[:-len(tmd_lower_coor_xy),:-len(tmd_lower_coor_xy)]

        # Keep weight matrices in scipy.sparse.csr_array format since both is sparse matrices
        self.results["upper_weight_all"].append(csr_array(upper_weight_matrix))
        self.results["lower_weight_all"].append(csr_array(lower_weight_matrix))

    def _conclude(self):
        """
        Calculate the final results of the analysis

        Extract the obtained data and put them into a clear and accessible data structure
        """
        log.info("Conclusion step is starting.")
        self.prepare_train_data()
        # -------------------------------------------------------------

        # Check for user-provided HMMs
        if not any(self.trained_hmms):
            # No user-provided HMMs, perform usual workflow
            log.info("Gaussian Mixture Model training is starting.")
            self.GMM(gmm_kwargs=self.gmm_kwargs)

            log.info("Hidden Markov Model training is starting.")
            self.HMM(hmm_kwargs=self.hmm_kwargs)
        else:
            # User-provided HMMs found, use them!
            log.info("No Gaussian Mixture Model is trained.")
            log.info("No Hidden Markov Model is trained. Instead use the user-provided HMMs.")

            # Use the provided dictionary directly, the checks for validity were already done before
            self.results['HMM'] = self.trained_hmms

            # Same workflow as in self.HMM()
            if self.result_plots:
                # Plot result of hmm
                self.plot_hmm_result()

            # Make predictions based on HMM model
            self.predict_states()

            # Validate states and result prediction
            self.state_validate()
            if self.result_plots:
                # Plot prediction result
                self.predict_plot()

        log.info("Getis-Ord Statistic calculation is starting.")

        self.getis_ord()
        log.info("Clustering is starting.")
        self.result_clustering()
        if self.result_plots:
            self.clustering_plot()


[docs]
    def prepare_train_data(self):
        """
        Prepare train data for GMM and HMM while separating self.results.train with respect to each residue
        """
        self.results.train_data_per_type = {}
        resid_dict = {}
        for resname in self.unique_resnames:
            idx = self.get_residue_idx(self.resids_index_map, self.residue_ids[resname])
            resid_dict[resname] = idx
            self.results.train_data_per_type[f"{resname}"] = [[] for _ in range(3)]
        if self.asymmetric_membrane:
            # For asymmetric membranes, models will be trained for each leaflets' each residue type
            leaflet_residx = {}
            # leaflet_train_residx contains residue indexes that are not too flip-floppy for model training
            leaflet_train_residx = {}
            # Save each residues indexes with respect to leaflet assignment
            for resname, idx in resid_dict.items():
                leaflet_assign = self.leaflet_assignment[idx]
                leaflet_train_residx[resname] = {0: [], 1: []}
                leaflet_residx[resname] = {0: [], 1: []}
                for i in range(len(leaflet_assign)):
                    if (leaflet_assign[i] == 0).all():
                        leaflet_train_residx[resname][0].append(idx[i])
                        leaflet_residx[resname][0].append(idx[i])
                    elif (leaflet_assign[i] == 1).all():
                        leaflet_train_residx[resname][1].append(idx[i])
                        leaflet_residx[resname][1].append(idx[i])
                    else:
                        # If a residue is %80 of time belongs to a leaflet, accept it as residue of that leaflet
                        lower_leaflet_percentage = len(np.nonzero(leaflet_assign[i] == 1)[0]) / len(leaflet_assign[i])
                        if lower_leaflet_percentage >= 0.8:
                            leaflet_train_residx[resname][1].append(idx[i])
                        elif lower_leaflet_percentage <= 0.2:
                            leaflet_train_residx[resname][0].append(idx[i])
                        if lower_leaflet_percentage > 0.5:
                            leaflet_residx[resname][1].append(idx[i])
                        else:
                            leaflet_residx[resname][0].append(idx[i])
            self.leaflet_residx = leaflet_residx
            # Prepare rest of train data for each residue
            for resname, tails in self.tails.items():
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                # Select columns of area per lipid and tails' scc parameters
                residx = leaflet_train_residx[resname]
                upper_leaflet_data = self.results.train[residx[0]][:, :, 0:len(tails) + 1]
                lower_leaflet_data = self.results.train[residx[1]][:, :, 0:len(tails) + 1]
                self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data]
                # TODO idx might referenced before assignment ?
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = self.get_residue_idx(self.resids_index_map, rsn_ids)
                # Select columns of area per lipid and sterol's scc parameter
                residx = leaflet_train_residx[resname]
                upper_leaflet_data = self.results.train[residx[0]][:, :, [0, 1 + self.max_tail_len + i]]
                lower_leaflet_data = self.results.train[residx[1]][:, :, [0, 1 + self.max_tail_len + i]]
                self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]
        else:
            # For symmetric membranes, models will be trained for each residue type
            for resname, tails in self.tails.items():
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = resid_dict[resname]
                # Select columns of area per lipid and tails' scc parameters
                self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :, 0:len(tails) + 1]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = resid_dict[resname]
                # Select columns of area per lipid and sterol's scc parameter
                self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :,
                                                               [0, 1 + self.max_tail_len + i]]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]


    # ------------------------------ FIT GAUSSIAN MIXTURE MODEL ------------------------------------------------------ #

[docs]
    def GMM(self, gmm_kwargs):
        """
        Fit Gaussian Mixture

        Parameters
        ----------
        gmm_kwargs : dict
            Additional parameters for mixture.GaussianMixture
        """
        # Iterate over each residue and implement gaussian mixture model
        for res, data in self.results.train_data_per_type.items():
            if self.asymmetric_membrane:
                temp_dict = {}
                for leaflet in range(2):
                    gmm_data = data[1][leaflet]
                    if len(gmm_data) == 0:
                        temp_dict[leaflet] = None
                    else:
                        gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit(
                            gmm_data.reshape(-1, gmm_data.shape[2]))
                        temp_dict[leaflet] = gmm
                    log.info(f"Leaflet {leaflet}, {res} Gaussian Mixture Model is trained.")
                self.results["GMM"][res] = temp_dict
            else:
                gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit(data[1].reshape(-1, data[1].shape[2]))
                self.results["GMM"][res] = gmm
                log.info(f"{res} Gaussian Mixture Model is trained.")

        # Check for convergence
        for resname, each in self.results["GMM"].items():
            if self.asymmetric_membrane:
                if each[0] is not None and not each[0].converged_:
                    log.warning(f"{resname} lower leaflet Gaussian Mixture Model is not converged.")
                if each[1] is not None and not each[1].converged_:
                    log.warning(f"{resname} upper leaflet Gaussian Mixture Model is not converged.")
            else:
                if not each.converged_:
                    log.warning(f"{resname} Gaussian Mixture Model is not converged.")


    # ------------------------------ HIDDEN MARKOV MODEL ------------------------------------------------------------- #


[docs]
    def HMM(self, hmm_kwargs):
        """
        Create Gaussian based Hidden Markov Models for each residue type

        Parameters
        ----------
        hmm_kwargs : dict
            Additional parameters for hmmlearn.hmm.GaussianHMM
        """
        # Iterate over each residue and implement gaussian-hidden markov model
        for resname, data in self.results.train_data_per_type.items():
            if self.asymmetric_membrane:
                temp_dict = {}
                for leaflet in range(2):
                    gmm = self.results["GMM"][resname][leaflet]
                    if gmm is not None:
                        hmm_data = data[1][leaflet]
                        hmm = self.fit_hmm(data=hmm_data, gmm=gmm, hmm_kwargs=hmm_kwargs,
                                           n_repeats=self.n_init_hmm)
                        temp_dict[leaflet] = hmm
                    else:
                        temp_dict[leaflet] = None
                    log.info(f"Leaflet {leaflet}, {resname} Gaussian Hidden Markov Model is trained.")
                self.results["HMM"][resname] = temp_dict
            else:
                hmm = self.fit_hmm(data=data[1], gmm=self.results["GMM"][resname], hmm_kwargs=hmm_kwargs, n_repeats=self.n_init_hmm)
                self.results["HMM"][resname] = hmm
                log.info(f"{resname} Gaussian Hidden Markov Model is trained.")
        if self.result_plots:
            # Plot result of hmm
            self.plot_hmm_result()
        # Make predictions based on HMM model
        self.predict_states()
        # Validate states and result prediction
        self.state_validate()
        if self.result_plots:
            # Plot prediction result
            self.predict_plot()



[docs]
    def fit_hmm(self, data, gmm, hmm_kwargs, n_repeats=10):

        """
        Fit several HMM models to the data and return the best one.

        Parameters
        ----------
        data : numpy.ndarray
            Data of the single lipid properties for one lipid type at each time step
        gmm : GaussianMixture Model
            Scikit-learn object
        hmm_kwargs: dict
            Additional parameters for Hidden Markov Model
        n_repeats : int
            Number of independent fits

        Returns
        -------
        best_ghmm : GaussianHMM
            hmmlearn object

        """
        best_ghmm = None
        # Specify the length of the sequence for each lipid
        n_lipids = data.shape[0]
        dim = data.shape[2]
        lengths = np.repeat(self.n_frames, n_lipids)

        # The HMM fitting is started multiple times from
        # different starting conditions

        # Initialize the best score with minus infinity
        best_score = -np.inf

        # Re-start the HMM fitting 10 times
        for i in tqdm(range(n_repeats)):

            # Initialize a HMM for one lipid type
            ghmm_i = GaussianHMM(n_components=2,
                                 means_prior=gmm.means_,
                                 covars_prior=gmm.covariances_,
                                 **hmm_kwargs)

            # Train the HMM based on the data for every lipid and frame
            ghmm_i.fit(data.reshape(-1, dim),
                       lengths=lengths)

            # Obtain the log-likelihood
            # probability of the current model
            score_i = ghmm_i.score(data.reshape(-1, dim),
                                   lengths=lengths)

            # Check if the quality of the result improved
            if score_i > best_score:
                best_score = score_i
                best_ghmm = ghmm_i

            # Delete the current model
            del ghmm_i

        return best_ghmm



[docs]
    def plot_hmm_result(self):
        """
        Plots Hidden Markov Models training history

        """
        x_len = 0
        for resname, ghmm in self.results['HMM'].items():
            if self.asymmetric_membrane:
                for leaflet in range(2):
                    if ghmm[leaflet] is not None:
                        x_len = max(x_len, len(ghmm[leaflet].monitor_.history) - 1)
                        plt.semilogy(np.arange(len(ghmm[leaflet].monitor_.history) - 1),
                                     np.diff(np.array(ghmm[leaflet].monitor_.history)),
                                     ls="-", label=f"{resname}_{leaflet}", lw=2)
            else:
                x_len = max(x_len, len(ghmm.monitor_.history) - 1)
                plt.semilogy(np.arange(len(ghmm.monitor_.history) - 1), np.diff(np.array(ghmm.monitor_.history)),
                             ls="-", label=resname, lw=2)
        x_len = (x_len // 100 + 1) * 100
        plt.legend(fontsize=15)
        plt.semilogy(np.arange(x_len), np.repeat(1E-4, x_len), color="k", ls="--", lw=2)
        plt.xlim(0, x_len)
        plt.ylim(1E-5, 15E5)
        plt.ylabel(r"$\Delta(log(\hat{L}))$", fontsize=18)
        plt.xlabel("Iterations", fontsize=18)
        plt.tick_params(axis="both", labelsize=11)
        plt.text(s="Tolerance 1e-4", x=1, y=1E-4 + 0.00005, color="k", fontsize=15)
        plt.title("a", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("a.pdf")
        plt.show()



[docs]
    def predict_states(self):
        """
        Each residues prediction assignment to ordered or disordered domains.

        """
        if self.asymmetric_membrane:
            # Since we select stable lipids for training, we need all training data of lipids to predict order of
            # frequently flip-flop doing ones
            for resname, tails in self.tails.items():
                temp_array = []
                for leaflet in range(2):
                    idx = self.leaflet_residx[resname][leaflet]
                    data = self.results.train[idx][:, :, 0:len(tails) + 1]
                    shape = data.shape
                    hmm = self.results['HMM'][resname][leaflet]
                    if hmm is not None:
                        lengths = np.repeat(shape[1], shape[0])
                        prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0],
                                                                                                      shape[1])
                        prediction = self.hmm_diff_checker(hmm.means_, prediction)
                        temp_array.append([idx, prediction])
                    else:
                        temp_array.append([])

                if len(temp_array[0]) == 0:
                    idx = np.array(temp_array[1][0]).argsort()
                    result = temp_array[1][1]
                elif len(temp_array[1]) == 0:
                    idx = np.array(temp_array[0][0]).argsort()
                    result = temp_array[0][1]
                else:
                    idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort()
                    result = np.concatenate((temp_array[0][1], temp_array[1][1]))
                result = result[idx]
                self.results['HMM_Pred'][resname] = result

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                temp_array = []
                for leaflet in range(2):
                    idx = self.leaflet_residx[resname][leaflet]
                    data = self.results.train[idx][:, :, [0, 1 + self.max_tail_len + i]]
                    shape = data.shape
                    hmm = self.results['HMM'][resname][leaflet]
                    if hmm is not None:
                        lengths = np.repeat(shape[1], shape[0])
                        prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0],
                                                                                                      shape[1])
                        prediction = self.hmm_diff_checker(hmm.means_, prediction)
                        temp_array.append([idx, prediction])
                    else:
                        temp_array.append(None)

                if temp_array[0] is None:
                    idx = np.array(temp_array[1][0]).argsort()
                    result = temp_array[1][1]
                elif temp_array[1] is None:
                    idx = np.array(temp_array[0][0]).argsort()
                    result = temp_array[0][1]
                else:
                    idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort()
                    result = np.concatenate((temp_array[0][1], temp_array[1][1]))
                result = result[idx]
                self.results['HMM_Pred'][resname] = result
        else:
            for resname, data in self.results.train_data_per_type.items():
                shape = data[1].shape
                hmm = self.results['HMM'][resname]
                # Lengths consists of number of frames and number of residues
                lengths = np.repeat(shape[1], shape[0])
                prediction = hmm.predict(data[1].reshape(-1, shape[2]), lengths=lengths).reshape(shape[0], shape[1])
                # Save prediction result of each residue
                self.results['HMM_Pred'][resname] = prediction



[docs]
    def state_validate(self):
        """
        Validate state assignments of HMM model by checking means of the model of each residue.
        """
        if not self.asymmetric_membrane:
            # Asymmetric membrane validation is done in prediction step due to nature of it
            for resname, hmm in self.results["HMM"].items():
                if hmm is not None:
                    means = hmm.means_
                    prediction_results = self.results['HMM_Pred'][resname]
                    self.results['HMM_Pred'][resname] = self.hmm_diff_checker(means, prediction_results)



[docs]
    @staticmethod
    def hmm_diff_checker(means, prediction_results):
        """
        Checks if prediction assignments correct with respect to means. Since HMM is unsupervised, model can assign 0 to
         disordered domains and 1 to ordered domains. In these cases needs to be changed to vice versa.

        Parameters
        ----------
        means : np.ndarray
            Table of model which contains means of area per lipid and order parameters
        prediction_results: np.ndarray
            Initial prediction results which is 1s and 0s

        Returns
        -------
        prediction_results : np.ndarray
            Same or flipped prediction results with respect to means table

        """
        diff_percents = (means[1, 0] - means[0, 0]) / means[0, 0]
        if diff_percents > 0.0:
            return np.abs(prediction_results - 1)
        else:
            return prediction_results



[docs]
    @staticmethod
    def get_residue_idx(resids_index_map, resids):
        """
        Checks resids index map (lookup table) to return corresponding indexes to use in DomHMM result section.

        Parameters
        ----------
        resids_index_map: dict
            Lookup table where each residue id's index in self.membrane_unique_resids is kepts
        resids: numpy.ndarray
            Residue id list for indexing

        Returns
        -------
        idx: numpy.ndarray
            Indexes in the same order with resids

        """
        idx = np.array([resids_index_map[resid] for resid in resids])
        return idx



[docs]
    def predict_plot(self):
        """
        Plots convergence of HMM model training process
        """
        t = np.linspace(8, 10, self.n_frames)
        for resname in self.unique_resnames:
            plt.plot(t, self.results['HMM_Pred'][resname].mean(0), label=resname)
        plt.xticks([8, 8.5, 9, 9.5, 10])
        plt.xlabel(r"t ($\mu$s)", fontsize=18)
        plt.ylabel(r"$\bar{O}_{Lipid}$", fontsize=18)
        plt.legend(fontsize=15, ncols=1, loc="lower left")
        plt.ylim(0, 1)
        plt.xlim(8, 10)
        plt.title("b", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("b.pdf")
        plt.show()


    # ------------------------------ GETIS-ORD STATISTIC ------------------------------------------------------------- #

[docs]
    def getis_ord(self):
        """
        Calculates Getis-Ord statistic for identification model
        """
        self.getis_ord_stat(self.results["upper_weight_all"], 0)
        self.getis_ord_stat(self.results["lower_weight_all"], 1)
        log.info("Getis-Ord for leaflets are calculated.")
        if self.result_plots:
            self.getis_ord_plot()
        self.results["Getis_Ord"]["Permut_0"] = self.permut_getis_ord_stat(self.results["upper_weight_all"], 0)
        self.results["Getis_Ord"]["Permut_1"] = self.permut_getis_ord_stat(self.results["lower_weight_all"], 1)
        log.info("Permutations of Getis-Ord are calculated.")
        self.results["z_score"] = self.z_score_calc()
        log.info("Z score is calculated.")



[docs]
    def getis_ord_stat(self, weight_matrix_all, leaflet):
        """
            Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states
            of each lipid and the weighting factors between neighbored lipids.

            Parameters
            ----------
            weight_matrix_all : sparse.csr_array
                Weight matrix for all lipid in a leaflet at current time step
            leaflet : int
                0 = upper leaflet, 1 = lower leafet

            Returns
            -------
            g_star_i : list of numpy.ndarrays
                G*i values for each lipid at each time step
            w_ii_all : list of numpy.ndarrays
                Self-influence of the lipids
            """
        # Get the number of frames
        nframes = self.n_frames

        # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid
        g_star_i = []
        w_ii_all = []

        # Iterate over frames
        for step in range(nframes):
            # Get the weight matrix of the leaflet at the current time step
            weight_matrix = weight_matrix_all[step]

            # Number of lipids in the leaflet
            n = weight_matrix.shape[0]

            # In case the code was already executed beforehand
            weight_matrix[range(n), range(n)] = 0.

            # Get the order state of each lipid in the leaflet at the current time step
            order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step)

            # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids
            nneighbor = np.sum(weight_matrix > 1E-5, axis=1)
            # Parameters for the Getis-Ord statistic
            w_ii = np.sum(weight_matrix, axis=-1) / nneighbor  # Self-influence!
            weight_matrix[range(n), range(n)] = w_ii
            w_star_i = np.sum(weight_matrix, axis=-1)  # + w_ii
            s_star_1i = np.sum(weight_matrix.power(2), axis=-1)  # + w_ii**2

            # Empirical standard deviation over all order states in the leaflet
            s = np.std(order_states, ddof=1)

            # Employ matrix-vector multiplication
            so = weight_matrix @ order_states

            # Calculate the nominator for G*i
            nom = so - w_star_i * 0.5

            # Calculate the denominator for G*i
            denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1))

            g_star = nom / denom

            assert not np.any(nneighbor < 1), "Lipid found without a neighbor!"

            g_star_i.append(g_star)
            w_ii_all.append(w_ii)
        self.results['Getis_Ord'][leaflet] = {f"g_star_i_{leaflet}": g_star_i, f"w_ii_{leaflet}": w_ii_all}



[docs]
    def getis_ord_plot(self):
        """
        Plots average order state for each lipid type per frame
        """

        # Get number of lipid residues
        resnum = len(self.unique_resnames)

        # For each residue type there is an empty list initialized
        g_star_i_temp = [[] for _ in range(resnum)]

        # Iterate over all frames
        for step in range(self.n_frames):

            # Get the indices of the lipids in leaflet 0 and 1 and their corresponding positions
            index_dict_0, pos_dict_0 = self.get_leaflet_step_order_index(leaflet=0, step=step)
            index_dict_1, pos_dict_1 = self.get_leaflet_step_order_index(leaflet=1, step=step)

            # Iterate over the lipid types
            for i, rsn in enumerate(self.unique_resnames):
                # Merge Getis-Ord values for the upper and the lower leaflet to display them as a histogram
                # The G* values should be sorted according to the indices of the resids
                g_start_sorted_0 = self.results['Getis_Ord'][0]['g_star_i_0'][step][index_dict_0[rsn]]
                # The G* values should be sorted according to the indices of the resids
                g_start_sorted_1 = self.results['Getis_Ord'][1]['g_star_i_1'][step][index_dict_1[rsn]]
                g_star_i_temp[i] += list(np.append(g_start_sorted_0, g_start_sorted_1))

        for i in range(resnum):
            plt.hist(g_star_i_temp[i], bins=np.linspace(-3, 3, 201), density=True, histtype="step", lw=2,
                     label=self.unique_resnames[i])

        plt.legend(fontsize=15, loc="upper left", ncols=2)

        plt.xlim(-3, 3)
        plt.ylim(0, .9)

        _ = plt.xlabel("$G^*_i$", fontsize=18)
        plt.ylabel("$p(G^*_i)$", fontsize=18)
        plt.tick_params(labelsize=11)

        plt.title("c", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("c.pdf")
        plt.show()



[docs]
    def permut_getis_ord_stat(self, weight_matrix_all, leaflet):

        """
        Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states
        of each lipid and the weighting factors between neighbored lipids.

        Parameters
        ----------
        weight_matrix_all : sparse.csr_array
            Weight matrices for all lipid in a leaflet at each time step
        leaflet : int
            0 = upper leaflet, 1 = lower leafet

        Returns
        -------
        g_star_i : list of numpy.ndarrays
            G*i values for each lipid at each time step
        """

        # Get the number of frames
        nframes = self.n_frames

        # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid
        g_star_i = []

        # Do 10 permutations per frame
        n_permut = 10

        # Iterate over frames
        for step in tqdm(range(nframes)):

            for permut in range(n_permut):
                # Get the weightmatrix of the leaflet at the current time step
                weight_matrix = weight_matrix_all[step]

                # Number of lipids in the leaflet
                n = weight_matrix.shape[0]

                # In case the code was already executed beforehand
                weight_matrix[range(n), range(n)] = 0.0

                # Get the order state of each lipid in the leaflet at the current time step
                order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step)

                np.random.shuffle(order_states)

                # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids
                nneighbor = np.sum(weight_matrix > 1E-5, axis=1)

                # Parameters for the Getis-Ord statistic
                w_ii = np.sum(weight_matrix, axis=-1) / nneighbor  # Self-influence!
                weight_matrix[range(n), range(n)] = w_ii
                w_star_i = np.sum(weight_matrix, axis=-1)  # + w_ii
                s_star_1i = np.sum(weight_matrix ** 2, axis=-1)  # + w_ii**2

                # Empirical standard deviation over all order states in the leaflet
                s = np.std(order_states, ddof=1)

                # Employ matrix-vector multiplication
                so = weight_matrix @ order_states

                # Calculate the nominator for G*i
                nom = so - w_star_i * 0.5

                # Calculate the denominator for G*i
                denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1))

                g_star = nom / denom

                assert not np.any(nneighbor < 1), "Lipid found without a neighbor!"

                g_star_i += list(g_star)

        return g_star_i



[docs]
    def z_score_calc(self):
        """
        Z score calculation for Getis-Ord statistics
        """
        result = {}
        for i in range(2):
            z_score = {}
            getis_ord_permut = self.results["Getis_Ord"][f"Permut_{i}"]
            z_score["z_a"] = np.quantile(getis_ord_permut, self.p_value)
            z_score["z1_a"] = np.quantile(getis_ord_permut, 1 - self.p_value)
            result[i] = z_score
        return result


    # ------------------------------ HIERARCHICAL CLUSTERING --------------------------------------------------------- #

[docs]
    def clustering_plot(self):
        """
        Runs hierarchical clustering and plots clustering results in different frames.
        """

        n_frames = self.n_frames
        # Plot %5, %50 and %95 points of frame list
        frame_list = [int(n_frames / 20), int(n_frames / 2), int(n_frames / 1.05)]
        fig, ax = plt.subplots(1, len(frame_list), figsize=(20, 5))

        # Iterate over three frames illustrate the clustering results
        for k, i in enumerate(frame_list):
            order_states_0 = self.get_leaflet_step_order(0, i)

            # Clustering
            # ----------------------------------------------------------------------------------------------------------------------
            core_lipids = self.assign_core_lipids(weight_matrix_f=self.results["upper_weight_all"][i],
                                                  g_star_i_f=self.results['Getis_Ord'][0]['g_star_i_0'][i],
                                                  order_states_f=order_states_0,
                                                  w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i],
                                                  z_score=self.results["z_score"][0])

            clusters = self.hierarchical_clustering(weight_matrix_f=self.results["upper_weight_all"][i],
                                                    w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i],
                                                    core_lipids=core_lipids)

            # Plot coordinates
            # ----------------------------------------------------------------------------------------------------------------------
            residue_indexes, positions = self.get_leaflet_step_order_index(leaflet=0, step=i)

            for resname, index in residue_indexes.items():
                ax[k].scatter(positions[resname][:, 0],
                              positions[resname][:, 1], marker="s", alpha=1, s=5, label=resname)

            # Choose color scheme for clustering coloring
            colors = plt.cm.viridis_r(np.linspace(0, 1.0, len(clusters.values())))

            # Goto correct frame of the trajectory
            # TODO Statement seems to have no effect
            self.universe.trajectory[self.start:self.stop:self.step][i]

            # Prepare positions for cluster plotting
            leaflet_assignment_mask = self.leaflet_assignment[:, i] == 0

            positions = (self.membrane.residues[leaflet_assignment_mask].atoms & self.all_heads).positions

            label_length = len(residue_indexes)

            # Iterate over clusters and plot the residues
            print(f"Number of clusters in frame {i}: {len(clusters.values())}")
            for j, val in enumerate(clusters.values()):
                idx = np.array(list(val), dtype=int)
                ax[k].scatter(positions[idx, 0],
                              positions[idx, 1],
                              s=100, marker="o", color=colors[j], zorder=-10)
            if self.tmd_protein is not None:
                label_length += len(self.tmd_protein["0"])
                for protein in self.tmd_protein["0"]:
                    ax[k].scatter(protein[0], protein[1], s=100, marker="^", color="black", label="TMD Protein")
            ax[k].set_xticks([])
            ax[k].set_yticks([])
            ax[k].set_aspect("equal")

        handles, labels = ax[0].get_legend_handles_labels()
        plt.figlegend(handles=handles, labels=labels, ncol=label_length, loc='lower center', fontsize=15, frameon=False)
        ax[0].set_title("d", fontsize=20, fontweight="bold", loc="left")

        ax[0].set_title(label=f"Frame {frame_list[0]}", fontsize=18)
        ax[1].set_title(label=f"Frame {frame_list[1]}", fontsize=18)
        ax[2].set_title(label=f"Frame {frame_list[2]}", fontsize=18)
        if self.save_plots:
            plt.savefig("d.pdf")
        plt.show()



[docs]
    def result_clustering(self):
        """
        Runs hierarchical clustering for each frame and saves result
        """
        self.results["Clustering"] = {'0': {}, '1': {}}

        # Iterate over all frames
        for i in tqdm(range(self.n_frames), total=self.n_frames):

            # Iterate over both leaflets
            for j, leaflet_ in enumerate(['upper', 'lower']):
                # Get order states
                order_states_leaf = self.get_leaflet_step_order(j, i)

                core_lipids = self.assign_core_lipids(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i],
                                                      g_star_i_f=self.results['Getis_Ord'][j][f'g_star_i_{j}'][i],
                                                      order_states_f=order_states_leaf,
                                                      w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i],
                                                      z_score=self.results["z_score"][j])

                clusters = self.hierarchical_clustering(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i],
                                                        w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i],
                                                        core_lipids=core_lipids)
                frame_number = self.start + i * self.step
                self.results["Clustering"][str(j)][frame_number] = list(clusters.values())



[docs]
    @staticmethod
    def assign_core_lipids(weight_matrix_f, g_star_i_f, order_states_f, w_ii_f, z_score):

        """
        Assign lipids as core members (aka lipids with a high positive autocorrelation)
        depending on the Getis-Ord spatial local autocorrelation statistic.

        Parameters
        ----------
        weight_matrix_f : numpy.ndarray
            Matrix containing the weight factors between all lipid pairs at one time step
        g_star_i_f : numpy.ndarray
            Getis-Ord spatial local autocorrelation statistic for every lipid at one time step
        order_states_f: numpy.ndarray
            Order states for every lipid at one time step
        w_ii_f: numpy.ndarray
            Self-influence weight factor for every lipid at one time step
        z_score: dict
            Contains boundary of the reaction region


        Returns
        -------
        core_lipids : numpy.ndarray (bool)
           Contains a TRUE value if the lipid is a core member, otherwise it FALSE
        """

        # Define boundary of the reaction region
        z1_a = z_score["z1_a"]
        z_a = z_score["z_a"]

        # Assign core members according to their auto-correlation
        core_lipids = g_star_i_f > z1_a

        # Assign lipids with a mid-range auto-correlation (-z_1-a, z_1-a) * Ordered
        low_corr = (g_star_i_f <= z1_a) & (g_star_i_f >= z_a) & (order_states_f == 1)

        # Add iteratively new lipids to the core members
        n_cores_old = np.inf
        n_cores_new = np.sum(core_lipids)

        # Iterate until self-consistency is reached
        while n_cores_old != n_cores_new:
            # Check how tightly the lipids are connected to the core members
            new_core_lipids = (weight_matrix_f[core_lipids].sum(0) > w_ii_f) & low_corr

            # Assign lipids to core members if condition is full-filled
            core_lipids[new_core_lipids] = True

            # Update number of core lipids
            n_cores_old = n_cores_new
            n_cores_new = np.sum(core_lipids)

        return core_lipids



[docs]
    @staticmethod
    def hierarchical_clustering(weight_matrix_f, w_ii_f, core_lipids):

        """
        Hierarchical clustering approach to identify spatial related Lo domains.

        Parameters
        ----------
        weight_matrix_f : numpy.ndarray
            Matrix containing the weight factors between all lipid pairs at one time step
        w_ii_f: numpy.ndarray
            Self-influence weight factor for every lipid at one time step
        core_lipids : numpy.ndarray
            Array contains information which lipid is assigned as core member

        Returns
        -------
        clusters : dict
            The dictionary contains each found cluster

        """

        # Merge iteratively clusters
        n_clusters_old = np.inf
        n_clusters_new = np.sum(core_lipids)

        # Get the indices of the core lipids
        core_lipids_id = np.where(core_lipids)[0]

        # Store clusters in a Python dictionary
        # Initialize all core lipids as clusters
        clusters = dict(
            zip(
                core_lipids_id.astype("U"),
                [[id] for id in core_lipids_id]
            )
        )

        # Iterate until self-consistency is reached
        while n_clusters_old != n_clusters_new:

            # Get a list of the IDs of current clusters
            cluster_ids = list(clusters.keys())

            # Iterate over all clusters i
            for i, id_i in enumerate(cluster_ids):

                # If cluster i was already merged and deleted, skip it!
                if id_i not in clusters.keys():
                    continue

                # The cluster weights are defined as the sum
                # over the weights of all lipid members
                cluster_weights_i = np.sum(weight_matrix_f[clusters[id_i]], axis=0)

                # Compare cluster weights to the self-influence of each lipid
                merge_condition_i = cluster_weights_i > w_ii_f

                # Iterate over all clusters j
                for id_j in cluster_ids[(i + 1):]:

                    # Do not merge a cluster with itself
                    if id_i == id_j:
                        continue
                    # If cluster j was already merged and deleted, skip it!
                    if id_j not in clusters.keys():
                        continue

                    # Calculate cluster weights and compare to lipids self-influence
                    cluster_weights_j = np.sum(weight_matrix_f[clusters[id_j]], axis=0)
                    merge_condition_j = cluster_weights_j > w_ii_f

                    # If the condition is fullfilled for any lipid -> Merge the clusters
                    if np.any(merge_condition_i[clusters[id_j]]) or np.any(merge_condition_j[clusters[id_i]]):
                        # Merge cluster j into cluster i
                        clusters[id_i] += clusters[id_j]

                        # Delete cluster j from the cluster dict
                        del clusters[id_j]

            # Update cluster numbers
            n_clusters_old = n_clusters_new
            n_clusters_new = len(clusters.keys())

        return clusters


    # ------------------------------ HELPER FUNCTIONS ---------------------------------------------------------------- #

[docs]
    def get_leaflet_step_order(self, leaflet, step):
        """
        Receive residue's order state with respect to the leaflet

        Parameters
        ----------
        leaflet : numpy.ndarray
            leaflet index
        step: numpy.ndarray
            step index

        Returns
        -------
        order_states : numpy.ndarray
            Numpy array contains order state results of the leaflet at step in order of system's residues
        """

        #Init two empty lists for ...
        temp = [] #... order states prediction
        idxs = [] #... indices of lipids
        
        #Iterate over lipids
        for res, data in self.results.train_data_per_type.items():

            #Get indices from resids (e.g. resid 1, is index 0)
            idx = self.get_residue_idx(self.resids_index_map, data[0])

            #Store the indices of the residues in the current leaflet at the current step
            idxs.append(idx[self.leaflet_assignment[idx, step] == leaflet])

            #Get the predicted HMM order states for the residues in the current leaflet at the current step
            temp.append(self.results["HMM_Pred"][res][:, step][self.leaflet_assignment[idx, step] == leaflet])

        #Sort the obtained indices
        sorted_idxs = np.argsort( np.concatenate(idxs) )

        #It is not always the case that the lipid order is equal (e.g., depending on resids of cholesterol for example). Here the order states are sorted
        #so that they correspond to the resids in the leaflet selection -> With that they should fit to the order of the lipids in the weight matrix
        order_states = np.concatenate(temp)[sorted_idxs]

        return order_states



    


[docs]
    def get_leaflet_step_order_index(self, leaflet, step):
        """
        Receive residue's indexes and positions for a specific leaflet at any frame of the trajecytory.

        Parameters
        ----------
        leaflet : numpy.ndarray
            leaflet index
        step: numpy.ndarray
            step index

        Returns
        -------
        indexes : dict
            dictionary contains numpy.arrays containing residue's indexes for each unique residue type
        positions : dict
            dictionary contains numpy.arrays containing residue's positions for each unique residue type
        """

        leaflet_assignment_step = self.leaflet_assignment[:, step]
        leaflet_assignment_mask = leaflet_assignment_step == leaflet

        indexes = {}
        positions = {}

        # Go to correct frame of the trajectory
        self.universe.trajectory[self.start:self.stop:self.step][step]

        for res in self.unique_resnames:

            indexes[res] = np.where(self.membrane.residues[leaflet_assignment_mask].resnames == res)[0]

            if res in self.heads.keys():
                positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms(
                    f"resname {res} and name {self.heads[res]}")).positions
            else:
                positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms(
                    f"resname {res} and name {self.sterol_heads[res]}")).positions

        return indexes, positions