Source code for domhmm.analysis.domhmm

"""
PropertyCalculation --- :mod:`domhmm.analysis.PropertyCalculation`
===========================================================

This module contains the :class:`PropertyCalculation` class.

"""

import logging as log
import sys

import matplotlib.pyplot as plt
import numpy as np
from hmmlearn.hmm import GaussianHMM
from scipy.sparse import csr_array
from scipy.spatial import Voronoi, ConvexHull
from scipy import stats
from sklearn import mixture
from tqdm import tqdm

from .base import LeafletAnalysisBase



[docs]
class PropertyCalculation(LeafletAnalysisBase):
    """
    The DirectorOrder class calculates an order parameter for each selected lipid according to the formula:

        P2 = 0.5 * (3 * cos(a)^2 - 1), (1)

    where an is the angle between a pre-defined director in the lipid (e.g. C-H or CC) and a reference axis.

    """

    def _prepare(self):
        """
        Prepare results array before the calculation of the height spectrum begins

        Attributes
        ----------
        """
        if self.verbose:
            log.basicConfig(level=log.INFO, stream=sys.stdout)
        log.info("Preparation Step")
        # Initialize result storage dictionaries
        self.results.train_data_per_type = {}
        self.results.GMM = {}
        self.results.HMM = {}
        self.results.HMM_Pred = {}
        self.results.Getis_Ord = {}
        self.max_tail_len = -1
        for _, each in self.tails.items():
            if len(each) > self.max_tail_len:
                self.max_tail_len = len(each)
        # Total number of parameters are area per lipid + order parameter for each tail = max_tail_len + 1
        self.results.train = np.zeros(
            (len(self.membrane_unique_resids), self.n_frames, 1 + self.max_tail_len + len(self.sterol_heads)),
            dtype=np.float32)
        # Initalize weight matrix storage for each leaflet.
        setattr(self.results, "upper_weight_all", [])
        setattr(self.results, "lower_weight_all", [])

        # Initialized leaflet assignment array for each frame
        self.leaflet_assignment = np.zeros((len(self.membrane_unique_resids), self.n_frames), dtype=np.int32)


[docs]
    @staticmethod
    def calc_order_parameter(chain):

        """
        Calculate average Scc order parameters per acyl chain according to the equation:

        S_cc = (3 * cos( theta )^2 - 1) / 2,

        where theta describes the angle between the z-axis of the system and the vector between two subsequent tail
        beads.

        Parameters
        ----------
        chain : Selection
            MDAnalysis Selection object

        Returns
        -------
        s_cc : numpy.ndarray
            Mean S_cc parameter for the selected chain of each residue
        """

        # Separate the coordinates according to their residue index
        ridx = np.where(np.diff(chain.resids) > 0)[0] + 1

        pos = np.split(chain.positions, ridx)

        # Calculate the normalized orientation vector between two subsequent tail beads
        vec = [np.diff(pos_i, axis=0) for pos_i in pos]

        vec_norm = [np.sqrt((vec_i ** 2).sum(axis=-1)) for vec_i in vec]

        vec = [vec_i / vec_norm_i.reshape(-1, 1) for vec_i, vec_norm_i in zip(vec, vec_norm)]
        # TODO z axis multiplication inside. It needs to be changed in future work
        # Choose the z-axis as membrane normal and take care of the machine precision
        dot_prod = [np.clip(vec_i[:, 2], -1., 1.) for vec_i in vec]

        # Calculate the order parameter
        s_cc = np.array([np.mean(0.5 * (3 * dot ** 2 - 1)) for dot in dot_prod])

        return s_cc



[docs]
    def order_parameter(self):
        """
        Calculation of scc order parameter for each chain of each residue.
        """
        # Iterate over each tail with chain id
        for chain, tail in self.resid_tails_selection.items():
            # SCC calculation
            s_cc = self.calc_order_parameter(tail)
            idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids))
            self.results.train[idx, self.index, 1 + chain] = s_cc
        for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
            s_cc = self.calc_order_parameter(tail)
            idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids))
            self.results.train[idx, self.index, 1 + self.max_tail_len + i] = s_cc



[docs]
    @staticmethod
    def area_per_lipid_vor(coor_xy, boxdim, frac):

        """
        Calculation of the area per lipid employing Voronoi tessellation on coordinates mapped to the xy plane.
        The function takes also the periodic boundary conditions of the box into account.

        Parameters
        ----------
        coor_xy : numpy.ndarray
            Coordinates of upper/lower leaflet
        boxdim : array
            Length of box vectors in all directions
        frac : float
            Fraction of box length in x and y outside the unit cell considered for Voronoi calculation 

        Returns
        -------
        vor : Voronoi Tesselation
            Scipy's Voronoi Diagram object
        apl : Area per lipid
            Area per lipid based on Scipy's Voronoi Diagram
        pbc_idx : Indices
            Unit cell indices for periodic image coordinates
        """

        # Number of points in the plane
        ncoor = coor_xy.shape[0]
        bx = boxdim[0]
        by = boxdim[1]
        # Create periodic images of the coordinates
        # to take periodic boundary conditions into account

        # Store coordinates of periodic images
        pbc = np.zeros((9 * ncoor, 2), dtype=np.float32)
        # Store unit cell indices for coordinates of periodic images
        pbc_idx = np.arange(9 * ncoor, dtype=np.int64) % ncoor

        # Iterate over all possible periodic images
        k = 0
        for i in [0, -1, 1]:
            for j in [0, -1, 1]:
                # Multiply the coordinates in a direction
                pbc[k * ncoor: (k + 1) * ncoor, 0] = coor_xy[:, 0] % bx + i * bx
                pbc[k * ncoor: (k + 1) * ncoor, 1] = coor_xy[:, 1] % by + j * by

                k += 1

        # Create a boolean mask for positions within the unit cell and a smaller fraction of positions outside the
        # unit cell
        # Check along x-axis
        f0 = pbc[:, 0] >= -frac * bx
        f1 = pbc[:, 0] <= bx + frac * bx

        # Check along y-axis
        f2 = pbc[:, 1] >= -frac * by
        f3 = pbc[:, 1] <= by + frac * by

        # Merge the four masks together
        mask = f0 * f1 * f2 * f3

        # Filter positions and unit cell indices of all periodic images
        pbc = pbc[mask]
        pbc_idx = pbc_idx[mask]

        # Call scipy's Voronoi implementation
        # There is the (rare!) possibility that two points have the exact same xy positions,
        # to prevent issues at further calculation steps, the qhull_option "QJ" was employed to introduce small random
        # displacement of the points to resolve these issue.

        # IMPORTANT: The use of "QJ" makes the resulting Voronoi diagram depending on frac. Values for ridge lengths
        # can vary!
        vor = Voronoi(pbc, qhull_options="QJ")

        # Iterate over all members of the unit cell and calculate their occupied area
        apl = np.array([ConvexHull(vor.vertices[vor.regions[vor.point_region[i]]]).volume for i in range(ncoor)])

        return vor, apl, pbc_idx



[docs]
    @staticmethod
    def weight_matrix(vor, pbc_idx, coor_xy):

        """
        Calculate the weight factors between neighbored lipid pairs based on a Voronoi tessellation.

        Parameters
        ----------
        vor : Voronoi Tesselation
            Scipy's Voronoi Diagram object
        pbc_idx : Indices
            Unit cell indices of periodic image coordinates
        coor_xy : numpy.ndarray
            Coordinates of upper/lower leaflet

        Returns
        -------
        weight_matrix : numpy.ndarray
            Weight factors wij between neighbored lipid pairs.
            Is 0 if lipids are not directly neighbored.
        """

        # Number of points in the plane
        ncoor = coor_xy.shape[0]

        # Calculate the distance for all pairs of points between which a ridge exists
        dij = vor.points[vor.ridge_points[:, 0]] - vor.points[vor.ridge_points[:, 1]]
        dij = np.sqrt(np.sum(dij ** 2, axis=1))

        # There is the (rare!) possibility that two points have the exact same xy positions,
        # to prevent issues at further calculation steps, their distance is set to a very small
        # distance of 4.7 Angstrom (2 times the VdW radius of a regular bead in MARTINI3)
        dij[dij < 1E-5] = 4.7

        # Calculate the distance for all pairs of vertices connected via a ridge
        vert_idx = np.array(vor.ridge_vertices)
        bij = vor.vertices[vert_idx[:, 0]] - vor.vertices[vert_idx[:, 1]]
        bij = np.sqrt(np.sum(bij ** 2, axis=1))

        # INFO: vor.ridge_points and vor.ridge_vertices should be sorted -> Check vor.ridge_dict

        # Calculate weight factor
        wij = bij / dij

        # Set up an empty array to store the weight factors for each lipid
        weight_matrix = np.zeros((ncoor, ncoor))

        # Select all indices of ridges that contain members of the unit cell
        mask_unit_cell = np.logical_or(vor.ridge_points[:, 0] < ncoor, vor.ridge_points[:, 1] < ncoor)

        # Apply the modulus operator since some of the indices in "unit_cell_point" will point to coordinates outside
        # the unit cell. Applying the modulus operator "%" will allow an indexing of the "weight_matrix". However, some
        # of the indices in "unit_cell_point" will be doubled that shouldn't be an issue since the same weight factor is
        # then just put several times in the same entry of the array (no summing or something similar!)
        # Previous: unit_cell_point = vor.ridge_points[mask_unit_cell] % ncoor

        # Transform the indices in vor.ridge_points back to unit cell indices -> Filter than for all indices of
        # ridges that contain members of the unit cell
        unit_cell_point = pbc_idx[vor.ridge_points][mask_unit_cell]

        weight_matrix[unit_cell_point[:, 0], unit_cell_point[:, 1]] = wij[mask_unit_cell]
        weight_matrix[unit_cell_point[:, 1], unit_cell_point[:, 0]] = wij[mask_unit_cell]
        return weight_matrix


    def _single_frame(self):
        """
        Calculate data from a single frame of the trajectory.
        """

        # Get number of frame from trajectory
        self.frame = self.universe.trajectory.ts.frame
        # Calculate correct index if skipping step not equals 1 or start point not equals 0
        self.index = ( self.frame - self.start ) // self.step

        # Update leaflet assignment (if leaflet_frame_rate is None, leaflets will never get updated during analysis)
        if self.leaflet_frame_rate is not None and not self.index % self.leaflet_frame_rate:
            # Call leaflet assignment functions for non-sterol and sterol compounds
            self.leaflet_selection_no_sterol = self.get_leaflets()
            self.leaflet_selection = self.get_leaflets_sterol()

            # Write assignments to array
            assignment_index = int(self.index / self.leaflet_frame_rate)
            start_index = assignment_index * self.leaflet_frame_rate
            end_index = (assignment_index + 1) * self.leaflet_frame_rate
            if end_index > self.leaflet_assignment.shape[1]:
                end_index = self.leaflet_assignment.shape[1]

            self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids)
            self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids)

            self.leaflet_assignment[self.lidx, start_index:end_index] = 1
            self.leaflet_assignment[self.uidx, start_index:end_index] = 0
            self.leaflet_assignment[self.lidx, start_index:end_index] = 1

        # Update sterol assignment. Don't do the update if it was already done in the if-statement before
        if not self.index % self.sterol_frame_rate and (
                self.leaflet_frame_rate is None or self.index % self.leaflet_frame_rate):
            # Call leaflet assignment function for sterol compounds
            self.leaflet_selection = self.get_leaflets_sterol()

            # Write assignments to array
            assignment_index = int(self.index / self.sterol_frame_rate)
            start_index = assignment_index * self.sterol_frame_rate
            end_index = (assignment_index + 1) * self.sterol_frame_rate
            if end_index > self.leaflet_assignment.shape[1]:
                end_index = self.leaflet_assignment.shape[1]

            self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids)
            self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids)

            self.leaflet_assignment[self.uidx, start_index:end_index] = 0
            self.leaflet_assignment[self.lidx, start_index:end_index] = 1

        if self.leaflet_selection["0"].select_atoms("group leaf1", leaf1=self.leaflet_selection["1"]):
            raise ValueError("Atoms in both leaflets !")

        # ------------------------------ Local Normals/Area per Lipid ------------------------------------------------ #
        boxdim = self.universe.trajectory.ts.dimensions[0:3]
        upper_coor_xy = self.leaflet_selection[str(0)].positions
        lower_coor_xy = self.leaflet_selection[str(1)].positions
        # Check Transmembrane domain existence
        if self.tmd_protein is not None:
            tmd_upper_coor_xy = self.tmd_protein["0"]
            # Check if dimension of coordinates is same in both array
            if tmd_upper_coor_xy.shape[1] == upper_coor_xy.shape[1]:
                upper_coor_xy = np.append(upper_coor_xy, tmd_upper_coor_xy, axis=0)
            tmd_lower_coor_xy = self.tmd_protein["1"]
            # Check if dimension of coordinates is same in both array
            if tmd_lower_coor_xy.shape[1] == lower_coor_xy.shape[1]:
                lower_coor_xy = np.append(lower_coor_xy, tmd_lower_coor_xy, axis=0)
        upper_vor, upper_apl, upper_pbc_idx = self.area_per_lipid_vor(coor_xy=upper_coor_xy, boxdim=boxdim,
                                                                      frac=self.frac)
        lower_vor, lower_apl, lower_pbc_idx = self.area_per_lipid_vor(coor_xy=lower_coor_xy, boxdim=boxdim,
                                                                      frac=self.frac)
        if self.tmd_protein is not None:
            # Remove TMD Protein numbers from training data
            upper_apl = np.array(upper_apl[:-len(tmd_upper_coor_xy)])
            lower_apl = np.array(lower_apl[:-len(tmd_lower_coor_xy)])
        self.results.train[self.uidx, self.index, 0] = upper_apl
        self.results.train[self.lidx, self.index, 0] = lower_apl
        # TODO Local normal calculation

        # ------------------------------ Order parameter ------------------------------------------------------------- #
        self.order_parameter()
        # ------------------------------ Weight Matrix --------------------------------------------------------------- #
        upper_weight_matrix = self.weight_matrix(upper_vor, pbc_idx=upper_pbc_idx, coor_xy=upper_coor_xy)
        lower_weight_matrix = self.weight_matrix(lower_vor, pbc_idx=lower_pbc_idx, coor_xy=lower_coor_xy)
        if self.tmd_protein is not None:
            # Remove TMD Protein numbers from weight matrix
            upper_weight_matrix = upper_weight_matrix[:-len(tmd_upper_coor_xy),:-len(tmd_upper_coor_xy)]
            lower_weight_matrix = lower_weight_matrix[:-len(tmd_lower_coor_xy),:-len(tmd_lower_coor_xy)]

        # Keep weight matrices in scipy.sparse.csr_array format since both is sparse matrices
        self.results["upper_weight_all"].append(csr_array(upper_weight_matrix))
        self.results["lower_weight_all"].append(csr_array(lower_weight_matrix))

    def _conclude(self):
        """
        Calculate the final results of the analysis

        Extract the obtained data and put them into a clear and accessible data structure
        """
        log.info("Conclusion step is starting.")
        self.prepare_train_data()
        # -------------------------------------------------------------

        # Check for user-provided HMMs
        if not any(self.trained_hmms):
            # No user-provided HMMs, perform usual workflow
            log.info("Gaussian Mixture Model training is starting.")
            self.GMM(gmm_kwargs=self.gmm_kwargs)

            log.info("Hidden Markov Model training is starting.")
            self.HMM(hmm_kwargs=self.hmm_kwargs)
        else:
            # User-provided HMMs found, use them!
            log.info("No Gaussian Mixture Model is trained.")
            log.info("No Hidden Markov Model is trained. Instead use the user-provided HMMs.")

            # Use the provided dictionary directly, the checks for validity were already done before
            self.results['HMM'] = self.trained_hmms

            # Same workflow as in self.HMM()
            if self.result_plots:
                # Plot result of hmm
                self.plot_hmm_result()

            # Make predictions based on HMM model
            self.predict_states()

            # Validate states and result prediction
            self.state_validate()
            if self.result_plots:
                # Plot prediction result
                self.predict_plot()

        log.info("Getis-Ord Statistic calculation is starting.")

        self.getis_ord()

        #The user argument do_clustering decides if the hierarchical clustering is (not) performed
        if self.do_clustering == False: pass
        else:
            log.info("Clustering is starting.")
            self.result_clustering()
            
            if self.result_plots:
                self.clustering_plot()

        print("It's done! DomHMM finished successfully. Have a nice day and enjoy your data! :-)")


[docs]
    def prepare_train_data(self):
        """
        Prepare train data for GMM and HMM while separating self.results.train with respect to each residue
        """
        self.results.train_data_per_type = {}
        resid_dict = {}
        for resname in self.unique_resnames:
            idx = self.get_residue_idx(self.resids_index_map, self.residue_ids[resname])
            resid_dict[resname] = idx
            self.results.train_data_per_type[f"{resname}"] = [[] for _ in range(3)]
        if self.asymmetric_membrane:
            # For asymmetric membranes, models will be trained for each leaflets' each residue type
            leaflet_residx = {}
            # leaflet_train_residx contains residue indexes that are not too flip-floppy for model training
            leaflet_train_residx = {}
            # Save each residues indexes with respect to leaflet assignment
            for resname, idx in resid_dict.items():
                leaflet_assign = self.leaflet_assignment[idx]
                leaflet_train_residx[resname] = {0: [], 1: []}
                leaflet_residx[resname] = {0: [], 1: []}
                for i in range(len(leaflet_assign)):
                    if (leaflet_assign[i] == 0).all():
                        leaflet_train_residx[resname][0].append(idx[i])
                        leaflet_residx[resname][0].append(idx[i])
                    elif (leaflet_assign[i] == 1).all():
                        leaflet_train_residx[resname][1].append(idx[i])
                        leaflet_residx[resname][1].append(idx[i])
                    else:
                        # If a residue is %80 of time belongs to a leaflet, accept it as residue of that leaflet
                        lower_leaflet_percentage = len(np.nonzero(leaflet_assign[i] == 1)[0]) / len(leaflet_assign[i])
                        if lower_leaflet_percentage >= 0.8:
                            leaflet_train_residx[resname][1].append(idx[i])
                        elif lower_leaflet_percentage <= 0.2:
                            leaflet_train_residx[resname][0].append(idx[i])
                        if lower_leaflet_percentage > 0.5:
                            leaflet_residx[resname][1].append(idx[i])
                        else:
                            leaflet_residx[resname][0].append(idx[i])
            self.leaflet_residx = leaflet_residx
            # Prepare rest of train data for each residue
            for resname, tails in self.tails.items():
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                # Select columns of area per lipid and tails' scc parameters
                residx = leaflet_train_residx[resname]
                upper_leaflet_data = self.results.train[residx[0]][:, :, 0:len(tails) + 1]
                lower_leaflet_data = self.results.train[residx[1]][:, :, 0:len(tails) + 1]
                self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data]
                # TODO idx might referenced before assignment ?
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = self.get_residue_idx(self.resids_index_map, rsn_ids)
                # Select columns of area per lipid and sterol's scc parameter
                residx = leaflet_train_residx[resname]
                upper_leaflet_data = self.results.train[residx[0]][:, :, [0, 1 + self.max_tail_len + i]]
                lower_leaflet_data = self.results.train[residx[1]][:, :, [0, 1 + self.max_tail_len + i]]
                self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]
        else:
            # For symmetric membranes, models will be trained for each residue type
            for resname, tails in self.tails.items():
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = resid_dict[resname]
                # Select columns of area per lipid and tails' scc parameters
                self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :, 0:len(tails) + 1]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                rsn_ids = self.residue_ids[resname]
                self.results.train_data_per_type[resname][0] = rsn_ids
                idx = resid_dict[resname]
                # Select columns of area per lipid and sterol's scc parameter
                self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :,
                                                               [0, 1 + self.max_tail_len + i]]
                self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]


    # ------------------------------ FIT GAUSSIAN MIXTURE MODEL ------------------------------------------------------ #

[docs]
    def GMM(self, gmm_kwargs):
        """
        Fit Gaussian Mixture

        Parameters
        ----------
        gmm_kwargs : dict
            Additional parameters for mixture.GaussianMixture
        """
        # Iterate over each residue and implement gaussian mixture model
        for res, data in self.results.train_data_per_type.items():
            if self.asymmetric_membrane:
                temp_dict = {}
                for leaflet in range(2):
                    gmm_data = data[1][leaflet]
                    if len(gmm_data) == 0:
                        temp_dict[leaflet] = None
                    else:
                        gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit(
                            gmm_data.reshape(-1, gmm_data.shape[2]))
                        temp_dict[leaflet] = gmm
                    log.info(f"Leaflet {leaflet}, {res} Gaussian Mixture Model is trained.")
                self.results["GMM"][res] = temp_dict
            else:
                gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit(data[1].reshape(-1, data[1].shape[2]))
                self.results["GMM"][res] = gmm
                log.info(f"{res} Gaussian Mixture Model is trained.")

        # Check for convergence
        for resname, each in self.results["GMM"].items():
            if self.asymmetric_membrane:
                if each[0] is not None and not each[0].converged_:
                    log.warning(f"{resname} lower leaflet Gaussian Mixture Model is not converged.")
                if each[1] is not None and not each[1].converged_:
                    log.warning(f"{resname} upper leaflet Gaussian Mixture Model is not converged.")
            else:
                if not each.converged_:
                    log.warning(f"{resname} Gaussian Mixture Model is not converged.")               

            


[docs]
    def mixture_plot(self, resname, gmm_model, hmm_model, leaflet = None):

        """
        Plot results of the Gaussian Mixture Model for each residue type.
        The function should help to spot flaws in the results of the Gaussian Mixture Model.

        The Gaussian Mixture Model in DomHMM is trained on a three-dimensional space (1 APL + 2 Acyl chains), hence also
        its ouput (mean vectors and covariance matrices) are three0-dimensional. However, plots in 3 dimensions are rather hard 
        to understand, therefore we plot here the projections of the raw/fitted probability densities on the xz, yz, and xy plane.

        Parameters
        ----------
        resname : str
            Resname of the actual residue type
        gmm_model : GaussianMixture Model
            Scikit-learn object
        hmm_model : Hidden Markow Model
            hmmlearn object
        leaflet : int or None
            If membrane is asymmetric ensure that plots are made for upper and lower leaflet
        """
        
        #Define parameters that define a grid for the calculations of the fitted distributions
        #The chosen values should cover a wide range of different use cases as long as enough sample data is provided!
        MIN_APL, MAX_APL, STEP_APL = 0, 200, .5
        MIN_SCC, MAX_SCC, STEP_SCC = -.55, 1.05, .05
        
        x01, y01 = np.mgrid[MIN_APL:MAX_APL:STEP_APL, MIN_SCC:MAX_SCC:STEP_SCC]
        x2, y2 = np.mgrid[MIN_SCC:MAX_SCC:STEP_SCC, MIN_SCC:MAX_SCC:STEP_SCC]

        xy01 = np.dstack((x01, y01))
        xy2 = np.dstack((x2, y2))

        #Define colors for the markers that mark the fitted means of the Gaussians
        cx = ["crimson","cyan"]
        cx_hmm = ["orange", "hotpink"]

        #Get trainings data for this type lipid and check if its asymmetric
        train_data_per_type = self.results["train_data_per_type"][resname][1]
        
        if leaflet != None: train_data_per_type = train_data_per_type[ leaflet ]
        else: pass

        #Joint distributions for current lipid type

        cm =1/2.54

        #Do the plot for non-sterolic lipid types
        if resname not in self.sterol_heads.keys():
            
            #Init suplots
            fig, ax = plt.subplots(1, 3, figsize = (35*cm, 10*cm))

            #Area per Lipid - Scc Chain A
            ax[0].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 0],
                         y = train_data_per_type.reshape(-1, 3)[:, 1],
                         bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))],
                         density = True, cmap="viridis")

            #Area per Lipid - Scc Chain B
            ax[1].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 0],
                         y = train_data_per_type.reshape(-1, 3)[:, 2],
                         bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))],
                         density = True, cmap="viridis")

            #Scc Chain A - Scc Chain B
            ax[2].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 1],
                         y = train_data_per_type.reshape(-1, 3)[:, 2],
                         bins = [np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 ))), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))],
                         density = True, cmap="viridis")
        
            #Calculate and display fitted distributions
            #Iterate over both Gaussian distributions
            for i in range(2):

                #-------------------------------------------------------GMM-------------------------------------------------------

                #Area per Lipid (0) - Scc Chain A (1)
                rv0 = stats.multivariate_normal(gmm_model.means_[i][0:2], gmm_model.covariances_[i][:2,:2])
                z0 = rv0.pdf(xy01)

                #Area per Lipid (0) - Scc Chain B (2)
                rv1 = stats.multivariate_normal(gmm_model.means_[i][::2], gmm_model.covariances_[i][::2, ::2])
                z1 = rv1.pdf(xy01)

                #Scc Chain A (1) - Scc Chain B (2)
                rv2 = stats.multivariate_normal(gmm_model.means_[i][1:], gmm_model.covariances_[i][1:, 1:])
                z2 = rv2.pdf(xy2)

                #Plot contours -> Be aware that the first two plots share the same ranges
                ax[0].contour(x01, y01, z0, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25)
                ax[1].contour(x01, y01, z1, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25)
                ax[2].contour(x2, y2, z2, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25)

                del rv0
                del rv1
                del rv2
                del z0
                del z1
                del z2

                #Mark with a cross the means of the fitted Gaussians
                ax[0].scatter( gmm_model.means_[i][0], gmm_model.means_[i][1], marker = "x", s = 100, zorder = 100, color=cx[i])
                ax[1].scatter( gmm_model.means_[i][0], gmm_model.means_[i][2], marker = "x", s = 100, zorder = 100, color=cx[i], label = f"GMM Mean {i}")
                ax[2].scatter( gmm_model.means_[i][1], gmm_model.means_[i][2], marker = "x", s = 100, zorder = 100, color=cx[i])


                #-------------------------------------------------------HMM-------------------------------------------------------

                #Area per Lipid (0) - Scc Chain A (1)
                rv0 = stats.multivariate_normal(hmm_model.means_[i][0:2], hmm_model.covars_[i][:2,:2])
                z0 = rv0.pdf(xy01)

                #Area per Lipid (0) - Scc Chain B (2)
                rv1 = stats.multivariate_normal(hmm_model.means_[i][::2], hmm_model.covars_[i][::2, ::2])
                z1 = rv1.pdf(xy01)

                #Scc Chain A (1) - Scc Chain B (2)
                rv2 = stats.multivariate_normal(hmm_model.means_[i][1:], hmm_model.covars_[i][1:, 1:])
                z2 = rv2.pdf(xy2)

                #Plot contours -> Be aware that the first two plots share the same ranges
                ax[0].contour(x01, y01, z0, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25)
                ax[1].contour(x01, y01, z1, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25)
                ax[2].contour(x2, y2, z2, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25)

                del rv0
                del rv1
                del rv2
                del z0
                del z1
                del z2

                #Mark with a circle the means of the fitted Gaussians from the HMM
                ax[0].scatter( hmm_model.means_[i][0], hmm_model.means_[i][1], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i])
                ax[1].scatter( hmm_model.means_[i][0], hmm_model.means_[i][2], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i], label = f"HMM Mean {i}")
                ax[2].scatter( hmm_model.means_[i][1], hmm_model.means_[i][2], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i])

            #Set ticks on the y axis
            for i in range(3): ax[i].set_yticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"])

            #Set ticks on the x axis
            ax[0].set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) )
            ax[1].set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) ) 
            ax[2].set_xticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"])

            #Label y axis
            yl0 = ax[0].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-2}})$", labelpad=-7,fontsize=18)
            yl1 = ax[1].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-1}})$", labelpad=-7,fontsize=18)
            yl2 = ax[2].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-1}})$", labelpad=-7,fontsize=18)

            #Label x axis
            xl0 = ax[0].set_xlabel(r"$p(a)$", labelpad=0,fontsize=18)
            xl1 = ax[1].set_xlabel(r"$p(a)$", labelpad=0,fontsize=18)
            xl2 = ax[2].set_xlabel(r"$p(\bar{S}_{CC}^{\text{sn-2}})$", labelpad=0,fontsize=18)

            #Make plot more readable
            for i in range(3):
                ax[i].tick_params(rotation=45)
                ax[i].grid(False)
                ax[i].tick_params(axis="both", labelsize=11)

            plt.subplots_adjust(hspace=0.1, wspace = 0.25)
            
            lg = ax[1].legend(loc = "upper center", bbox_to_anchor = ( 0.5, -0.25), fancybox = False, framealpha = 0.5, facecolor = "grey", ncols = 4)

            if leaflet != None: plt.savefig(f"GMM_{resname}_{leaflet}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl0, xl1, xl2, yl0, yl1, yl2, lg), bbox_inches = "tight")
            else: plt.savefig(f"GMM_{resname}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl0, xl1, xl2, yl0, yl1, yl2, lg), bbox_inches = "tight")

        #Do the plot for sterolic lipid types
        else:

            #Init only one subplot
            fig, ax = plt.subplots(1, 1, figsize = (15*cm, 15*cm))

            #Area per Lipid - Scc Chain A
            ax.hist2d(x = train_data_per_type.reshape(-1, 2)[:, 0],
                      y = train_data_per_type.reshape(-1, 2)[:, 1],
                      bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))],
                      density = True, cmap="viridis")


            #Calculate and display fitted distributions
            #Iterate over both Gaussian distributions
            for i in range(2):

                #-------------------------------------------------------GMM-------------------------------------------------------
                
                #Area per Lipid (0) - Scc Chain C (1)
                rv0 = stats.multivariate_normal(gmm_model.means_[i], gmm_model.covariances_[i])
                z0 = rv0.pdf(xy01)

                #Plot contours
                ax.contour(x01, y01, z0, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25)
                
                del rv0
                del z0

                #Mark with a cross the means of the fitted Gaussians
                ax.scatter( gmm_model.means_[i][0], gmm_model.means_[i][1], marker = "x", s = 100, zorder = 100, color=cx[i], label = f"GMM Mean {i}")

                #-------------------------------------------------------HMM-------------------------------------------------------

                #Area per Lipid (0) - Scc Chain C (1)
                rv0 = stats.multivariate_normal(hmm_model.means_[i], hmm_model.covars_[i])
                z0 = rv0.pdf(xy01)

                #Plot contours
                ax.contour(x01, y01, z0, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25)

                del rv0
                del z0

                #Mark with a circle the means of the fitted Gaussians from the HMM
                ax.scatter( hmm_model.means_[i][0], hmm_model.means_[i][1], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i], label = f"HMM Mean {i}")

            #Set ticks on the x and y axis
            ax.set_yticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"])
            ax.set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) )

            #Label the x and y axis
            yl = ax.set_ylabel(r"$p(P_2)$", labelpad=-7,fontsize=18)
            xl = ax.set_xlabel(r"$p(a)$", labelpad=0,fontsize=18)

            # Make plot more readable
            ax.tick_params(rotation=45)
            ax.grid(False)
            ax.tick_params(axis="both", labelsize=11)

            lg = ax.legend(loc = "upper center", bbox_to_anchor = ( 0.5, -0.25), fancybox = False, framealpha = 0.5, facecolor = "grey", ncols = 4)

            if leaflet != None: plt.savefig(f"GMM_{resname}_{leaflet}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl, yl, lg), bbox_inches = "tight")
            else: plt.savefig(f"GMM_{resname}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl, yl, lg), bbox_inches = "tight")

        plt.close()


    # ------------------------------ HIDDEN MARKOV MODEL ------------------------------------------------------------- #


[docs]
    def HMM(self, hmm_kwargs):
        """
        Create Gaussian based Hidden Markov Models for each residue type

        Parameters
        ----------
        hmm_kwargs : dict
            Additional parameters for hmmlearn.hmm.GaussianHMM
        """
        # Iterate over each residue and implement gaussian-hidden markov model
        for resname, data in self.results.train_data_per_type.items():
            if self.asymmetric_membrane:
                temp_dict = {}
                for leaflet in range(2):
                    gmm = self.results["GMM"][resname][leaflet]
                    if gmm is not None:
                        hmm_data = data[1][leaflet]
                        hmm = self.fit_hmm(data=hmm_data, gmm=gmm, hmm_kwargs=hmm_kwargs,
                                           n_repeats=self.n_init_hmm)
                        temp_dict[leaflet] = hmm
                    else:
                        temp_dict[leaflet] = None
                    log.info(f"Leaflet {leaflet}, {resname} Gaussian Hidden Markov Model is trained.")
                self.results["HMM"][resname] = temp_dict
            else:
                hmm = self.fit_hmm(data=data[1], gmm=self.results["GMM"][resname], hmm_kwargs=hmm_kwargs, n_repeats=self.n_init_hmm)
                self.results["HMM"][resname] = hmm
                log.info(f"{resname} Gaussian Hidden Markov Model is trained.")
        if self.result_plots:
            # Plot result of hmm
            self.plot_hmm_result()
        # Make predictions based on HMM model
        self.predict_states()
        # Validate states and result prediction
        self.state_validate()
        if self.result_plots:
            # Plot prediction result
            self.predict_plot()

            #Plot results of GMM and HMM

            #Iterate over fitted Gaussian Mixture Models
            for resname in self.results["GMM"].keys():

                gmm_trained = self.results["GMM"][resname]
                hmm_trained = self.results["HMM"][resname]
                
                if self.asymmetric_membrane:
                    
                    if gmm_trained[0] is not None and hmm_trained[0] is not None and gmm_trained[0].converged_ and hmm_trained[0].monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained[0], hmm_model = hmm_trained[0], leaflet = 0)
                    
                    if gmm_trained[1] is not None and hmm_trained[1] is not None and gmm_trained[1].converged_ and hmm_trained[1].monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained[1], hmm_model = hmm_trained[1], leaflet = 1)
                   
                else:
                    if gmm_trained.converged_ and hmm_trained.monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained, hmm_model = hmm_trained)





[docs]
    def fit_hmm(self, data, gmm, hmm_kwargs, n_repeats=10):

        """
        Fit several HMM models to the data and return the best one.

        Parameters
        ----------
        data : numpy.ndarray
            Data of the single lipid properties for one lipid type at each time step
        gmm : GaussianMixture Model
            Scikit-learn object
        hmm_kwargs: dict
            Additional parameters for Hidden Markov Model
        n_repeats : int
            Number of independent fits

        Returns
        -------
        best_ghmm : GaussianHMM
            hmmlearn object

        """
        best_ghmm = None
        # Specify the length of the sequence for each lipid
        n_lipids = data.shape[0]
        dim = data.shape[2]
        lengths = np.repeat(self.n_frames, n_lipids)

        # The HMM fitting is started multiple times from
        # different starting conditions

        # Initialize the best score with minus infinity
        best_score = -np.inf

        # Re-start the HMM fitting 10 times
        for i in tqdm(range(n_repeats)):

            # Initialize a HMM for one lipid type
            ghmm_i = GaussianHMM(n_components=2,
                                 means_prior=gmm.means_,
                                 covars_prior=gmm.covariances_,
                                 **hmm_kwargs)

            #Check whether an optimization of the mean vectors and the covariance matrices is requested
            #Optimization of means is not required  -> Take it from the Gaussian Mixture Model
            if "m" not in hmm_kwargs['params']: ghmm_i.means_ = gmm.means_
            #Optimization of covariances is not required -> Take it from the Gaussian Mixture Model
            if "c" not in hmm_kwargs['params']: ghmm_i.covars_ = gmm.covariances_

            # Train the HMM based on the data for every lipid and frame
            ghmm_i.fit(data.reshape(-1, dim),
                       lengths=lengths)

            # Obtain the log-likelihood
            # probability of the current model
            score_i = ghmm_i.score(data.reshape(-1, dim),
                                   lengths=lengths)

            # Check if the quality of the result improved
            if score_i > best_score:
                best_score = score_i
                best_ghmm = ghmm_i

            # Delete the current model
            del ghmm_i

        return best_ghmm



[docs]
    def plot_hmm_result(self):
        """
        Plots convergence of HMM model training process.
        """
        x_len = 0
        for resname, ghmm in self.results['HMM'].items():
            if self.asymmetric_membrane:
                for leaflet in range(2):
                    if ghmm[leaflet] is not None:
                        x_len = max(x_len, len(ghmm[leaflet].monitor_.history) - 1)
                        plt.semilogy(np.arange(len(ghmm[leaflet].monitor_.history) - 1),
                                     np.diff(np.array(ghmm[leaflet].monitor_.history)),
                                     ls="-", label=f"{resname}_{leaflet}", lw=2)
            else:
                x_len = max(x_len, len(ghmm.monitor_.history) - 1)
                plt.semilogy(np.arange(len(ghmm.monitor_.history) - 1), np.diff(np.array(ghmm.monitor_.history)),
                             ls="-", label=resname, lw=2)
        x_len = (x_len // 100 + 1) * 100
        plt.legend(fontsize=15)
        plt.semilogy(np.arange(x_len), np.repeat(1E-4, x_len), color="k", ls="--", lw=2)
        plt.xlim(0, x_len)
        plt.ylim(1E-5, 15E5)
        plt.ylabel(r"$\Delta(log(\hat{L}))$", fontsize=18)
        plt.xlabel("Iterations", fontsize=18)
        plt.tick_params(axis="both", labelsize=11)
        plt.text(s="Tolerance 1e-4", x=1, y=1E-4 + 0.00005, color="k", fontsize=15)
        plt.title("a", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("a.pdf")
        plt.show()



[docs]
    def predict_states(self):
        """
        Each residues prediction assignment to ordered or disordered domains.

        """
        if self.asymmetric_membrane:
            # Since we select stable lipids for training, we need all training data of lipids to predict order of
            # frequently flip-flop doing ones
            for resname, tails in self.tails.items():
                temp_array = []
                for leaflet in range(2):
                    idx = self.leaflet_residx[resname][leaflet]
                    data = self.results.train[idx][:, :, 0:len(tails) + 1]
                    shape = data.shape
                    hmm = self.results['HMM'][resname][leaflet]
                    if hmm is not None:
                        lengths = np.repeat(shape[1], shape[0])
                        prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0],
                                                                                                      shape[1])
                        prediction = self.hmm_diff_checker(hmm.means_, prediction)
                        temp_array.append([idx, prediction])
                    else:
                        temp_array.append([])

                # Leaflet checks in case lipid is only in one leaflet
                if len(temp_array[0]) == 0:
                    idx = np.array(temp_array[1][0]).argsort()
                    result = temp_array[1][1]
                elif len(temp_array[1]) == 0:
                    idx = np.array(temp_array[0][0]).argsort()
                    result = temp_array[0][1]
                else:
                    idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort()
                    result = np.concatenate((temp_array[0][1], temp_array[1][1]))
                result = result[idx]
                self.results['HMM_Pred'][resname] = result

            for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()):
                temp_array = []
                for leaflet in range(2):
                    idx = self.leaflet_residx[resname][leaflet]
                    data = self.results.train[idx][:, :, [0, 1 + self.max_tail_len + i]]
                    shape = data.shape
                    hmm = self.results['HMM'][resname][leaflet]
                    if hmm is not None:
                        lengths = np.repeat(shape[1], shape[0])
                        prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0],
                                                                                                      shape[1])
                        prediction = self.hmm_diff_checker(hmm.means_, prediction)
                        temp_array.append([idx, prediction])
                    else:
                        temp_array.append(None)

                # Leaflet checks in case lipid is only in one leaflet
                if temp_array[0] is None:
                    idx = np.array(temp_array[1][0]).argsort()
                    result = temp_array[1][1]
                elif temp_array[1] is None:
                    idx = np.array(temp_array[0][0]).argsort()
                    result = temp_array[0][1]
                else:
                    idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort()
                    result = np.concatenate((temp_array[0][1], temp_array[1][1]))
                result = result[idx]
                self.results['HMM_Pred'][resname] = result
        else:
            # Symmetric membrane case
            for resname, data in self.results.train_data_per_type.items():
                shape = data[1].shape
                hmm = self.results['HMM'][resname]
                # Lengths consists of number of frames and number of residues
                lengths = np.repeat(shape[1], shape[0])
                prediction = hmm.predict(data[1].reshape(-1, shape[2]), lengths=lengths).reshape(shape[0], shape[1])
                # Save prediction result of each residue
                self.results['HMM_Pred'][resname] = prediction



[docs]
    def state_validate(self):
        """
        Validate state assignments of HMM model by checking means of the model of each residue.
        """
        if not self.asymmetric_membrane:
            # Asymmetric membrane validation is done in prediction step due to nature of it
            for resname, hmm in self.results["HMM"].items():
                if hmm is not None:
                    means = hmm.means_
                    prediction_results = self.results['HMM_Pred'][resname]
                    self.results['HMM_Pred'][resname] = self.hmm_diff_checker(means, prediction_results)



[docs]
    @staticmethod
    def hmm_diff_checker(means, prediction_results):
        """
        Checks if prediction assignments correct with respect to means. Since HMM is unsupervised, model can assign 0 to
         disordered domains and 1 to ordered domains. In these cases needs to be changed to vice versa.

        Parameters
        ----------
        means : np.ndarray
            Table of model which contains means of area per lipid and order parameters
        prediction_results: np.ndarray
            Initial prediction results which is 1s and 0s

        Returns
        -------
        prediction_results : np.ndarray
            Same or flipped prediction results with respect to means table

        """
        diff_percents = (means[1, 0] - means[0, 0]) / means[0, 0]
        if diff_percents > 0.0:
            return np.abs(prediction_results - 1)
        else:
            return prediction_results



[docs]
    @staticmethod
    def get_residue_idx(resids_index_map, resids):
        """
        Checks resids index map (lookup table) to return corresponding indexes to use in DomHMM result section.

        Parameters
        ----------
        resids_index_map: dict
            Lookup table where each residue id's index in self.membrane_unique_resids is kepts
        resids: numpy.ndarray
            Residue id list for indexing

        Returns
        -------
        idx: numpy.ndarray
            Indexes in the same order with resids

        """
        idx = np.array([resids_index_map[resid] for resid in resids])
        return idx



[docs]
    def predict_plot(self):
        """
        Plots average ordered lipids of HMM prediction for each lipid type.
        """
        start_time = self.universe.trajectory[self.start].time
        end_time = self.universe.trajectory[self.stop - 1].time
        # Configured as printing 0.1 microsecond instead 1000 nanoseconds for readability.
        if end_time - start_time >= 1e8:
            time_unit = "ms"
            scale_factor = 1e9
        elif end_time - start_time >= 1e5:
            time_unit = "μs"
            scale_factor = 1e6
        else:
            time_unit = "ns"
            scale_factor = 1e3
        start_time /= scale_factor
        end_time /= scale_factor

        t = np.linspace(start_time, end_time, self.n_frames)
        for resname in self.unique_resnames:
            plt.plot(t, self.results['HMM_Pred'][resname].mean(0), label=resname)

        plt.xlabel(f"t ({time_unit})", fontsize=18)
        plt.ylabel(r"$\bar{O}_{Lipid}$", fontsize=18)
        plt.legend(fontsize=15, ncols=1, loc="lower left")
        plt.ylim(0, 1)
        plt.title("b", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("b.pdf")
        plt.show()


    # ------------------------------ GETIS-ORD STATISTIC ------------------------------------------------------------- #

[docs]
    def getis_ord(self):
        """
        Calculates Getis-Ord statistic for identification model
        """
        self.getis_ord_stat(self.results["upper_weight_all"], 0)
        self.getis_ord_stat(self.results["lower_weight_all"], 1)
        log.info("Getis-Ord for leaflets are calculated.")
        if self.result_plots:
            self.getis_ord_plot()
        self.results["Getis_Ord"]["Permut_0"] = self.permut_getis_ord_stat(self.results["upper_weight_all"], 0)
        self.results["Getis_Ord"]["Permut_1"] = self.permut_getis_ord_stat(self.results["lower_weight_all"], 1)
        log.info("Permutations of Getis-Ord are calculated.")
        self.results["z_score"] = self.z_score_calc()
        log.info("Z score is calculated.")



[docs]
    def getis_ord_stat(self, weight_matrix_all, leaflet):
        """
            Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states
            of each lipid and the weighting factors between neighbored lipids.

            Parameters
            ----------
            weight_matrix_all : sparse.csr_array
                Weight matrix for all lipid in a leaflet at current time step
            leaflet : int
                0 = upper leaflet, 1 = lower leafet

            Returns
            -------
            g_star_i : list of numpy.ndarrays
                G*i values for each lipid at each time step
            w_ii_all : list of numpy.ndarrays
                Self-influence of the lipids
            """
        # Get the number of frames
        nframes = self.n_frames

        # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid
        g_star_i = []
        w_ii_all = []

        # Iterate over frames
        for step in range(nframes):
            # Get the weight matrix of the leaflet at the current time step
            weight_matrix = weight_matrix_all[step]

            # Number of lipids in the leaflet
            n = weight_matrix.shape[0]

            # In case the code was already executed beforehand
            weight_matrix[range(n), range(n)] = 0.

            # Get the order state of each lipid in the leaflet at the current time step
            order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step)

            # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids
            nneighbor = np.sum(weight_matrix > 1E-5, axis=1)
            # Parameters for the Getis-Ord statistic
            w_ii = np.sum(weight_matrix, axis=-1) / nneighbor  # Self-influence!
            weight_matrix[range(n), range(n)] = w_ii
            w_star_i = np.sum(weight_matrix, axis=-1)  # + w_ii
            s_star_1i = np.sum(weight_matrix.power(2), axis=-1)  # + w_ii**2

            # Empirical standard deviation over all order states in the leaflet
            s = np.std(order_states, ddof=1)

            # Employ matrix-vector multiplication
            so = weight_matrix @ order_states

            # Calculate the nominator for G*i
            nom = so - w_star_i * 0.5

            # Calculate the denominator for G*i
            denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1))

            g_star = nom / denom

            assert not np.any(nneighbor < 1), "Lipid found without a neighbor!"

            g_star_i.append(g_star)
            w_ii_all.append(w_ii)
        self.results['Getis_Ord'][leaflet] = {f"g_star_i_{leaflet}": g_star_i, f"w_ii_{leaflet}": w_ii_all}



[docs]
    def getis_ord_plot(self):
        """
        Plots average order state for each lipid type per frame
        """

        # Get number of lipid residues
        resnum = len(self.unique_resnames)

        # For each residue type there is an empty list initialized
        g_star_i_temp = [[] for _ in range(resnum)]

        # Iterate over all frames
        for step in range(self.n_frames):

            # Get the indices of the lipids in leaflet 0 and 1 and their corresponding positions
            index_dict_0, pos_dict_0 = self.get_leaflet_step_order_index(leaflet=0, step=step)
            index_dict_1, pos_dict_1 = self.get_leaflet_step_order_index(leaflet=1, step=step)

            # Iterate over the lipid types
            for i, rsn in enumerate(self.unique_resnames):
                # Merge Getis-Ord values for the upper and the lower leaflet to display them as a histogram
                # The G* values should be sorted according to the indices of the resids
                g_start_sorted_0 = self.results['Getis_Ord'][0]['g_star_i_0'][step][index_dict_0[rsn]]
                # The G* values should be sorted according to the indices of the resids
                g_start_sorted_1 = self.results['Getis_Ord'][1]['g_star_i_1'][step][index_dict_1[rsn]]
                g_star_i_temp[i] += list(np.append(g_start_sorted_0, g_start_sorted_1))

        for i in range(resnum):
            plt.hist(g_star_i_temp[i], bins=np.linspace(-3, 3, 201), density=True, histtype="step", lw=2,
                     label=self.unique_resnames[i])

        plt.legend(fontsize=15, loc="upper left", ncols=2)

        plt.xlim(-3, 3)
        plt.ylim(0, .9)

        _ = plt.xlabel("$G^*_i$", fontsize=18)
        plt.ylabel("$p(G^*_i)$", fontsize=18)
        plt.tick_params(labelsize=11)

        plt.title("c", fontsize=20, fontweight="bold", loc="left")
        plt.tight_layout()
        if self.save_plots:
            plt.savefig("c.pdf")
        plt.show()



[docs]
    def permut_getis_ord_stat(self, weight_matrix_all, leaflet):

        """
        Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states
        of each lipid and the weighting factors between neighbored lipids.

        Parameters
        ----------
        weight_matrix_all : sparse.csr_array
            Weight matrices for all lipid in a leaflet at each time step
        leaflet : int
            0 = upper leaflet, 1 = lower leafet

        Returns
        -------
        g_star_i : list of numpy.ndarrays
            G*i values for each lipid at each time step
        """

        # Get the number of frames
        nframes = self.n_frames

        # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid
        g_star_i = []

        # Do 10 permutations per frame
        n_permut = 10

        # Iterate over frames
        for step in tqdm(range(nframes)):

            for permut in range(n_permut):
                # Get the weightmatrix of the leaflet at the current time step
                weight_matrix = weight_matrix_all[step]

                # Number of lipids in the leaflet
                n = weight_matrix.shape[0]

                # In case the code was already executed beforehand
                weight_matrix[range(n), range(n)] = 0.0

                # Get the order state of each lipid in the leaflet at the current time step
                order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step)

                np.random.shuffle(order_states)

                # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids
                nneighbor = np.sum(weight_matrix > 1E-5, axis=1)

                # Parameters for the Getis-Ord statistic
                w_ii = np.sum(weight_matrix, axis=-1) / nneighbor  # Self-influence!
                weight_matrix[range(n), range(n)] = w_ii
                w_star_i = np.sum(weight_matrix, axis=-1)  # + w_ii
                s_star_1i = np.sum(weight_matrix ** 2, axis=-1)  # + w_ii**2

                # Empirical standard deviation over all order states in the leaflet
                s = np.std(order_states, ddof=1)

                # Employ matrix-vector multiplication
                so = weight_matrix @ order_states

                # Calculate the nominator for G*i
                nom = so - w_star_i * 0.5

                # Calculate the denominator for G*i
                denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1))

                g_star = nom / denom

                assert not np.any(nneighbor < 1), "Lipid found without a neighbor!"

                g_star_i += list(g_star)

        return g_star_i



[docs]
    def z_score_calc(self):
        """
        Z score calculation for Getis-Ord statistics
        """
        result = {}
        for i in range(2):
            z_score = {}
            getis_ord_permut = self.results["Getis_Ord"][f"Permut_{i}"]
            z_score["z_a"] = np.quantile(getis_ord_permut, self.p_value)
            z_score["z1_a"] = np.quantile(getis_ord_permut, 1 - self.p_value)
            result[i] = z_score
        return result


    # ------------------------------ HIERARCHICAL CLUSTERING --------------------------------------------------------- #

[docs]
    def clustering_plot(self):
        """
        Runs hierarchical clustering and plots clustering results in different frames.
        """

        n_frames = self.n_frames
        # Plot %5, %50 and %95 points of frame list
        frame_list = [int(n_frames / 20), int(n_frames / 2), int(n_frames / 1.05)]
        fig, ax = plt.subplots(1, len(frame_list), figsize=(20, 5))

        # Iterate over three frames illustrate the clustering results
        for k, i in enumerate(frame_list):
            order_states_0 = self.get_leaflet_step_order(0, i)

            # Clustering
            # ----------------------------------------------------------------------------------------------------------------------
            core_lipids = self.assign_core_lipids(weight_matrix_f=self.results["upper_weight_all"][i],
                                                  g_star_i_f=self.results['Getis_Ord'][0]['g_star_i_0'][i],
                                                  order_states_f=order_states_0,
                                                  w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i],
                                                  z_score=self.results["z_score"][0])

            clusters = self.hierarchical_clustering(weight_matrix_f=self.results["upper_weight_all"][i],
                                                    w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i],
                                                    core_lipids=core_lipids)

            # Plot coordinates
            # ----------------------------------------------------------------------------------------------------------------------
            residue_indexes, positions = self.get_leaflet_step_order_index(leaflet=0, step=i)

            for resname, index in residue_indexes.items():
                ax[k].scatter(positions[resname][:, 0],
                              positions[resname][:, 1], marker="s", alpha=1, s=5, label=resname)

            # Choose color scheme for clustering coloring
            colors = plt.cm.nipy_spectral(np.linspace(0, 1.0, len(clusters.values())))

            # Goto correct frame of the trajectory
            self.universe.trajectory[self.start:self.stop:self.step][i]

            # Prepare positions for cluster plotting
            leaflet_assignment_mask = self.leaflet_assignment[:, i] == 0

            positions = (self.membrane.residues[leaflet_assignment_mask].atoms & self.all_heads).positions

            label_length = len(residue_indexes)

            # Iterate over clusters and plot the residues
            print(f"Number of clusters in frame {i}: {len(clusters.values())}")
            for j, val in enumerate(clusters.values()):
                idx = np.array(list(val), dtype=int)
                ax[k].scatter(positions[idx, 0],
                              positions[idx, 1],
                              s=100, marker="o", color=colors[j], zorder=-10)
            if self.tmd_protein is not None:
                label_length += len(self.tmd_protein["0"])
                for protein in self.tmd_protein["0"]:
                    ax[k].scatter(protein[0], protein[1], s=100, marker="^", color="black", label="TMD Protein")
            ax[k].set_xticks([])
            ax[k].set_yticks([])
            ax[k].set_aspect("equal")

        handles, labels = ax[0].get_legend_handles_labels()
        plt.figlegend(handles=handles, labels=labels, ncol=label_length, loc='lower center', fontsize=15, frameon=False)
        ax[0].set_title("d", fontsize=20, fontweight="bold", loc="left")

        ax[0].set_title(label=f"Frame {frame_list[0]}", fontsize=18)
        ax[1].set_title(label=f"Frame {frame_list[1]}", fontsize=18)
        ax[2].set_title(label=f"Frame {frame_list[2]}", fontsize=18)
        plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0, hspace=None)
        if self.save_plots:
            plt.savefig("d.pdf")
        plt.show()



[docs]
    def result_clustering(self):
        """
        Runs hierarchical clustering for each frame and saves result
        """
        self.results["Clustering"] = {'0': {}, '1': {}}

        # Iterate over all frames
        for i in tqdm(range(self.n_frames), total=self.n_frames):

            # Iterate over both leaflets
            for j, leaflet_ in enumerate(['upper', 'lower']):
                # Get order states
                order_states_leaf = self.get_leaflet_step_order(j, i)

                core_lipids = self.assign_core_lipids(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i],
                                                      g_star_i_f=self.results['Getis_Ord'][j][f'g_star_i_{j}'][i],
                                                      order_states_f=order_states_leaf,
                                                      w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i],
                                                      z_score=self.results["z_score"][j])

                clusters = self.hierarchical_clustering(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i],
                                                        w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i],
                                                        core_lipids=core_lipids)
                frame_number = self.start + i * self.step
                leaflet_index_resid_map = self.get_leaflet_step_index_to_resid(j, i)
                cluster_result = []
                for cluster in clusters.values():
                    cluster_result.append([leaflet_index_resid_map[leaflet_index] for leaflet_index in cluster])
                self.results["Clustering"][str(j)][frame_number] = cluster_result



[docs]
    @staticmethod
    def assign_core_lipids(weight_matrix_f, g_star_i_f, order_states_f, w_ii_f, z_score):

        """
        Assign lipids as core members (aka lipids with a high positive autocorrelation)
        depending on the Getis-Ord spatial local autocorrelation statistic.

        Parameters
        ----------
        weight_matrix_f : numpy.ndarray
            Matrix containing the weight factors between all lipid pairs at one time step
        g_star_i_f : numpy.ndarray
            Getis-Ord spatial local autocorrelation statistic for every lipid at one time step
        order_states_f: numpy.ndarray
            Order states for every lipid at one time step
        w_ii_f: numpy.ndarray
            Self-influence weight factor for every lipid at one time step
        z_score: dict
            Contains boundary of the reaction region


        Returns
        -------
        core_lipids : numpy.ndarray (bool)
           Contains a TRUE value if the lipid is a core member, otherwise it FALSE
        """

        # Define boundary of the reaction region
        z1_a = z_score["z1_a"]
        z_a = z_score["z_a"]

        # Assign core members according to their auto-correlation
        core_lipids = g_star_i_f > z1_a

        # Assign lipids with a mid-range auto-correlation (-z_1-a, z_1-a) * Ordered
        low_corr = (g_star_i_f <= z1_a) & (g_star_i_f >= z_a) & (order_states_f == 1)

        # Add iteratively new lipids to the core members
        n_cores_old = np.inf
        n_cores_new = np.sum(core_lipids)

        # Iterate until self-consistency is reached
        while n_cores_old != n_cores_new:
            # Check how tightly the lipids are connected to the core members
            new_core_lipids = (weight_matrix_f[core_lipids].sum(0) > w_ii_f) & low_corr

            # Assign lipids to core members if condition is full-filled
            core_lipids[new_core_lipids] = True

            # Update number of core lipids
            n_cores_old = n_cores_new
            n_cores_new = np.sum(core_lipids)

        return core_lipids



[docs]
    @staticmethod
    def hierarchical_clustering(weight_matrix_f, w_ii_f, core_lipids):

        """
        Hierarchical clustering approach to identify spatial related Lo domains.

        Parameters
        ----------
        weight_matrix_f : numpy.ndarray
            Matrix containing the weight factors between all lipid pairs at one time step
        w_ii_f: numpy.ndarray
            Self-influence weight factor for every lipid at one time step
        core_lipids : numpy.ndarray
            Array contains information which lipid is assigned as core member

        Returns
        -------
        clusters : dict
            The dictionary contains each found cluster

        """

        # Merge iteratively clusters
        n_clusters_old = np.inf
        n_clusters_new = np.sum(core_lipids)

        # Get the indices of the core lipids
        core_lipids_id = np.where(core_lipids)[0]

        # Store clusters in a Python dictionary
        # Initialize all core lipids as clusters
        clusters = dict(
            zip(
                core_lipids_id.astype("U"),
                [[id] for id in core_lipids_id]
            )
        )

        # Iterate until self-consistency is reached
        while n_clusters_old != n_clusters_new:

            # Get a list of the IDs of current clusters
            cluster_ids = list(clusters.keys())

            # Iterate over all clusters i
            for i, id_i in enumerate(cluster_ids):

                # If cluster i was already merged and deleted, skip it!
                if id_i not in clusters.keys():
                    continue

                # The cluster weights are defined as the sum
                # over the weights of all lipid members
                cluster_weights_i = np.sum(weight_matrix_f[clusters[id_i]], axis=0)

                # Compare cluster weights to the self-influence of each lipid
                merge_condition_i = cluster_weights_i > w_ii_f

                # Iterate over all clusters j
                for id_j in cluster_ids[(i + 1):]:

                    # Do not merge a cluster with itself
                    if id_i == id_j:
                        continue
                    # If cluster j was already merged and deleted, skip it!
                    if id_j not in clusters.keys():
                        continue

                    # Calculate cluster weights and compare to lipids self-influence
                    cluster_weights_j = np.sum(weight_matrix_f[clusters[id_j]], axis=0)
                    merge_condition_j = cluster_weights_j > w_ii_f

                    # If the condition is fullfilled for any lipid -> Merge the clusters
                    if np.any(merge_condition_i[clusters[id_j]]) or np.any(merge_condition_j[clusters[id_i]]):
                        # Merge cluster j into cluster i
                        clusters[id_i] += clusters[id_j]

                        # Delete cluster j from the cluster dict
                        del clusters[id_j]

            # Update cluster numbers
            n_clusters_old = n_clusters_new
            n_clusters_new = len(clusters.keys())

        return clusters


    # ------------------------------ HELPER FUNCTIONS ---------------------------------------------------------------- #

[docs]
    def get_leaflet_step_order(self, leaflet, step):
        """
        Receive residue's order state with respect to the leaflet

        Parameters
        ----------
        leaflet : int
            leaflet index
        step: int
            step index

        Returns
        -------
        order_states : numpy.ndarray
            Numpy array contains order state results of the leaflet at step in order of system's residues
        """

        #Init two empty lists for ...
        temp = [] #... order states prediction
        idxs = [] #... indices of lipids

        #Iterate over lipids
        for res, data in self.results.train_data_per_type.items():

            #Get indices from resids (e.g. resid 1, is index 0)
            idx = self.get_residue_idx(self.resids_index_map, data[0])

            #Store the indices of the residues in the current leaflet at the current step
            idxs.append(idx[self.leaflet_assignment[idx, step] == leaflet])

            #Get the predicted HMM order states for the residues in the current leaflet at the current step
            temp.append(self.results["HMM_Pred"][res][:, step][self.leaflet_assignment[idx, step] == leaflet])

        #Sort the obtained indices
        sorted_idxs = np.argsort( np.concatenate(idxs) )

        #It is not always the case that the lipid order is equal (e.g., depending on resids of cholesterol for example). Here the order states are sorted
        #so that they correspond to the resids in the leaflet selection -> With that they should fit to the order of the lipids in the weight matrix
        order_states = np.concatenate(temp)[sorted_idxs]

        return order_states






[docs]
    def get_leaflet_step_order_index(self, leaflet, step):
        """
        Receive residue's indexes and positions for a specific leaflet at any frame of the trajectory.

        Parameters
        ----------
        leaflet : int
            leaflet index
        step: int
            step index

        Returns
        -------
        indexes : dict
            dictionary contains numpy.arrays containing residue's indexes for each unique residue type
        positions : dict
            dictionary contains numpy.arrays containing residue's positions for each unique residue type
        """

        leaflet_assignment_step = self.leaflet_assignment[:, step]
        leaflet_assignment_mask = leaflet_assignment_step == leaflet

        indexes = {}
        positions = {}

        # Go to correct frame of the trajectory
        self.universe.trajectory[self.start:self.stop:self.step][step]

        for res in self.unique_resnames:

            indexes[res] = np.where(self.membrane.residues[leaflet_assignment_mask].resnames == res)[0]

            if res in self.heads.keys():
                positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms(
                    f"resname {res} and name {self.heads[res]}")).positions
            else:
                positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms(
                    f"resname {res} and name {self.sterol_heads[res]}")).positions

        return indexes, positions



[docs]
    def get_leaflet_step_index_to_resid(self, leaflet, step):
        """
        Returns leaflet index to residue id map for a specific leaflet at specific frame of the trajectory

        Parameters
        ----------
        leaflet : int
            leaflet index
        step: int
            step index

        Returns
        -------
        result_map : dict
            dictionary contains leaflet index of residue as key and residue id as value
        """
        leaflet_assignment_step = self.leaflet_assignment[:, step]
        leaflet_assignment_mask = leaflet_assignment_step == leaflet
        result_map = {}
        for resname in self.unique_resnames:
            sys_index = np.where(self.membrane.residues.resnames == resname)[0]
            sys_index =sys_index[leaflet_assignment_mask[sys_index]]
            leaflet_index = np.where(self.membrane.residues[leaflet_assignment_mask].resnames == resname)[0]
            for i in range(0, len(leaflet_index)):
                result_map[leaflet_index[i]] = self.index_resid_map[sys_index[i]]
        return result_map