Source code for domhmm.analysis.domhmm

"""
PropertyCalculation --- :mod:`domhmm.analysis.PropertyCalculation`
===========================================================

This module contains the :class:`PropertyCalculation` class.

"""

import logging as log
import sys

import matplotlib.pyplot as plt
import numpy as np
from hmmlearn.hmm import GaussianHMM
from scipy.sparse import csr_array
from scipy.spatial import Voronoi, ConvexHull
from scipy import stats
from sklearn import mixture
from tqdm import tqdm

from .base import LeafletAnalysisBase


[docs] class PropertyCalculation(LeafletAnalysisBase): """ The DirectorOrder class calculates an order parameter for each selected lipid according to the formula: P2 = 0.5 * (3 * cos(a)^2 - 1), (1) where an is the angle between a pre-defined director in the lipid (e.g. C-H or CC) and a reference axis. """ def _prepare(self): """ Prepare results array before the calculation of the height spectrum begins Attributes ---------- """ if self.verbose: log.basicConfig(level=log.INFO, stream=sys.stdout) log.info("Preparation Step") # Initialize result storage dictionaries self.results.train_data_per_type = {} self.results.GMM = {} self.results.HMM = {} self.results.HMM_Pred = {} self.results.Getis_Ord = {} self.max_tail_len = -1 for _, each in self.tails.items(): if len(each) > self.max_tail_len: self.max_tail_len = len(each) # Total number of parameters are area per lipid + order parameter for each tail = max_tail_len + 1 self.results.train = np.zeros( (len(self.membrane_unique_resids), self.n_frames, 1 + self.max_tail_len + len(self.sterol_heads)), dtype=np.float32) # Initalize weight matrix storage for each leaflet. setattr(self.results, "upper_weight_all", []) setattr(self.results, "lower_weight_all", []) # Initialized leaflet assignment array for each frame self.leaflet_assignment = np.zeros((len(self.membrane_unique_resids), self.n_frames), dtype=np.int32)
[docs] @staticmethod def calc_order_parameter(chain): """ Calculate average Scc order parameters per acyl chain according to the equation: S_cc = (3 * cos( theta )^2 - 1) / 2, where theta describes the angle between the z-axis of the system and the vector between two subsequent tail beads. Parameters ---------- chain : Selection MDAnalysis Selection object Returns ------- s_cc : numpy.ndarray Mean S_cc parameter for the selected chain of each residue """ # Separate the coordinates according to their residue index ridx = np.where(np.diff(chain.resids) > 0)[0] + 1 pos = np.split(chain.positions, ridx) # Calculate the normalized orientation vector between two subsequent tail beads vec = [np.diff(pos_i, axis=0) for pos_i in pos] vec_norm = [np.sqrt((vec_i ** 2).sum(axis=-1)) for vec_i in vec] vec = [vec_i / vec_norm_i.reshape(-1, 1) for vec_i, vec_norm_i in zip(vec, vec_norm)] # TODO z axis multiplication inside. It needs to be changed in future work # Choose the z-axis as membrane normal and take care of the machine precision dot_prod = [np.clip(vec_i[:, 2], -1., 1.) for vec_i in vec] # Calculate the order parameter s_cc = np.array([np.mean(0.5 * (3 * dot ** 2 - 1)) for dot in dot_prod]) return s_cc
[docs] def order_parameter(self): """ Calculation of scc order parameter for each chain of each residue. """ # Iterate over each tail with chain id for chain, tail in self.resid_tails_selection.items(): # SCC calculation s_cc = self.calc_order_parameter(tail) idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids)) self.results.train[idx, self.index, 1 + chain] = s_cc for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()): s_cc = self.calc_order_parameter(tail) idx = self.get_residue_idx(self.resids_index_map, np.unique(tail.resids)) self.results.train[idx, self.index, 1 + self.max_tail_len + i] = s_cc
[docs] @staticmethod def area_per_lipid_vor(coor_xy, boxdim, frac): """ Calculation of the area per lipid employing Voronoi tessellation on coordinates mapped to the xy plane. The function takes also the periodic boundary conditions of the box into account. Parameters ---------- coor_xy : numpy.ndarray Coordinates of upper/lower leaflet boxdim : array Length of box vectors in all directions frac : float Fraction of box length in x and y outside the unit cell considered for Voronoi calculation Returns ------- vor : Voronoi Tesselation Scipy's Voronoi Diagram object apl : Area per lipid Area per lipid based on Scipy's Voronoi Diagram pbc_idx : Indices Unit cell indices for periodic image coordinates """ # Number of points in the plane ncoor = coor_xy.shape[0] bx = boxdim[0] by = boxdim[1] # Create periodic images of the coordinates # to take periodic boundary conditions into account # Store coordinates of periodic images pbc = np.zeros((9 * ncoor, 2), dtype=np.float32) # Store unit cell indices for coordinates of periodic images pbc_idx = np.arange(9 * ncoor, dtype=np.int64) % ncoor # Iterate over all possible periodic images k = 0 for i in [0, -1, 1]: for j in [0, -1, 1]: # Multiply the coordinates in a direction pbc[k * ncoor: (k + 1) * ncoor, 0] = coor_xy[:, 0] % bx + i * bx pbc[k * ncoor: (k + 1) * ncoor, 1] = coor_xy[:, 1] % by + j * by k += 1 # Create a boolean mask for positions within the unit cell and a smaller fraction of positions outside the # unit cell # Check along x-axis f0 = pbc[:, 0] >= -frac * bx f1 = pbc[:, 0] <= bx + frac * bx # Check along y-axis f2 = pbc[:, 1] >= -frac * by f3 = pbc[:, 1] <= by + frac * by # Merge the four masks together mask = f0 * f1 * f2 * f3 # Filter positions and unit cell indices of all periodic images pbc = pbc[mask] pbc_idx = pbc_idx[mask] # Call scipy's Voronoi implementation # There is the (rare!) possibility that two points have the exact same xy positions, # to prevent issues at further calculation steps, the qhull_option "QJ" was employed to introduce small random # displacement of the points to resolve these issue. # IMPORTANT: The use of "QJ" makes the resulting Voronoi diagram depending on frac. Values for ridge lengths # can vary! vor = Voronoi(pbc, qhull_options="QJ") # Iterate over all members of the unit cell and calculate their occupied area apl = np.array([ConvexHull(vor.vertices[vor.regions[vor.point_region[i]]]).volume for i in range(ncoor)]) return vor, apl, pbc_idx
[docs] @staticmethod def weight_matrix(vor, pbc_idx, coor_xy): """ Calculate the weight factors between neighbored lipid pairs based on a Voronoi tessellation. Parameters ---------- vor : Voronoi Tesselation Scipy's Voronoi Diagram object pbc_idx : Indices Unit cell indices of periodic image coordinates coor_xy : numpy.ndarray Coordinates of upper/lower leaflet Returns ------- weight_matrix : numpy.ndarray Weight factors wij between neighbored lipid pairs. Is 0 if lipids are not directly neighbored. """ # Number of points in the plane ncoor = coor_xy.shape[0] # Calculate the distance for all pairs of points between which a ridge exists dij = vor.points[vor.ridge_points[:, 0]] - vor.points[vor.ridge_points[:, 1]] dij = np.sqrt(np.sum(dij ** 2, axis=1)) # There is the (rare!) possibility that two points have the exact same xy positions, # to prevent issues at further calculation steps, their distance is set to a very small # distance of 4.7 Angstrom (2 times the VdW radius of a regular bead in MARTINI3) dij[dij < 1E-5] = 4.7 # Calculate the distance for all pairs of vertices connected via a ridge vert_idx = np.array(vor.ridge_vertices) bij = vor.vertices[vert_idx[:, 0]] - vor.vertices[vert_idx[:, 1]] bij = np.sqrt(np.sum(bij ** 2, axis=1)) # INFO: vor.ridge_points and vor.ridge_vertices should be sorted -> Check vor.ridge_dict # Calculate weight factor wij = bij / dij # Set up an empty array to store the weight factors for each lipid weight_matrix = np.zeros((ncoor, ncoor)) # Select all indices of ridges that contain members of the unit cell mask_unit_cell = np.logical_or(vor.ridge_points[:, 0] < ncoor, vor.ridge_points[:, 1] < ncoor) # Apply the modulus operator since some of the indices in "unit_cell_point" will point to coordinates outside # the unit cell. Applying the modulus operator "%" will allow an indexing of the "weight_matrix". However, some # of the indices in "unit_cell_point" will be doubled that shouldn't be an issue since the same weight factor is # then just put several times in the same entry of the array (no summing or something similar!) # Previous: unit_cell_point = vor.ridge_points[mask_unit_cell] % ncoor # Transform the indices in vor.ridge_points back to unit cell indices -> Filter than for all indices of # ridges that contain members of the unit cell unit_cell_point = pbc_idx[vor.ridge_points][mask_unit_cell] weight_matrix[unit_cell_point[:, 0], unit_cell_point[:, 1]] = wij[mask_unit_cell] weight_matrix[unit_cell_point[:, 1], unit_cell_point[:, 0]] = wij[mask_unit_cell] return weight_matrix
def _single_frame(self): """ Calculate data from a single frame of the trajectory. """ # Get number of frame from trajectory self.frame = self.universe.trajectory.ts.frame # Calculate correct index if skipping step not equals 1 or start point not equals 0 self.index = ( self.frame - self.start ) // self.step # Update leaflet assignment (if leaflet_frame_rate is None, leaflets will never get updated during analysis) if self.leaflet_frame_rate is not None and not self.index % self.leaflet_frame_rate: # Call leaflet assignment functions for non-sterol and sterol compounds self.leaflet_selection_no_sterol = self.get_leaflets() self.leaflet_selection = self.get_leaflets_sterol() # Write assignments to array assignment_index = int(self.index / self.leaflet_frame_rate) start_index = assignment_index * self.leaflet_frame_rate end_index = (assignment_index + 1) * self.leaflet_frame_rate if end_index > self.leaflet_assignment.shape[1]: end_index = self.leaflet_assignment.shape[1] self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids) self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids) self.leaflet_assignment[self.lidx, start_index:end_index] = 1 self.leaflet_assignment[self.uidx, start_index:end_index] = 0 self.leaflet_assignment[self.lidx, start_index:end_index] = 1 # Update sterol assignment. Don't do the update if it was already done in the if-statement before if not self.index % self.sterol_frame_rate and ( self.leaflet_frame_rate is None or self.index % self.leaflet_frame_rate): # Call leaflet assignment function for sterol compounds self.leaflet_selection = self.get_leaflets_sterol() # Write assignments to array assignment_index = int(self.index / self.sterol_frame_rate) start_index = assignment_index * self.sterol_frame_rate end_index = (assignment_index + 1) * self.sterol_frame_rate if end_index > self.leaflet_assignment.shape[1]: end_index = self.leaflet_assignment.shape[1] self.uidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["0"].resids) self.lidx = self.get_residue_idx(self.resids_index_map, self.leaflet_selection["1"].resids) self.leaflet_assignment[self.uidx, start_index:end_index] = 0 self.leaflet_assignment[self.lidx, start_index:end_index] = 1 if self.leaflet_selection["0"].select_atoms("group leaf1", leaf1=self.leaflet_selection["1"]): raise ValueError("Atoms in both leaflets !") # ------------------------------ Local Normals/Area per Lipid ------------------------------------------------ # boxdim = self.universe.trajectory.ts.dimensions[0:3] upper_coor_xy = self.leaflet_selection[str(0)].positions lower_coor_xy = self.leaflet_selection[str(1)].positions # Check Transmembrane domain existence if self.tmd_protein is not None: tmd_upper_coor_xy = self.tmd_protein["0"] # Check if dimension of coordinates is same in both array if tmd_upper_coor_xy.shape[1] == upper_coor_xy.shape[1]: upper_coor_xy = np.append(upper_coor_xy, tmd_upper_coor_xy, axis=0) tmd_lower_coor_xy = self.tmd_protein["1"] # Check if dimension of coordinates is same in both array if tmd_lower_coor_xy.shape[1] == lower_coor_xy.shape[1]: lower_coor_xy = np.append(lower_coor_xy, tmd_lower_coor_xy, axis=0) upper_vor, upper_apl, upper_pbc_idx = self.area_per_lipid_vor(coor_xy=upper_coor_xy, boxdim=boxdim, frac=self.frac) lower_vor, lower_apl, lower_pbc_idx = self.area_per_lipid_vor(coor_xy=lower_coor_xy, boxdim=boxdim, frac=self.frac) if self.tmd_protein is not None: # Remove TMD Protein numbers from training data upper_apl = np.array(upper_apl[:-len(tmd_upper_coor_xy)]) lower_apl = np.array(lower_apl[:-len(tmd_lower_coor_xy)]) self.results.train[self.uidx, self.index, 0] = upper_apl self.results.train[self.lidx, self.index, 0] = lower_apl # TODO Local normal calculation # ------------------------------ Order parameter ------------------------------------------------------------- # self.order_parameter() # ------------------------------ Weight Matrix --------------------------------------------------------------- # upper_weight_matrix = self.weight_matrix(upper_vor, pbc_idx=upper_pbc_idx, coor_xy=upper_coor_xy) lower_weight_matrix = self.weight_matrix(lower_vor, pbc_idx=lower_pbc_idx, coor_xy=lower_coor_xy) if self.tmd_protein is not None: # Remove TMD Protein numbers from weight matrix upper_weight_matrix = upper_weight_matrix[:-len(tmd_upper_coor_xy),:-len(tmd_upper_coor_xy)] lower_weight_matrix = lower_weight_matrix[:-len(tmd_lower_coor_xy),:-len(tmd_lower_coor_xy)] # Keep weight matrices in scipy.sparse.csr_array format since both is sparse matrices self.results["upper_weight_all"].append(csr_array(upper_weight_matrix)) self.results["lower_weight_all"].append(csr_array(lower_weight_matrix)) def _conclude(self): """ Calculate the final results of the analysis Extract the obtained data and put them into a clear and accessible data structure """ log.info("Conclusion step is starting.") self.prepare_train_data() # ------------------------------------------------------------- # Check for user-provided HMMs if not any(self.trained_hmms): # No user-provided HMMs, perform usual workflow log.info("Gaussian Mixture Model training is starting.") self.GMM(gmm_kwargs=self.gmm_kwargs) log.info("Hidden Markov Model training is starting.") self.HMM(hmm_kwargs=self.hmm_kwargs) else: # User-provided HMMs found, use them! log.info("No Gaussian Mixture Model is trained.") log.info("No Hidden Markov Model is trained. Instead use the user-provided HMMs.") # Use the provided dictionary directly, the checks for validity were already done before self.results['HMM'] = self.trained_hmms # Same workflow as in self.HMM() if self.result_plots: # Plot result of hmm self.plot_hmm_result() # Make predictions based on HMM model self.predict_states() # Validate states and result prediction self.state_validate() if self.result_plots: # Plot prediction result self.predict_plot() log.info("Getis-Ord Statistic calculation is starting.") self.getis_ord() #The user argument do_clustering decides if the hierarchical clustering is (not) performed if self.do_clustering == False: pass else: log.info("Clustering is starting.") self.result_clustering() if self.result_plots: self.clustering_plot() print("It's done! DomHMM finished successfully. Have a nice day and enjoy your data! :-)")
[docs] def prepare_train_data(self): """ Prepare train data for GMM and HMM while separating self.results.train with respect to each residue """ self.results.train_data_per_type = {} resid_dict = {} for resname in self.unique_resnames: idx = self.get_residue_idx(self.resids_index_map, self.residue_ids[resname]) resid_dict[resname] = idx self.results.train_data_per_type[f"{resname}"] = [[] for _ in range(3)] if self.asymmetric_membrane: # For asymmetric membranes, models will be trained for each leaflets' each residue type leaflet_residx = {} # leaflet_train_residx contains residue indexes that are not too flip-floppy for model training leaflet_train_residx = {} # Save each residues indexes with respect to leaflet assignment for resname, idx in resid_dict.items(): leaflet_assign = self.leaflet_assignment[idx] leaflet_train_residx[resname] = {0: [], 1: []} leaflet_residx[resname] = {0: [], 1: []} for i in range(len(leaflet_assign)): if (leaflet_assign[i] == 0).all(): leaflet_train_residx[resname][0].append(idx[i]) leaflet_residx[resname][0].append(idx[i]) elif (leaflet_assign[i] == 1).all(): leaflet_train_residx[resname][1].append(idx[i]) leaflet_residx[resname][1].append(idx[i]) else: # If a residue is %80 of time belongs to a leaflet, accept it as residue of that leaflet lower_leaflet_percentage = len(np.nonzero(leaflet_assign[i] == 1)[0]) / len(leaflet_assign[i]) if lower_leaflet_percentage >= 0.8: leaflet_train_residx[resname][1].append(idx[i]) elif lower_leaflet_percentage <= 0.2: leaflet_train_residx[resname][0].append(idx[i]) if lower_leaflet_percentage > 0.5: leaflet_residx[resname][1].append(idx[i]) else: leaflet_residx[resname][0].append(idx[i]) self.leaflet_residx = leaflet_residx # Prepare rest of train data for each residue for resname, tails in self.tails.items(): rsn_ids = self.residue_ids[resname] self.results.train_data_per_type[resname][0] = rsn_ids # Select columns of area per lipid and tails' scc parameters residx = leaflet_train_residx[resname] upper_leaflet_data = self.results.train[residx[0]][:, :, 0:len(tails) + 1] lower_leaflet_data = self.results.train[residx[1]][:, :, 0:len(tails) + 1] self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data] # TODO idx might referenced before assignment ? self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx] for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()): rsn_ids = self.residue_ids[resname] self.results.train_data_per_type[resname][0] = rsn_ids idx = self.get_residue_idx(self.resids_index_map, rsn_ids) # Select columns of area per lipid and sterol's scc parameter residx = leaflet_train_residx[resname] upper_leaflet_data = self.results.train[residx[0]][:, :, [0, 1 + self.max_tail_len + i]] lower_leaflet_data = self.results.train[residx[1]][:, :, [0, 1 + self.max_tail_len + i]] self.results.train_data_per_type[resname][1] = [upper_leaflet_data, lower_leaflet_data] self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx] else: # For symmetric membranes, models will be trained for each residue type for resname, tails in self.tails.items(): rsn_ids = self.residue_ids[resname] self.results.train_data_per_type[resname][0] = rsn_ids idx = resid_dict[resname] # Select columns of area per lipid and tails' scc parameters self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :, 0:len(tails) + 1] self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx] for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()): rsn_ids = self.residue_ids[resname] self.results.train_data_per_type[resname][0] = rsn_ids idx = resid_dict[resname] # Select columns of area per lipid and sterol's scc parameter self.results.train_data_per_type[resname][1] = self.results.train[idx][:, :, [0, 1 + self.max_tail_len + i]] self.results.train_data_per_type[resname][2] = self.leaflet_assignment[idx]
# ------------------------------ FIT GAUSSIAN MIXTURE MODEL ------------------------------------------------------ #
[docs] def GMM(self, gmm_kwargs): """ Fit Gaussian Mixture Parameters ---------- gmm_kwargs : dict Additional parameters for mixture.GaussianMixture """ # Iterate over each residue and implement gaussian mixture model for res, data in self.results.train_data_per_type.items(): if self.asymmetric_membrane: temp_dict = {} for leaflet in range(2): gmm_data = data[1][leaflet] if len(gmm_data) == 0: temp_dict[leaflet] = None else: gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit( gmm_data.reshape(-1, gmm_data.shape[2])) temp_dict[leaflet] = gmm log.info(f"Leaflet {leaflet}, {res} Gaussian Mixture Model is trained.") self.results["GMM"][res] = temp_dict else: gmm = mixture.GaussianMixture(n_components=2, **gmm_kwargs).fit(data[1].reshape(-1, data[1].shape[2])) self.results["GMM"][res] = gmm log.info(f"{res} Gaussian Mixture Model is trained.") # Check for convergence for resname, each in self.results["GMM"].items(): if self.asymmetric_membrane: if each[0] is not None and not each[0].converged_: log.warning(f"{resname} lower leaflet Gaussian Mixture Model is not converged.") if each[1] is not None and not each[1].converged_: log.warning(f"{resname} upper leaflet Gaussian Mixture Model is not converged.") else: if not each.converged_: log.warning(f"{resname} Gaussian Mixture Model is not converged.")
[docs] def mixture_plot(self, resname, gmm_model, hmm_model, leaflet = None): """ Plot results of the Gaussian Mixture Model for each residue type. The function should help to spot flaws in the results of the Gaussian Mixture Model. The Gaussian Mixture Model in DomHMM is trained on a three-dimensional space (1 APL + 2 Acyl chains), hence also its ouput (mean vectors and covariance matrices) are three0-dimensional. However, plots in 3 dimensions are rather hard to understand, therefore we plot here the projections of the raw/fitted probability densities on the xz, yz, and xy plane. Parameters ---------- resname : str Resname of the actual residue type gmm_model : GaussianMixture Model Scikit-learn object hmm_model : Hidden Markow Model hmmlearn object leaflet : int or None If membrane is asymmetric ensure that plots are made for upper and lower leaflet """ #Define parameters that define a grid for the calculations of the fitted distributions #The chosen values should cover a wide range of different use cases as long as enough sample data is provided! MIN_APL, MAX_APL, STEP_APL = 0, 200, .5 MIN_SCC, MAX_SCC, STEP_SCC = -.55, 1.05, .05 x01, y01 = np.mgrid[MIN_APL:MAX_APL:STEP_APL, MIN_SCC:MAX_SCC:STEP_SCC] x2, y2 = np.mgrid[MIN_SCC:MAX_SCC:STEP_SCC, MIN_SCC:MAX_SCC:STEP_SCC] xy01 = np.dstack((x01, y01)) xy2 = np.dstack((x2, y2)) #Define colors for the markers that mark the fitted means of the Gaussians cx = ["crimson","cyan"] cx_hmm = ["orange", "hotpink"] #Get trainings data for this type lipid and check if its asymmetric train_data_per_type = self.results["train_data_per_type"][resname][1] if leaflet != None: train_data_per_type = train_data_per_type[ leaflet ] else: pass #Joint distributions for current lipid type cm =1/2.54 #Do the plot for non-sterolic lipid types if resname not in self.sterol_heads.keys(): #Init suplots fig, ax = plt.subplots(1, 3, figsize = (35*cm, 10*cm)) #Area per Lipid - Scc Chain A ax[0].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 0], y = train_data_per_type.reshape(-1, 3)[:, 1], bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))], density = True, cmap="viridis") #Area per Lipid - Scc Chain B ax[1].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 0], y = train_data_per_type.reshape(-1, 3)[:, 2], bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))], density = True, cmap="viridis") #Scc Chain A - Scc Chain B ax[2].hist2d(x = train_data_per_type.reshape(-1, 3)[:, 1], y = train_data_per_type.reshape(-1, 3)[:, 2], bins = [np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 ))), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))], density = True, cmap="viridis") #Calculate and display fitted distributions #Iterate over both Gaussian distributions for i in range(2): #-------------------------------------------------------GMM------------------------------------------------------- #Area per Lipid (0) - Scc Chain A (1) rv0 = stats.multivariate_normal(gmm_model.means_[i][0:2], gmm_model.covariances_[i][:2,:2]) z0 = rv0.pdf(xy01) #Area per Lipid (0) - Scc Chain B (2) rv1 = stats.multivariate_normal(gmm_model.means_[i][::2], gmm_model.covariances_[i][::2, ::2]) z1 = rv1.pdf(xy01) #Scc Chain A (1) - Scc Chain B (2) rv2 = stats.multivariate_normal(gmm_model.means_[i][1:], gmm_model.covariances_[i][1:, 1:]) z2 = rv2.pdf(xy2) #Plot contours -> Be aware that the first two plots share the same ranges ax[0].contour(x01, y01, z0, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25) ax[1].contour(x01, y01, z1, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25) ax[2].contour(x2, y2, z2, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25) del rv0 del rv1 del rv2 del z0 del z1 del z2 #Mark with a cross the means of the fitted Gaussians ax[0].scatter( gmm_model.means_[i][0], gmm_model.means_[i][1], marker = "x", s = 100, zorder = 100, color=cx[i]) ax[1].scatter( gmm_model.means_[i][0], gmm_model.means_[i][2], marker = "x", s = 100, zorder = 100, color=cx[i], label = f"GMM Mean {i}") ax[2].scatter( gmm_model.means_[i][1], gmm_model.means_[i][2], marker = "x", s = 100, zorder = 100, color=cx[i]) #-------------------------------------------------------HMM------------------------------------------------------- #Area per Lipid (0) - Scc Chain A (1) rv0 = stats.multivariate_normal(hmm_model.means_[i][0:2], hmm_model.covars_[i][:2,:2]) z0 = rv0.pdf(xy01) #Area per Lipid (0) - Scc Chain B (2) rv1 = stats.multivariate_normal(hmm_model.means_[i][::2], hmm_model.covars_[i][::2, ::2]) z1 = rv1.pdf(xy01) #Scc Chain A (1) - Scc Chain B (2) rv2 = stats.multivariate_normal(hmm_model.means_[i][1:], hmm_model.covars_[i][1:, 1:]) z2 = rv2.pdf(xy2) #Plot contours -> Be aware that the first two plots share the same ranges ax[0].contour(x01, y01, z0, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25) ax[1].contour(x01, y01, z1, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25) ax[2].contour(x2, y2, z2, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25) del rv0 del rv1 del rv2 del z0 del z1 del z2 #Mark with a circle the means of the fitted Gaussians from the HMM ax[0].scatter( hmm_model.means_[i][0], hmm_model.means_[i][1], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i]) ax[1].scatter( hmm_model.means_[i][0], hmm_model.means_[i][2], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i], label = f"HMM Mean {i}") ax[2].scatter( hmm_model.means_[i][1], hmm_model.means_[i][2], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i]) #Set ticks on the y axis for i in range(3): ax[i].set_yticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"]) #Set ticks on the x axis ax[0].set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) ) ax[1].set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) ) ax[2].set_xticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"]) #Label y axis yl0 = ax[0].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-2}})$", labelpad=-7,fontsize=18) yl1 = ax[1].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-1}})$", labelpad=-7,fontsize=18) yl2 = ax[2].set_ylabel(r"$p(\bar{S}_{CC}^{\text{sn-1}})$", labelpad=-7,fontsize=18) #Label x axis xl0 = ax[0].set_xlabel(r"$p(a)$", labelpad=0,fontsize=18) xl1 = ax[1].set_xlabel(r"$p(a)$", labelpad=0,fontsize=18) xl2 = ax[2].set_xlabel(r"$p(\bar{S}_{CC}^{\text{sn-2}})$", labelpad=0,fontsize=18) #Make plot more readable for i in range(3): ax[i].tick_params(rotation=45) ax[i].grid(False) ax[i].tick_params(axis="both", labelsize=11) plt.subplots_adjust(hspace=0.1, wspace = 0.25) lg = ax[1].legend(loc = "upper center", bbox_to_anchor = ( 0.5, -0.25), fancybox = False, framealpha = 0.5, facecolor = "grey", ncols = 4) if leaflet != None: plt.savefig(f"GMM_{resname}_{leaflet}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl0, xl1, xl2, yl0, yl1, yl2, lg), bbox_inches = "tight") else: plt.savefig(f"GMM_{resname}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl0, xl1, xl2, yl0, yl1, yl2, lg), bbox_inches = "tight") #Do the plot for sterolic lipid types else: #Init only one subplot fig, ax = plt.subplots(1, 1, figsize = (15*cm, 15*cm)) #Area per Lipid - Scc Chain A ax.hist2d(x = train_data_per_type.reshape(-1, 2)[:, 0], y = train_data_per_type.reshape(-1, 2)[:, 1], bins = [np.linspace(MIN_APL, MAX_APL, int(np.round( (MAX_APL - MIN_APL) / 1 + 1 )) ), np.linspace(MIN_SCC, MAX_SCC, int(np.round( (MAX_SCC - MIN_SCC) / 0.025 + 1 )))], density = True, cmap="viridis") #Calculate and display fitted distributions #Iterate over both Gaussian distributions for i in range(2): #-------------------------------------------------------GMM------------------------------------------------------- #Area per Lipid (0) - Scc Chain C (1) rv0 = stats.multivariate_normal(gmm_model.means_[i], gmm_model.covariances_[i]) z0 = rv0.pdf(xy01) #Plot contours ax.contour(x01, y01, z0, colors = [cx[i]], alpha = 0.2, linewidths = 2.25, zorder = 25) del rv0 del z0 #Mark with a cross the means of the fitted Gaussians ax.scatter( gmm_model.means_[i][0], gmm_model.means_[i][1], marker = "x", s = 100, zorder = 100, color=cx[i], label = f"GMM Mean {i}") #-------------------------------------------------------HMM------------------------------------------------------- #Area per Lipid (0) - Scc Chain C (1) rv0 = stats.multivariate_normal(hmm_model.means_[i], hmm_model.covars_[i]) z0 = rv0.pdf(xy01) #Plot contours ax.contour(x01, y01, z0, colors = [cx_hmm[i]], alpha = 0.2, zorder = 50, linewidths = 1.25) del rv0 del z0 #Mark with a circle the means of the fitted Gaussians from the HMM ax.scatter( hmm_model.means_[i][0], hmm_model.means_[i][1], marker = "o", facecolor = "none", s = 100, zorder = 100, edgecolor=cx_hmm[i], label = f"HMM Mean {i}") #Set ticks on the x and y axis ax.set_yticks([-0.5, 0, 0.5, 1], [r"$-0.5$", r"$0$", r"$0.5$", r"$1$"]) ax.set_xticks( np.linspace(MIN_APL, MAX_APL, int( np.round( (MAX_APL - MIN_APL) / 25 + 1 ) ) ) ) #Label the x and y axis yl = ax.set_ylabel(r"$p(P_2)$", labelpad=-7,fontsize=18) xl = ax.set_xlabel(r"$p(a)$", labelpad=0,fontsize=18) # Make plot more readable ax.tick_params(rotation=45) ax.grid(False) ax.tick_params(axis="both", labelsize=11) lg = ax.legend(loc = "upper center", bbox_to_anchor = ( 0.5, -0.25), fancybox = False, framealpha = 0.5, facecolor = "grey", ncols = 4) if leaflet != None: plt.savefig(f"GMM_{resname}_{leaflet}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl, yl, lg), bbox_inches = "tight") else: plt.savefig(f"GMM_{resname}.pdf", dpi = 300, transparent=True, bbox_extra_artists = (xl, yl, lg), bbox_inches = "tight") plt.close()
# ------------------------------ HIDDEN MARKOV MODEL ------------------------------------------------------------- #
[docs] def HMM(self, hmm_kwargs): """ Create Gaussian based Hidden Markov Models for each residue type Parameters ---------- hmm_kwargs : dict Additional parameters for hmmlearn.hmm.GaussianHMM """ # Iterate over each residue and implement gaussian-hidden markov model for resname, data in self.results.train_data_per_type.items(): if self.asymmetric_membrane: temp_dict = {} for leaflet in range(2): gmm = self.results["GMM"][resname][leaflet] if gmm is not None: hmm_data = data[1][leaflet] hmm = self.fit_hmm(data=hmm_data, gmm=gmm, hmm_kwargs=hmm_kwargs, n_repeats=self.n_init_hmm) temp_dict[leaflet] = hmm else: temp_dict[leaflet] = None log.info(f"Leaflet {leaflet}, {resname} Gaussian Hidden Markov Model is trained.") self.results["HMM"][resname] = temp_dict else: hmm = self.fit_hmm(data=data[1], gmm=self.results["GMM"][resname], hmm_kwargs=hmm_kwargs, n_repeats=self.n_init_hmm) self.results["HMM"][resname] = hmm log.info(f"{resname} Gaussian Hidden Markov Model is trained.") if self.result_plots: # Plot result of hmm self.plot_hmm_result() # Make predictions based on HMM model self.predict_states() # Validate states and result prediction self.state_validate() if self.result_plots: # Plot prediction result self.predict_plot() #Plot results of GMM and HMM #Iterate over fitted Gaussian Mixture Models for resname in self.results["GMM"].keys(): gmm_trained = self.results["GMM"][resname] hmm_trained = self.results["HMM"][resname] if self.asymmetric_membrane: if gmm_trained[0] is not None and hmm_trained[0] is not None and gmm_trained[0].converged_ and hmm_trained[0].monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained[0], hmm_model = hmm_trained[0], leaflet = 0) if gmm_trained[1] is not None and hmm_trained[1] is not None and gmm_trained[1].converged_ and hmm_trained[1].monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained[1], hmm_model = hmm_trained[1], leaflet = 1) else: if gmm_trained.converged_ and hmm_trained.monitor_.converged: self.mixture_plot(resname = resname, gmm_model = gmm_trained, hmm_model = hmm_trained)
[docs] def fit_hmm(self, data, gmm, hmm_kwargs, n_repeats=10): """ Fit several HMM models to the data and return the best one. Parameters ---------- data : numpy.ndarray Data of the single lipid properties for one lipid type at each time step gmm : GaussianMixture Model Scikit-learn object hmm_kwargs: dict Additional parameters for Hidden Markov Model n_repeats : int Number of independent fits Returns ------- best_ghmm : GaussianHMM hmmlearn object """ best_ghmm = None # Specify the length of the sequence for each lipid n_lipids = data.shape[0] dim = data.shape[2] lengths = np.repeat(self.n_frames, n_lipids) # The HMM fitting is started multiple times from # different starting conditions # Initialize the best score with minus infinity best_score = -np.inf # Re-start the HMM fitting 10 times for i in tqdm(range(n_repeats)): # Initialize a HMM for one lipid type ghmm_i = GaussianHMM(n_components=2, means_prior=gmm.means_, covars_prior=gmm.covariances_, **hmm_kwargs) #Check whether an optimization of the mean vectors and the covariance matrices is requested #Optimization of means is not required -> Take it from the Gaussian Mixture Model if "m" not in hmm_kwargs['params']: ghmm_i.means_ = gmm.means_ #Optimization of covariances is not required -> Take it from the Gaussian Mixture Model if "c" not in hmm_kwargs['params']: ghmm_i.covars_ = gmm.covariances_ # Train the HMM based on the data for every lipid and frame ghmm_i.fit(data.reshape(-1, dim), lengths=lengths) # Obtain the log-likelihood # probability of the current model score_i = ghmm_i.score(data.reshape(-1, dim), lengths=lengths) # Check if the quality of the result improved if score_i > best_score: best_score = score_i best_ghmm = ghmm_i # Delete the current model del ghmm_i return best_ghmm
[docs] def plot_hmm_result(self): """ Plots convergence of HMM model training process. """ x_len = 0 for resname, ghmm in self.results['HMM'].items(): if self.asymmetric_membrane: for leaflet in range(2): if ghmm[leaflet] is not None: x_len = max(x_len, len(ghmm[leaflet].monitor_.history) - 1) plt.semilogy(np.arange(len(ghmm[leaflet].monitor_.history) - 1), np.diff(np.array(ghmm[leaflet].monitor_.history)), ls="-", label=f"{resname}_{leaflet}", lw=2) else: x_len = max(x_len, len(ghmm.monitor_.history) - 1) plt.semilogy(np.arange(len(ghmm.monitor_.history) - 1), np.diff(np.array(ghmm.monitor_.history)), ls="-", label=resname, lw=2) x_len = (x_len // 100 + 1) * 100 plt.legend(fontsize=15) plt.semilogy(np.arange(x_len), np.repeat(1E-4, x_len), color="k", ls="--", lw=2) plt.xlim(0, x_len) plt.ylim(1E-5, 15E5) plt.ylabel(r"$\Delta(log(\hat{L}))$", fontsize=18) plt.xlabel("Iterations", fontsize=18) plt.tick_params(axis="both", labelsize=11) plt.text(s="Tolerance 1e-4", x=1, y=1E-4 + 0.00005, color="k", fontsize=15) plt.title("a", fontsize=20, fontweight="bold", loc="left") plt.tight_layout() if self.save_plots: plt.savefig("a.pdf") plt.show()
[docs] def predict_states(self): """ Each residues prediction assignment to ordered or disordered domains. """ if self.asymmetric_membrane: # Since we select stable lipids for training, we need all training data of lipids to predict order of # frequently flip-flop doing ones for resname, tails in self.tails.items(): temp_array = [] for leaflet in range(2): idx = self.leaflet_residx[resname][leaflet] data = self.results.train[idx][:, :, 0:len(tails) + 1] shape = data.shape hmm = self.results['HMM'][resname][leaflet] if hmm is not None: lengths = np.repeat(shape[1], shape[0]) prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0], shape[1]) prediction = self.hmm_diff_checker(hmm.means_, prediction) temp_array.append([idx, prediction]) else: temp_array.append([]) # Leaflet checks in case lipid is only in one leaflet if len(temp_array[0]) == 0: idx = np.array(temp_array[1][0]).argsort() result = temp_array[1][1] elif len(temp_array[1]) == 0: idx = np.array(temp_array[0][0]).argsort() result = temp_array[0][1] else: idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort() result = np.concatenate((temp_array[0][1], temp_array[1][1])) result = result[idx] self.results['HMM_Pred'][resname] = result for i, (resname, tail) in enumerate(self.sterol_tails_selection.items()): temp_array = [] for leaflet in range(2): idx = self.leaflet_residx[resname][leaflet] data = self.results.train[idx][:, :, [0, 1 + self.max_tail_len + i]] shape = data.shape hmm = self.results['HMM'][resname][leaflet] if hmm is not None: lengths = np.repeat(shape[1], shape[0]) prediction = hmm.predict(data.reshape(-1, shape[2]), lengths=lengths).reshape(shape[0], shape[1]) prediction = self.hmm_diff_checker(hmm.means_, prediction) temp_array.append([idx, prediction]) else: temp_array.append(None) # Leaflet checks in case lipid is only in one leaflet if temp_array[0] is None: idx = np.array(temp_array[1][0]).argsort() result = temp_array[1][1] elif temp_array[1] is None: idx = np.array(temp_array[0][0]).argsort() result = temp_array[0][1] else: idx = np.concatenate((temp_array[0][0], temp_array[1][0])).argsort() result = np.concatenate((temp_array[0][1], temp_array[1][1])) result = result[idx] self.results['HMM_Pred'][resname] = result else: # Symmetric membrane case for resname, data in self.results.train_data_per_type.items(): shape = data[1].shape hmm = self.results['HMM'][resname] # Lengths consists of number of frames and number of residues lengths = np.repeat(shape[1], shape[0]) prediction = hmm.predict(data[1].reshape(-1, shape[2]), lengths=lengths).reshape(shape[0], shape[1]) # Save prediction result of each residue self.results['HMM_Pred'][resname] = prediction
[docs] def state_validate(self): """ Validate state assignments of HMM model by checking means of the model of each residue. """ if not self.asymmetric_membrane: # Asymmetric membrane validation is done in prediction step due to nature of it for resname, hmm in self.results["HMM"].items(): if hmm is not None: means = hmm.means_ prediction_results = self.results['HMM_Pred'][resname] self.results['HMM_Pred'][resname] = self.hmm_diff_checker(means, prediction_results)
[docs] @staticmethod def hmm_diff_checker(means, prediction_results): """ Checks if prediction assignments correct with respect to means. Since HMM is unsupervised, model can assign 0 to disordered domains and 1 to ordered domains. In these cases needs to be changed to vice versa. Parameters ---------- means : np.ndarray Table of model which contains means of area per lipid and order parameters prediction_results: np.ndarray Initial prediction results which is 1s and 0s Returns ------- prediction_results : np.ndarray Same or flipped prediction results with respect to means table """ diff_percents = (means[1, 0] - means[0, 0]) / means[0, 0] if diff_percents > 0.0: return np.abs(prediction_results - 1) else: return prediction_results
[docs] @staticmethod def get_residue_idx(resids_index_map, resids): """ Checks resids index map (lookup table) to return corresponding indexes to use in DomHMM result section. Parameters ---------- resids_index_map: dict Lookup table where each residue id's index in self.membrane_unique_resids is kepts resids: numpy.ndarray Residue id list for indexing Returns ------- idx: numpy.ndarray Indexes in the same order with resids """ idx = np.array([resids_index_map[resid] for resid in resids]) return idx
[docs] def predict_plot(self): """ Plots average ordered lipids of HMM prediction for each lipid type. """ start_time = self.universe.trajectory[self.start].time end_time = self.universe.trajectory[self.stop - 1].time # Configured as printing 0.1 microsecond instead 1000 nanoseconds for readability. if end_time - start_time >= 1e8: time_unit = "ms" scale_factor = 1e9 elif end_time - start_time >= 1e5: time_unit = "μs" scale_factor = 1e6 else: time_unit = "ns" scale_factor = 1e3 start_time /= scale_factor end_time /= scale_factor t = np.linspace(start_time, end_time, self.n_frames) for resname in self.unique_resnames: plt.plot(t, self.results['HMM_Pred'][resname].mean(0), label=resname) plt.xlabel(f"t ({time_unit})", fontsize=18) plt.ylabel(r"$\bar{O}_{Lipid}$", fontsize=18) plt.legend(fontsize=15, ncols=1, loc="lower left") plt.ylim(0, 1) plt.title("b", fontsize=20, fontweight="bold", loc="left") plt.tight_layout() if self.save_plots: plt.savefig("b.pdf") plt.show()
# ------------------------------ GETIS-ORD STATISTIC ------------------------------------------------------------- #
[docs] def getis_ord(self): """ Calculates Getis-Ord statistic for identification model """ self.getis_ord_stat(self.results["upper_weight_all"], 0) self.getis_ord_stat(self.results["lower_weight_all"], 1) log.info("Getis-Ord for leaflets are calculated.") if self.result_plots: self.getis_ord_plot() self.results["Getis_Ord"]["Permut_0"] = self.permut_getis_ord_stat(self.results["upper_weight_all"], 0) self.results["Getis_Ord"]["Permut_1"] = self.permut_getis_ord_stat(self.results["lower_weight_all"], 1) log.info("Permutations of Getis-Ord are calculated.") self.results["z_score"] = self.z_score_calc() log.info("Z score is calculated.")
[docs] def getis_ord_stat(self, weight_matrix_all, leaflet): """ Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states of each lipid and the weighting factors between neighbored lipids. Parameters ---------- weight_matrix_all : sparse.csr_array Weight matrix for all lipid in a leaflet at current time step leaflet : int 0 = upper leaflet, 1 = lower leafet Returns ------- g_star_i : list of numpy.ndarrays G*i values for each lipid at each time step w_ii_all : list of numpy.ndarrays Self-influence of the lipids """ # Get the number of frames nframes = self.n_frames # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid g_star_i = [] w_ii_all = [] # Iterate over frames for step in range(nframes): # Get the weight matrix of the leaflet at the current time step weight_matrix = weight_matrix_all[step] # Number of lipids in the leaflet n = weight_matrix.shape[0] # In case the code was already executed beforehand weight_matrix[range(n), range(n)] = 0. # Get the order state of each lipid in the leaflet at the current time step order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step) # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids nneighbor = np.sum(weight_matrix > 1E-5, axis=1) # Parameters for the Getis-Ord statistic w_ii = np.sum(weight_matrix, axis=-1) / nneighbor # Self-influence! weight_matrix[range(n), range(n)] = w_ii w_star_i = np.sum(weight_matrix, axis=-1) # + w_ii s_star_1i = np.sum(weight_matrix.power(2), axis=-1) # + w_ii**2 # Empirical standard deviation over all order states in the leaflet s = np.std(order_states, ddof=1) # Employ matrix-vector multiplication so = weight_matrix @ order_states # Calculate the nominator for G*i nom = so - w_star_i * 0.5 # Calculate the denominator for G*i denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1)) g_star = nom / denom assert not np.any(nneighbor < 1), "Lipid found without a neighbor!" g_star_i.append(g_star) w_ii_all.append(w_ii) self.results['Getis_Ord'][leaflet] = {f"g_star_i_{leaflet}": g_star_i, f"w_ii_{leaflet}": w_ii_all}
[docs] def getis_ord_plot(self): """ Plots average order state for each lipid type per frame """ # Get number of lipid residues resnum = len(self.unique_resnames) # For each residue type there is an empty list initialized g_star_i_temp = [[] for _ in range(resnum)] # Iterate over all frames for step in range(self.n_frames): # Get the indices of the lipids in leaflet 0 and 1 and their corresponding positions index_dict_0, pos_dict_0 = self.get_leaflet_step_order_index(leaflet=0, step=step) index_dict_1, pos_dict_1 = self.get_leaflet_step_order_index(leaflet=1, step=step) # Iterate over the lipid types for i, rsn in enumerate(self.unique_resnames): # Merge Getis-Ord values for the upper and the lower leaflet to display them as a histogram # The G* values should be sorted according to the indices of the resids g_start_sorted_0 = self.results['Getis_Ord'][0]['g_star_i_0'][step][index_dict_0[rsn]] # The G* values should be sorted according to the indices of the resids g_start_sorted_1 = self.results['Getis_Ord'][1]['g_star_i_1'][step][index_dict_1[rsn]] g_star_i_temp[i] += list(np.append(g_start_sorted_0, g_start_sorted_1)) for i in range(resnum): plt.hist(g_star_i_temp[i], bins=np.linspace(-3, 3, 201), density=True, histtype="step", lw=2, label=self.unique_resnames[i]) plt.legend(fontsize=15, loc="upper left", ncols=2) plt.xlim(-3, 3) plt.ylim(0, .9) _ = plt.xlabel("$G^*_i$", fontsize=18) plt.ylabel("$p(G^*_i)$", fontsize=18) plt.tick_params(labelsize=11) plt.title("c", fontsize=20, fontweight="bold", loc="left") plt.tight_layout() if self.save_plots: plt.savefig("c.pdf") plt.show()
[docs] def permut_getis_ord_stat(self, weight_matrix_all, leaflet): """ Getis-Ord Local Spatial Autocorrelation Statistics calculation based on the predicted order states of each lipid and the weighting factors between neighbored lipids. Parameters ---------- weight_matrix_all : sparse.csr_array Weight matrices for all lipid in a leaflet at each time step leaflet : int 0 = upper leaflet, 1 = lower leafet Returns ------- g_star_i : list of numpy.ndarrays G*i values for each lipid at each time step """ # Get the number of frames nframes = self.n_frames # Initialize empty lists to store the G*i values and the self-influence of the lipids in a lipid g_star_i = [] # Do 10 permutations per frame n_permut = 10 # Iterate over frames for step in tqdm(range(nframes)): for permut in range(n_permut): # Get the weightmatrix of the leaflet at the current time step weight_matrix = weight_matrix_all[step] # Number of lipids in the leaflet n = weight_matrix.shape[0] # In case the code was already executed beforehand weight_matrix[range(n), range(n)] = 0.0 # Get the order state of each lipid in the leaflet at the current time step order_states = self.get_leaflet_step_order(leaflet=leaflet, step=step) np.random.shuffle(order_states) # Number of neighbors per lipid -> The number is 0 (or close to 0) for not neighboured lipids nneighbor = np.sum(weight_matrix > 1E-5, axis=1) # Parameters for the Getis-Ord statistic w_ii = np.sum(weight_matrix, axis=-1) / nneighbor # Self-influence! weight_matrix[range(n), range(n)] = w_ii w_star_i = np.sum(weight_matrix, axis=-1) # + w_ii s_star_1i = np.sum(weight_matrix ** 2, axis=-1) # + w_ii**2 # Empirical standard deviation over all order states in the leaflet s = np.std(order_states, ddof=1) # Employ matrix-vector multiplication so = weight_matrix @ order_states # Calculate the nominator for G*i nom = so - w_star_i * 0.5 # Calculate the denominator for G*i denom = s * np.sqrt((n * s_star_1i - w_star_i ** 2) / (n - 1)) g_star = nom / denom assert not np.any(nneighbor < 1), "Lipid found without a neighbor!" g_star_i += list(g_star) return g_star_i
[docs] def z_score_calc(self): """ Z score calculation for Getis-Ord statistics """ result = {} for i in range(2): z_score = {} getis_ord_permut = self.results["Getis_Ord"][f"Permut_{i}"] z_score["z_a"] = np.quantile(getis_ord_permut, self.p_value) z_score["z1_a"] = np.quantile(getis_ord_permut, 1 - self.p_value) result[i] = z_score return result
# ------------------------------ HIERARCHICAL CLUSTERING --------------------------------------------------------- #
[docs] def clustering_plot(self): """ Runs hierarchical clustering and plots clustering results in different frames. """ n_frames = self.n_frames # Plot %5, %50 and %95 points of frame list frame_list = [int(n_frames / 20), int(n_frames / 2), int(n_frames / 1.05)] fig, ax = plt.subplots(1, len(frame_list), figsize=(20, 5)) # Iterate over three frames illustrate the clustering results for k, i in enumerate(frame_list): order_states_0 = self.get_leaflet_step_order(0, i) # Clustering # ---------------------------------------------------------------------------------------------------------------------- core_lipids = self.assign_core_lipids(weight_matrix_f=self.results["upper_weight_all"][i], g_star_i_f=self.results['Getis_Ord'][0]['g_star_i_0'][i], order_states_f=order_states_0, w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i], z_score=self.results["z_score"][0]) clusters = self.hierarchical_clustering(weight_matrix_f=self.results["upper_weight_all"][i], w_ii_f=self.results["Getis_Ord"][0]["w_ii_0"][i], core_lipids=core_lipids) # Plot coordinates # ---------------------------------------------------------------------------------------------------------------------- residue_indexes, positions = self.get_leaflet_step_order_index(leaflet=0, step=i) for resname, index in residue_indexes.items(): ax[k].scatter(positions[resname][:, 0], positions[resname][:, 1], marker="s", alpha=1, s=5, label=resname) # Choose color scheme for clustering coloring colors = plt.cm.nipy_spectral(np.linspace(0, 1.0, len(clusters.values()))) # Goto correct frame of the trajectory self.universe.trajectory[self.start:self.stop:self.step][i] # Prepare positions for cluster plotting leaflet_assignment_mask = self.leaflet_assignment[:, i] == 0 positions = (self.membrane.residues[leaflet_assignment_mask].atoms & self.all_heads).positions label_length = len(residue_indexes) # Iterate over clusters and plot the residues print(f"Number of clusters in frame {i}: {len(clusters.values())}") for j, val in enumerate(clusters.values()): idx = np.array(list(val), dtype=int) ax[k].scatter(positions[idx, 0], positions[idx, 1], s=100, marker="o", color=colors[j], zorder=-10) if self.tmd_protein is not None: label_length += len(self.tmd_protein["0"]) for protein in self.tmd_protein["0"]: ax[k].scatter(protein[0], protein[1], s=100, marker="^", color="black", label="TMD Protein") ax[k].set_xticks([]) ax[k].set_yticks([]) ax[k].set_aspect("equal") handles, labels = ax[0].get_legend_handles_labels() plt.figlegend(handles=handles, labels=labels, ncol=label_length, loc='lower center', fontsize=15, frameon=False) ax[0].set_title("d", fontsize=20, fontweight="bold", loc="left") ax[0].set_title(label=f"Frame {frame_list[0]}", fontsize=18) ax[1].set_title(label=f"Frame {frame_list[1]}", fontsize=18) ax[2].set_title(label=f"Frame {frame_list[2]}", fontsize=18) plt.subplots_adjust(left=None, bottom=None, right=None, top=None, wspace=0, hspace=None) if self.save_plots: plt.savefig("d.pdf") plt.show()
[docs] def result_clustering(self): """ Runs hierarchical clustering for each frame and saves result """ self.results["Clustering"] = {'0': {}, '1': {}} # Iterate over all frames for i in tqdm(range(self.n_frames), total=self.n_frames): # Iterate over both leaflets for j, leaflet_ in enumerate(['upper', 'lower']): # Get order states order_states_leaf = self.get_leaflet_step_order(j, i) core_lipids = self.assign_core_lipids(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i], g_star_i_f=self.results['Getis_Ord'][j][f'g_star_i_{j}'][i], order_states_f=order_states_leaf, w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i], z_score=self.results["z_score"][j]) clusters = self.hierarchical_clustering(weight_matrix_f=self.results[f"{leaflet_}_weight_all"][i], w_ii_f=self.results["Getis_Ord"][j][f"w_ii_{j}"][i], core_lipids=core_lipids) frame_number = self.start + i * self.step leaflet_index_resid_map = self.get_leaflet_step_index_to_resid(j, i) cluster_result = [] for cluster in clusters.values(): cluster_result.append([leaflet_index_resid_map[leaflet_index] for leaflet_index in cluster]) self.results["Clustering"][str(j)][frame_number] = cluster_result
[docs] @staticmethod def assign_core_lipids(weight_matrix_f, g_star_i_f, order_states_f, w_ii_f, z_score): """ Assign lipids as core members (aka lipids with a high positive autocorrelation) depending on the Getis-Ord spatial local autocorrelation statistic. Parameters ---------- weight_matrix_f : numpy.ndarray Matrix containing the weight factors between all lipid pairs at one time step g_star_i_f : numpy.ndarray Getis-Ord spatial local autocorrelation statistic for every lipid at one time step order_states_f: numpy.ndarray Order states for every lipid at one time step w_ii_f: numpy.ndarray Self-influence weight factor for every lipid at one time step z_score: dict Contains boundary of the reaction region Returns ------- core_lipids : numpy.ndarray (bool) Contains a TRUE value if the lipid is a core member, otherwise it FALSE """ # Define boundary of the reaction region z1_a = z_score["z1_a"] z_a = z_score["z_a"] # Assign core members according to their auto-correlation core_lipids = g_star_i_f > z1_a # Assign lipids with a mid-range auto-correlation (-z_1-a, z_1-a) * Ordered low_corr = (g_star_i_f <= z1_a) & (g_star_i_f >= z_a) & (order_states_f == 1) # Add iteratively new lipids to the core members n_cores_old = np.inf n_cores_new = np.sum(core_lipids) # Iterate until self-consistency is reached while n_cores_old != n_cores_new: # Check how tightly the lipids are connected to the core members new_core_lipids = (weight_matrix_f[core_lipids].sum(0) > w_ii_f) & low_corr # Assign lipids to core members if condition is full-filled core_lipids[new_core_lipids] = True # Update number of core lipids n_cores_old = n_cores_new n_cores_new = np.sum(core_lipids) return core_lipids
[docs] @staticmethod def hierarchical_clustering(weight_matrix_f, w_ii_f, core_lipids): """ Hierarchical clustering approach to identify spatial related Lo domains. Parameters ---------- weight_matrix_f : numpy.ndarray Matrix containing the weight factors between all lipid pairs at one time step w_ii_f: numpy.ndarray Self-influence weight factor for every lipid at one time step core_lipids : numpy.ndarray Array contains information which lipid is assigned as core member Returns ------- clusters : dict The dictionary contains each found cluster """ # Merge iteratively clusters n_clusters_old = np.inf n_clusters_new = np.sum(core_lipids) # Get the indices of the core lipids core_lipids_id = np.where(core_lipids)[0] # Store clusters in a Python dictionary # Initialize all core lipids as clusters clusters = dict( zip( core_lipids_id.astype("U"), [[id] for id in core_lipids_id] ) ) # Iterate until self-consistency is reached while n_clusters_old != n_clusters_new: # Get a list of the IDs of current clusters cluster_ids = list(clusters.keys()) # Iterate over all clusters i for i, id_i in enumerate(cluster_ids): # If cluster i was already merged and deleted, skip it! if id_i not in clusters.keys(): continue # The cluster weights are defined as the sum # over the weights of all lipid members cluster_weights_i = np.sum(weight_matrix_f[clusters[id_i]], axis=0) # Compare cluster weights to the self-influence of each lipid merge_condition_i = cluster_weights_i > w_ii_f # Iterate over all clusters j for id_j in cluster_ids[(i + 1):]: # Do not merge a cluster with itself if id_i == id_j: continue # If cluster j was already merged and deleted, skip it! if id_j not in clusters.keys(): continue # Calculate cluster weights and compare to lipids self-influence cluster_weights_j = np.sum(weight_matrix_f[clusters[id_j]], axis=0) merge_condition_j = cluster_weights_j > w_ii_f # If the condition is fullfilled for any lipid -> Merge the clusters if np.any(merge_condition_i[clusters[id_j]]) or np.any(merge_condition_j[clusters[id_i]]): # Merge cluster j into cluster i clusters[id_i] += clusters[id_j] # Delete cluster j from the cluster dict del clusters[id_j] # Update cluster numbers n_clusters_old = n_clusters_new n_clusters_new = len(clusters.keys()) return clusters
# ------------------------------ HELPER FUNCTIONS ---------------------------------------------------------------- #
[docs] def get_leaflet_step_order(self, leaflet, step): """ Receive residue's order state with respect to the leaflet Parameters ---------- leaflet : int leaflet index step: int step index Returns ------- order_states : numpy.ndarray Numpy array contains order state results of the leaflet at step in order of system's residues """ #Init two empty lists for ... temp = [] #... order states prediction idxs = [] #... indices of lipids #Iterate over lipids for res, data in self.results.train_data_per_type.items(): #Get indices from resids (e.g. resid 1, is index 0) idx = self.get_residue_idx(self.resids_index_map, data[0]) #Store the indices of the residues in the current leaflet at the current step idxs.append(idx[self.leaflet_assignment[idx, step] == leaflet]) #Get the predicted HMM order states for the residues in the current leaflet at the current step temp.append(self.results["HMM_Pred"][res][:, step][self.leaflet_assignment[idx, step] == leaflet]) #Sort the obtained indices sorted_idxs = np.argsort( np.concatenate(idxs) ) #It is not always the case that the lipid order is equal (e.g., depending on resids of cholesterol for example). Here the order states are sorted #so that they correspond to the resids in the leaflet selection -> With that they should fit to the order of the lipids in the weight matrix order_states = np.concatenate(temp)[sorted_idxs] return order_states
[docs] def get_leaflet_step_order_index(self, leaflet, step): """ Receive residue's indexes and positions for a specific leaflet at any frame of the trajectory. Parameters ---------- leaflet : int leaflet index step: int step index Returns ------- indexes : dict dictionary contains numpy.arrays containing residue's indexes for each unique residue type positions : dict dictionary contains numpy.arrays containing residue's positions for each unique residue type """ leaflet_assignment_step = self.leaflet_assignment[:, step] leaflet_assignment_mask = leaflet_assignment_step == leaflet indexes = {} positions = {} # Go to correct frame of the trajectory self.universe.trajectory[self.start:self.stop:self.step][step] for res in self.unique_resnames: indexes[res] = np.where(self.membrane.residues[leaflet_assignment_mask].resnames == res)[0] if res in self.heads.keys(): positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms( f"resname {res} and name {self.heads[res]}")).positions else: positions[res] = (self.membrane.residues[leaflet_assignment_mask].atoms & self.universe.select_atoms( f"resname {res} and name {self.sterol_heads[res]}")).positions return indexes, positions
[docs] def get_leaflet_step_index_to_resid(self, leaflet, step): """ Returns leaflet index to residue id map for a specific leaflet at specific frame of the trajectory Parameters ---------- leaflet : int leaflet index step: int step index Returns ------- result_map : dict dictionary contains leaflet index of residue as key and residue id as value """ leaflet_assignment_step = self.leaflet_assignment[:, step] leaflet_assignment_mask = leaflet_assignment_step == leaflet result_map = {} for resname in self.unique_resnames: sys_index = np.where(self.membrane.residues.resnames == resname)[0] sys_index =sys_index[leaflet_assignment_mask[sys_index]] leaflet_index = np.where(self.membrane.residues[leaflet_assignment_mask].resnames == resname)[0] for i in range(0, len(leaflet_index)): result_map[leaflet_index[i]] = self.index_resid_map[sys_index[i]] return result_map