Coverage for encodermap/moldata/moldata.py: 22%

46 statements  

« prev     ^ index     » next       coverage.py v7.4.1, created at 2024-12-31 16:54 +0100

1# -*- coding: utf-8 -*- 

2# encodermap/moldata/moldata.py 

3################################################################################ 

4# EncoderMap: A python library for dimensionality reduction. 

5# 

6# Copyright 2019-2024 University of Konstanz and the Authors 

7# 

8# Authors: 

9# Kevin Sawade, Tobias Lemke 

10# 

11# Encodermap is free software: you can redistribute it and/or modify 

12# it under the terms of the GNU Lesser General Public License as 

13# published by the Free Software Foundation, either version 2.1 

14# of the License, or (at your option) any later version. 

15# This package is distributed in the hope that it will be useful to other 

16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the 

17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 

18# See the GNU Lesser General Public License for more details. 

19# 

20# See <http://www.gnu.org/licenses/>. 

21################################################################################ 

22 

23"""New MolData class. Uses PyEMMA to calculate many trajectories in Parallel. 

24 

25This works even when the set of trajectories or collective variables is too large to keep in memory. 

26 

27Allows creation of tfrecord files to pass large datasets to tensorflow that normally won't fit into memory. 

28 

29Is backwards-compatible with the old MolData class. 

30 

31 

32 

33""" 

34 

35################################################################################ 

36# Imports 

37################################################################################ 

38 

39# Third Party Imports 

40import numpy as np 

41from optional_imports import _optional_import 

42 

43# Encodermap imports 

44from encodermap.encodermap_tf1.moldata import MolData as MolDatav1 

45from encodermap.loading.featurizer import Featurizer 

46from encodermap.trajinfo.info_all import TrajEnsemble 

47from encodermap.trajinfo.info_single import SingleTraj 

48 

49 

50################################################################################ 

51# Optional Imports 

52################################################################################ 

53 

54 

55mda = _optional_import("MDAnalysis") 

56md = _optional_import("mdtraj") 

57 

58 

59################################################################################ 

60# Globals 

61################################################################################ 

62 

63 

64__all__: list[str] = ["NewMolData"] 

65 

66 

67################################################################################ 

68# Public Classes 

69################################################################################ 

70 

71 

class NewMolData:
    """MolData version 2. Extracts and holds conformational information of trajectories.

    In version 2, you can either use MDAnalysis or the out-of-memory option using
    EncoderMap's new TrajEnsemble and SingleTraj classes.

    Collective Variables is a term used for data of some dimension matching the dimension of your trajectory.
    Collective variables of dimensionality 1 assign a single (float) value to every frame of a simulation or
    simulation ensemble. This could be the membership to a cluster, the distance between the termini of a
    protein or the distance between two spin labels. Collective variables of dimensionality 2
    assign a list of floats to every simulation frame. The backbone torsions are such a collective variable.
    A flattened array of pairwise distances between CA atoms would also fall into this category. CVs of
    dimensionality 3 ascribe a value to every atom in every frame. This could be the xyz-coordinates of the atom
    or the beta-factor or the charge.

    Encodermap in its Angle-Dihedral-Cartesian mode uses the following collective variables:
        * cartesians: The xyz-coordinates of every atom in every frame in every trajectory.
        * central_cartesians: The xyz-coordinates of the backbone C, CA, N atoms.
        * dihedrals: The omega-phi-psi angles of the backbone.
        * angles: The angles between the central_cartesian atoms.
        * lengths: The distances between the central_cartesian atoms.
        * sidedihedrals: The dihedrals of the sidechains in order residue1-chi1-chi5 residue2-chi1-chi5.

    """

    # Names of the per-frame arrays this class exposes. `__iadd__` walks this
    # tuple so that all six arrays are always extended together.
    _CV_ATTRIBUTES = (
        "cartesians",
        "central_cartesians",
        "dihedrals",
        "sidedihedrals",
        "angles",
        "lengths",
    )

    def __init__(
        self,
        trajs,
        cache_path="",
        top=None,
        write_traj=False,
        fmt=".nc",
        start=None,
        stop=None,
        step=None,
    ):
        """Instantiate the MolData Class.

        The trajs parameter can take a number of possible inputs:
            * MDAnalysis.AtomGroup: Ensuring backwards-compatibility to the old MolData class.
            * em.TrajEnsemble: EncoderMap's TrajEnsemble class which keeps track of frames and collective
                variables.
            * em.SingleTraj: A single trajectory, which is converted to a TrajEnsemble.
            * str or list of str: If you don't want to bother yourself with the TrajEnsemble class you can pass
                a list of str giving the filenames of many trajectory files (.xtc, .dcd, .h5). Make sure
                to also provide a topology in case of non-topology trajectories.

        Args:
            trajs (Union[MDAnalysis.AtomGroup, encodermap.TrajEnsemble, encodermap.SingleTraj, str, list]):
                The trajectories to load. Can be either one of the following:
                    * MDAnalysis.AtomGroup. For Backwards-compatibility.
                    * encodermap.TrajEnsemble. New TrajEnsemble class which manages frames and collective variables.
                    * encodermap.SingleTraj. Converted to a TrajEnsemble.
                    * str or list of str: Trajectory file(s); don't forget to provide a topology.
            cache_path (str, optional): Where to save generated Data to. When an empty string is provided
                the collective variables are loaded without a target directory.
                Defaults to '' (empty string).
            top (Union[str, mdtraj.Topology, None], optional): The topology of trajs in case trajs is a list of str.
                It is forwarded to TrajEnsemble together with the filenames. Defaults to None.
            write_traj (bool, optional): Intended to include the trajectory (+topology) in the NetCDF-HDF5 file.
                NOTE: this argument is accepted but currently not read by this constructor.
            fmt (str, optional): Intended format to save the CVs as ('.npy' or '.nc'). Defaults to '.nc'.
                NOTE: this argument is accepted but currently not read by this constructor.
            start (Union[int, None], optional): First frame to analyze. Only forwarded to the old MolData
                class when trajs is an MDAnalysis.AtomGroup.
            stop (Union[int, None], optional): Last frame to analyze. Only forwarded to the old MolData
                class when trajs is an MDAnalysis.AtomGroup.
            step (Union[int, None], optional): Step provided to old MolData class. Only forwarded to the
                old MolData class when trajs is an MDAnalysis.AtomGroup.

        Raises:
            TypeError: If trajs is none of the supported input types.

        """
        if isinstance(trajs, mda.AtomGroup):
            # Rebinding `self` inside `__init__` has no effect on the object the
            # caller receives, so the legacy instance's state is copied over.
            # NOTE(review): assumes MolDatav1 keeps its data in the instance
            # `__dict__` (no `__slots__`) -- confirm against the tf1 class.
            legacy = MolDatav1(trajs, cache_path, start, stop, step)
            self.__dict__.update(vars(legacy))
            return

        if isinstance(trajs, str):
            trajs = [trajs]

        # Check the project containers first so they are never iterated merely
        # to sniff the type of their elements.
        if isinstance(trajs, TrajEnsemble):
            self.trajs = trajs
        elif isinstance(trajs, SingleTraj):
            self.trajs = trajs._gen_ensemble()
        else:
            try:
                # Materialize once so one-shot iterables survive the type check.
                filenames = list(trajs)
            except TypeError:
                filenames = None
            if filenames is None or not all(
                isinstance(name, str) for name in filenames
            ):
                raise TypeError(
                    f"trajs musst be str, list, TrajEnsemble, SingleTraj, or mda.AtomGroup. You supplied {type(trajs)}"
                )
            self.trajs = TrajEnsemble(filenames, top)

        if cache_path:
            feat = Featurizer(self.trajs, in_memory=False)
            feat.add_list_of_feats("all")
            self.trajs.load_CVs(feat, directory=cache_path)
        else:
            feat = Featurizer(self.trajs)
            feat.add_list_of_feats("all")
            self.trajs.load_CVs(feat)

        # Use the data from self.trajs
        self.cartesians = self.trajs.all_cartesians
        self.central_cartesians = self.trajs.central_cartesians
        self.dihedrals = self.trajs.central_dihedrals
        self.sidedihedrals = self.trajs.side_dihedrals
        self.angles = self.trajs.central_angles
        self.lengths = self.trajs.central_distances

    def __iadd__(self, other):
        """Append the collective variables of `other` to this instance in place.

        The six arrays (cartesians, central_cartesians, dihedrals, sidedihedrals,
        angles, lengths) are concatenated along axis 0. Only these arrays are
        extended; a `trajs` attribute, if present, is left untouched.

        Args:
            other: An object exposing the same six array attributes.

        Returns:
            NewMolData: This instance, as required for the `+=` operator.

        """
        # Build every concatenation before assigning anything, so a failure
        # (e.g. mismatching shapes) cannot leave this instance half-updated.
        merged = {
            name: np.concatenate([getattr(self, name), getattr(other, name)], axis=0)
            for name in self._CV_ATTRIBUTES
        }
        for name, values in merged.items():
            setattr(self, name, values)
        return self

    def write_tfrecords(self, path=None):
        """Placeholder for writing tfrecord files; not implemented yet.

        Args:
            path: Currently ignored.

        Returns:
            None: The method performs no work at the moment.

        """
        # TODO: implement the tfrecord export.
        return None

192 pass