Coverage for encodermap/moldata/moldata.py: 22%

46 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-02-07 11:05 +0000

1# -*- coding: utf-8 -*- 

2# encodermap/moldata/moldata.py 

3################################################################################ 

4# Encodermap: A python library for dimensionality reduction. 

5# 

6# Copyright 2019-2022 University of Konstanz and the Authors 

7# 

8# Authors: 

9# Kevin Sawade, Tobias Lemke 

10# 

11# Encodermap is free software: you can redistribute it and/or modify 

12# it under the terms of the GNU Lesser General Public License as 

13# published by the Free Software Foundation, either version 2.1 

14# of the License, or (at your option) any later version. 

15# This package is distributed in the hope that it will be useful to other 

16# researches. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the 

17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 

18# See the GNU Lesser General Public License for more details. 

19# 

20# See <http://www.gnu.org/licenses/>. 

21################################################################################ 

22 

23"""New MolData class. Uses PyEMMA to calculate many trajectories in Parallel. 

24 

25Even when the set of trajectories or even collective variables is too large to keep in memory. 

26 

27Allows creation of tfrecord files to pass large datasets to tensorflow that normally won't fit into memory. 

28 

29Is Backwards-compatible to the old MolData class. 

30 

31ToDo: 

32 * Add tfrecord capabilities 

33 

34 

35""" 

36 

37############################################################################## 

38# Imports 

39############################################################################## 

40 

41import numpy as np 

42 

43from .._optional_imports import _optional_import 

44from ..encodermap_tf1.moldata import MolData 

45from ..loading import Featurizer 

46from ..trajinfo.info_all import TrajEnsemble 

47from ..trajinfo.info_single import SingleTraj 

48 

49############################################################################## 

50# Optional Imports 

51############################################################################## 

52 

53 

54mda = _optional_import("MDAnalysis") 

55md = _optional_import("mdtraj") 

56 

57############################################################################## 

58# Globals 

59############################################################################## 

60 

61__all__ = ["NewMolData"] 

62 

63############################################################################## 

64# Public Classes 

65############################################################################## 

66 

67 

68class NewMolData: 

69 """MolData version 2. Extracts and holds conformational information of trajectories. 

70 

71 In version 2. You can either use MDAnalysis or the out-of memory option using 

72 encodermap's new TrajEnsemble and SingleTraj classes. 

73 

74 Collective Variables is a term used for data of some dimension matching the dimension of your trajectory. 

75 Collective variables of dimensionality 1 assign a single (float) value to every frame of a simulation or 

76 simulation ensemble. This could the the membership to a cluster, the distance between the termini of a 

77 protein or the distance between two spin labels. Collective variables of dimensionality 2 

78 assign a list of floats to every simulation frame. The backbone torsions are such a collective variable. 

79 A flattened array of pairwise distances between CA atoms would also fall into this category. CVs of 

80 dimensionality 3 ascribe a value to every atom in every frame. This could be the xyz-coordinates of the atom 

81 or the beta-factor or the charge. 

82 

83 Encodermap in its Angle-Dihedral-Cartesioan mode uses the following collective variables: 

84 * cartesians: The xyz-coordinates of every atom in every frame in every trajectory. 

85 * central_cartesians: The xyz-coordinates of the backbone C, CA, N atoms. 

86 * dihedrals: The omega-phi-psi angles of the backbone. 

87 * angles: The angles between the central_cartesian atoms. 

88 * lengths: The distances between the central_cartesian atoms. 

89 * sidedihedrals: The dihedrals of the sidechains in order residue1-chi1-chi5 residue2-ch1-chi5. 

90 

91 """ 

92 

93 def __init__( 

94 self, 

95 trajs, 

96 cache_path="", 

97 top=None, 

98 write_traj=False, 

99 fmt=".nc", 

100 start=None, 

101 stop=None, 

102 step=None, 

103 ): 

104 """Instantiate the MolData Class. 

105 

106 The trajs parameter can take a number of possible inputs: 

107 * MDAnalysis.AtomGroup: Ensuing backwards-compatibility to the old MolData class. 

108 * em.TrajEnsemble: EncoderMap's TrajEnsemble class which keeps track of frames and collective 

109 variables. 

110 * list of str: If you don't want to bother yourself with the TrajEnsemble class you can pass a 

111 list of str giving the filenames of many trajetcory files (.xtc, .dcd, .h5). Make sure 

112 to also provide a topology in case of non-topology trajectories. 

113 

114 Args: 

115 trajs (Union[MDAnalysis.AtomGroup, encodermap.TrajEnsemble, list]): The trajectories to load. 

116 Can be either one of the following: 

117 * MDAnalysis.AtomGroup. For Backwards-compatibility. 

118 * encodermap.TrajEnsemble. New TrajEnsemble class which manages frames and collective variables. 

119 * list: Simply provide a list of trajectory files and don't forget to provide a topology. 

120 cache_path (str, optional): Where to save generated Data to. Saves either numpy arrays (when AtomGroup 

121 is provided as trajs, or fmt is '.npy') or NetCDF-HDF5 files with xarray (fmt is '.nc'). When an 

122 empty string is provided nothing is written to disk. Defaults to '' (empty string). 

123 top (Union[str, mdtraj.Topology, None], optional): The topology of trajs in case trajs is a list of str. 

124 Can take filename of a topology file or already loaded mdtraj.Topology. Defaults to None. 

125 write_traj (bool, optional): Whether to include the trajectory (+topology) into the NetCDF-HDF5 file. 

126 This option only works in conjunction with fmt='.nc' and if set to True will use mdtraj to write the 

127 trajectory, topology and the collective variables to one comprehensive file. 

128 fmt (str, optional): The format to save the CVs as. Can be either '.npy' or '.nc'. Defaults to '.nc'. 

129 The default is NetCDF-HDF5, because these files can be read iteratively and such can be larger 

130 than memory allows. This helps in the construction of tfrecord files that can also be used to train 

131 a network with large datasets. 

132 start (Union[int, None], optional): First frame to analyze. Is there for backwards-compatibility. This 

133 feature is dropped in the newer TrajEnsemble pipeline. 

134 stop (Union[int, None], optional): Last frame to analyze. Is there for backwards-compatibility. This 

135 feature is dropped in the newer TrajEnsemble pipeline. 

136 step (Union[int, None], optional): Step provided to old MolData class. Is there for backwards-compatibility. 

137 This feature is dropped in the newer TrajEnsemble pipeline. 

138 

139 Examples: 

140 >>> import encodermap as em 

141 >>> traj = 

142 

143 """ 

144 if isinstance(trajs, mda.AtomGroup): 

145 self = MolDatav1(trajs, cache_path, start, stop, step) 

146 return 

147 if isinstance(trajs, str): 

148 trajs = [trajs] 

149 if all([isinstance(i, str) for i in trajs]): 

150 self.trajs = TrajEnsemble(trajs, tops) 

151 elif isinstance(trajs, TrajEnsemble): 

152 self.trajs = trajs 

153 elif isinstance(trajs, SingleTraj): 

154 self.trajs = trajs._gen_ensemble() 

155 else: 

156 raise TypeError( 

157 f"trajs musst be str, list, TrajEnsemble, SingleTraj, or mda.AtomGroup. You supplied {type(trajs)}" 

158 ) 

159 

160 if cache_path: 

161 feat = Featurizer(self.trajs, in_memory=False) 

162 feat.add_list_of_feats("all") 

163 self.trajs.load_CVs(feat, directory=cache_path) 

164 else: 

165 feat = Featurizer(self.trajs) 

166 feat.add_list_of_feats("all") 

167 self.trajs.load_CVs(feat) 

168 

169 # Use the data from self.trajs 

170 self.cartesians = self.trajs.all_cartesians 

171 self.central_cartesians = self.trajs.central_cartesians 

172 self.dihedrals = self.trajs.central_dihedrals 

173 self.sidedihedrals = self.trajs.side_dihedrals 

174 self.angles = self.trajs.central_angles 

175 self.lengths = self.trajs.central_distances 

176 

177 def __iadd__(self, other): 

178 self.cartesians = np.concatenate([self.cartesians, other.cartesians], axis=0) 

179 self.central_cartesians = np.concatenate( 

180 [self.central_cartesians, other.central_cartesians], axis=0 

181 ) 

182 self.dihedrals = np.concatenate([self.dihedrals, other.dihedrals], axis=0) 

183 self.sidedihedrals = np.concatenate( 

184 [self.sidedihedrals, other.sidedihedrals], axis=0 

185 ) 

186 self.angles = np.concatenate([self.angles, other.angles], axis=0) 

187 self.lengths = np.concatenate([self.lengths, other.lengths], axis=0) 

188 return self 

189 

190 def write_tfrecords(self, path=None): 

191 """Todo""" 

192 pass