Coverage for encodermap/moldata/moldata.py: 22%
46 statements
« prev ^ index » next coverage.py v7.4.1, created at 2024-12-31 16:54 +0100
« prev ^ index » next coverage.py v7.4.1, created at 2024-12-31 16:54 +0100
1# -*- coding: utf-8 -*-
2# encodermap/moldata/moldata.py
3################################################################################
4# EncoderMap: A python library for dimensionality reduction.
5#
6# Copyright 2019-2024 University of Konstanz and the Authors
7#
8# Authors:
9# Kevin Sawade, Tobias Lemke
10#
11# Encodermap is free software: you can redistribute it and/or modify
12# it under the terms of the GNU Lesser General Public License as
13# published by the Free Software Foundation, either version 2.1
14# of the License, or (at your option) any later version.
15# This package is distributed in the hope that it will be useful to other
16# researches. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18# See the GNU Lesser General Public License for more details.
19#
20# See <http://www.gnu.org/licenses/>.
21################################################################################
23"""New MolData class. Uses PyEMMA to calculate many trajectories in Parallel.
25Even when the set of trajectories or even collective variables is too large to keep in memory.
27Allows creation of tfrecord files to pass large datasets to tensorflow that normally won't fit into memory.
29Is Backwards-compatible to the old MolData class.
33"""
35################################################################################
36# Imports
37################################################################################
39# Third Party Imports
40import numpy as np
41from optional_imports import _optional_import
43# Encodermap imports
44from encodermap.encodermap_tf1.moldata import MolData as MolDatav1
45from encodermap.loading.featurizer import Featurizer
46from encodermap.trajinfo.info_all import TrajEnsemble
47from encodermap.trajinfo.info_single import SingleTraj
50################################################################################
51# Optional Imports
52################################################################################
55mda = _optional_import("MDAnalysis")
56md = _optional_import("mdtraj")
59################################################################################
60# Globals
61################################################################################
64__all__: list[str] = ["NewMolData"]
67################################################################################
68# Public Classes
69################################################################################
class NewMolData:
    """MolData version 2. Extracts and holds conformational information of trajectories.

    In version 2 you can either use MDAnalysis or the out-of-memory option using
    EncoderMap's new TrajEnsemble and SingleTraj classes.

    Collective Variables is a term used for data of some dimension matching the
    dimension of your trajectory. Collective variables of dimensionality 1 assign
    a single (float) value to every frame of a simulation or simulation ensemble.
    This could be the membership to a cluster, the distance between the termini
    of a protein or the distance between two spin labels. Collective variables of
    dimensionality 2 assign a list of floats to every simulation frame. The
    backbone torsions are such a collective variable. A flattened array of
    pairwise distances between CA atoms would also fall into this category. CVs
    of dimensionality 3 ascribe a value to every atom in every frame. This could
    be the xyz-coordinates of the atom or the beta-factor or the charge.

    Encodermap in its Angle-Dihedral-Cartesian mode uses the following
    collective variables:
        * cartesians: The xyz-coordinates of every atom in every frame in every trajectory.
        * central_cartesians: The xyz-coordinates of the backbone C, CA, N atoms.
        * dihedrals: The omega-phi-psi angles of the backbone.
        * angles: The angles between the central_cartesian atoms.
        * lengths: The distances between the central_cartesian atoms.
        * sidedihedrals: The dihedrals of the sidechains in order residue1-chi1-chi5 residue2-chi1-chi5.

    """

    # Attributes that hold collective variables. ``__iadd__`` merges exactly
    # these, in this order.
    _CV_ATTRIBUTES = (
        "cartesians",
        "central_cartesians",
        "dihedrals",
        "sidedihedrals",
        "angles",
        "lengths",
    )

    def __init__(
        self,
        trajs,
        cache_path="",
        top=None,
        write_traj=False,
        fmt=".nc",
        start=None,
        stop=None,
        step=None,
    ):
        """Instantiate the MolData Class.

        The trajs parameter can take a number of possible inputs:
            * MDAnalysis.AtomGroup: Ensuring backwards-compatibility to the old MolData class.
            * em.TrajEnsemble: EncoderMap's TrajEnsemble class which keeps track of frames and
                collective variables.
            * em.SingleTraj: A single trajectory, which is converted to an ensemble.
            * str or list of str: If you don't want to bother yourself with the TrajEnsemble class
                you can pass a filename or a list of filenames of trajectory files
                (.xtc, .dcd, .h5). Make sure to also provide a topology in case of
                non-topology trajectories.

        Args:
            trajs (Union[MDAnalysis.AtomGroup, encodermap.TrajEnsemble, encodermap.SingleTraj, str, list]):
                The trajectories to load. See the list above.
            cache_path (str, optional): Where to save generated data to. When an empty string is
                provided nothing is written to disk. Defaults to '' (empty string).
            top (Union[str, mdtraj.Topology, None], optional): The topology of trajs in case trajs
                is a str or list of str. Defaults to None.
            write_traj (bool, optional): Whether to include the trajectory (+topology) into the
                NetCDF-HDF5 file. Accepted for interface stability; this constructor does not
                read it.
            fmt (str, optional): The format to save the CVs as ('.npy' or '.nc'). Defaults to '.nc'.
                Accepted for interface stability; this constructor does not read it.
            start (Union[int, None], optional): First frame to analyze. Only forwarded to the old
                MolData class when an AtomGroup is provided.
            stop (Union[int, None], optional): Last frame to analyze. Only forwarded to the old
                MolData class when an AtomGroup is provided.
            step (Union[int, None], optional): Step provided to the old MolData class. Only
                forwarded when an AtomGroup is provided.

        Raises:
            TypeError: If trajs is none of the supported input types.

        """
        if isinstance(trajs, mda.AtomGroup):
            # Rebinding ``self`` inside ``__init__`` has no effect on the object
            # handed back to the caller, so the legacy object's instance
            # attributes are copied over instead.
            # NOTE(review): assumes the old MolData keeps its data as plain
            # instance attributes (i.e. in ``__dict__``) - confirm.
            legacy = MolDatav1(trajs, cache_path, start, stop, step)
            self.__dict__.update(vars(legacy))
            return

        if isinstance(trajs, str):
            trajs = [trajs]

        # The concrete EncoderMap types are checked before the generic
        # "iterable of str" test so they are not iterated needlessly.
        if isinstance(trajs, TrajEnsemble):
            self.trajs = trajs
        elif isinstance(trajs, SingleTraj):
            self.trajs = trajs._gen_ensemble()
        elif self._is_str_iterable(trajs):
            # The topology is the ``top`` parameter of this constructor.
            self.trajs = TrajEnsemble(trajs, top)
        else:
            raise TypeError(
                f"trajs musst be str, list, TrajEnsemble, SingleTraj, or mda.AtomGroup. You supplied {type(trajs)}"
            )

        # With a cache path the features are not kept in memory and the
        # directory is handed to ``load_CVs``; otherwise defaults are used.
        if cache_path:
            feat = Featurizer(self.trajs, in_memory=False)
            load_kwargs = {"directory": cache_path}
        else:
            feat = Featurizer(self.trajs)
            load_kwargs = {}
        feat.add_list_of_feats("all")
        self.trajs.load_CVs(feat, **load_kwargs)

        # Use the data from self.trajs
        self.cartesians = self.trajs.all_cartesians
        self.central_cartesians = self.trajs.central_cartesians
        self.dihedrals = self.trajs.central_dihedrals
        self.sidedihedrals = self.trajs.side_dihedrals
        self.angles = self.trajs.central_angles
        self.lengths = self.trajs.central_distances

    @staticmethod
    def _is_str_iterable(obj):
        """Return True if obj can be iterated and every item is a str."""
        try:
            return all(isinstance(item, str) for item in obj)
        except TypeError:
            # Not iterable at all; the caller reports the unsupported type.
            return False

    def __iadd__(self, other):
        """Append the collective variables of other to this instance.

        Every attribute named in ``_CV_ATTRIBUTES`` is concatenated along
        axis 0. All concatenations are computed before any attribute is
        reassigned, so a failure (e.g. mismatching shapes) leaves this
        instance unchanged. Only the collective-variable attributes are
        touched.

        Args:
            other: An object providing the same collective-variable attributes.

        Returns:
            NewMolData: This instance, holding the merged data.

        """
        merged = {
            name: np.concatenate([getattr(self, name), getattr(other, name)], axis=0)
            for name in self._CV_ATTRIBUTES
        }
        for name, values in merged.items():
            setattr(self, name, values)
        return self

    def write_tfrecords(self, path=None):
        """Placeholder for writing tfrecord files. Currently does nothing.

        Args:
            path (Union[str, None], optional): Currently unused. Defaults to None.

        """
        pass