Coverage for encodermap/moldata/moldata.py: 22%
46 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-07 11:05 +0000
« prev ^ index » next coverage.py v7.1.0, created at 2023-02-07 11:05 +0000
1# -*- coding: utf-8 -*-
2# encodermap/moldata/moldata.py
3################################################################################
4# Encodermap: A python library for dimensionality reduction.
5#
6# Copyright 2019-2022 University of Konstanz and the Authors
7#
8# Authors:
9# Kevin Sawade, Tobias Lemke
10#
11# Encodermap is free software: you can redistribute it and/or modify
12# it under the terms of the GNU Lesser General Public License as
13# published by the Free Software Foundation, either version 2.1
14# of the License, or (at your option) any later version.
15# This package is distributed in the hope that it will be useful to other
16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18# See the GNU Lesser General Public License for more details.
19#
20# See <http://www.gnu.org/licenses/>.
21################################################################################
23"""New MolData class. Uses PyEMMA to calculate many trajectories in Parallel.
25Even when the set of trajectories or even collective variables is too large to keep in memory.
27Allows creation of tfrecord files to pass large datasets to tensorflow that normally won't fit into memory.
29Is Backwards-compatible to the old MolData class.
31ToDo:
32 * Add tfrecord capabilities
35"""
37##############################################################################
38# Imports
39##############################################################################
41import numpy as np
43from .._optional_imports import _optional_import
44from ..encodermap_tf1.moldata import MolData
45from ..loading import Featurizer
46from ..trajinfo.info_all import TrajEnsemble
47from ..trajinfo.info_single import SingleTraj
49##############################################################################
50# Optional Imports
51##############################################################################
54mda = _optional_import("MDAnalysis")
55md = _optional_import("mdtraj")
57##############################################################################
58# Globals
59##############################################################################
61__all__ = ["NewMolData"]
63##############################################################################
64# Public Classes
65##############################################################################
class NewMolData:
    """MolData version 2. Extracts and holds conformational information of trajectories.

    In version 2, you can either use MDAnalysis or the out-of-memory option using
    encodermap's new TrajEnsemble and SingleTraj classes.

    Collective Variables is a term used for data of some dimension matching the dimension of your trajectory.
    Collective variables of dimensionality 1 assign a single (float) value to every frame of a simulation or
    simulation ensemble. This could be the membership to a cluster, the distance between the termini of a
    protein or the distance between two spin labels. Collective variables of dimensionality 2
    assign a list of floats to every simulation frame. The backbone torsions are such a collective variable.
    A flattened array of pairwise distances between CA atoms would also fall into this category. CVs of
    dimensionality 3 ascribe a value to every atom in every frame. This could be the xyz-coordinates of the atom
    or the beta-factor or the charge.

    Encodermap in its Angle-Dihedral-Cartesian mode uses the following collective variables:
        * cartesians: The xyz-coordinates of every atom in every frame in every trajectory.
        * central_cartesians: The xyz-coordinates of the backbone C, CA, N atoms.
        * dihedrals: The omega-phi-psi angles of the backbone.
        * angles: The angles between the central_cartesian atoms.
        * lengths: The distances between the central_cartesian atoms.
        * sidedihedrals: The dihedrals of the sidechains in order residue1-chi1-chi5 residue2-chi1-chi5.

    """

    # Attributes that ``__iadd__`` concatenates, in the order they are processed.
    _CV_ATTRIBUTES = (
        "cartesians",
        "central_cartesians",
        "dihedrals",
        "sidedihedrals",
        "angles",
        "lengths",
    )

    def __init__(
        self,
        trajs,
        cache_path="",
        top=None,
        write_traj=False,
        fmt=".nc",
        start=None,
        stop=None,
        step=None,
    ):
        """Instantiate the MolData Class.

        The trajs parameter can take a number of possible inputs:
            * MDAnalysis.AtomGroup: Ensuring backwards-compatibility to the old MolData class.
            * em.TrajEnsemble: EncoderMap's TrajEnsemble class which keeps track of frames and collective
                variables.
            * em.SingleTraj: A single trajectory, which is converted into an ensemble.
            * str or list of str: If you don't want to bother yourself with the TrajEnsemble class you can
                pass a filename or a list of filenames of trajectory files (.xtc, .dcd, .h5). Make sure
                to also provide a topology in case of non-topology trajectories.

        Args:
            trajs (Union[MDAnalysis.AtomGroup, encodermap.TrajEnsemble, encodermap.SingleTraj, str, list]):
                The trajectories to load. Can be either one of the following:
                * MDAnalysis.AtomGroup. For Backwards-compatibility.
                * encodermap.TrajEnsemble. New TrajEnsemble class which manages frames and collective variables.
                * encodermap.SingleTraj. Turned into a TrajEnsemble internally.
                * str or list of str: Trajectory file(s); don't forget to provide a topology.
            cache_path (str, optional): Where to save generated Data to. When an empty string is provided the
                collective variables are loaded without a cache directory. Defaults to '' (empty string).
            top (Union[str, mdtraj.Topology, None], optional): The topology of trajs in case trajs is a str or
                list of str. It is forwarded to TrajEnsemble. Defaults to None.
            write_traj (bool, optional): Whether to include the trajectory (+topology) into the NetCDF-HDF5 file.
                NOTE: accepted for interface stability, but currently not used by the constructor.
            fmt (str, optional): The format to save the CVs as. Can be either '.npy' or '.nc'. Defaults to '.nc'.
                NOTE: accepted for interface stability, but currently not used by the constructor.
            start (Union[int, None], optional): First frame to analyze. Only forwarded to the old MolData class
                when an AtomGroup is provided.
            stop (Union[int, None], optional): Last frame to analyze. Only forwarded to the old MolData class
                when an AtomGroup is provided.
            step (Union[int, None], optional): Step provided to the old MolData class. Only used when an
                AtomGroup is provided.

        Raises:
            TypeError: If trajs is none of the supported input types.

        Examples:
            >>> from encodermap.moldata.moldata import NewMolData  # doctest: +SKIP
            >>> moldata = NewMolData(["traj1.xtc", "traj2.xtc"], top="topology.pdb")  # doctest: +SKIP

        """
        if isinstance(trajs, mda.AtomGroup):
            # Legacy path. Rebinding ``self`` inside ``__init__`` has no effect on
            # the object being constructed, so build the old MolData and adopt its
            # state instead.
            # NOTE(review): assumes the old MolData keeps its results as plain
            # instance attributes (no __slots__) - confirm.
            legacy = MolData(trajs, cache_path, start, stop, step)
            self.__dict__.update(vars(legacy))
            return

        if isinstance(trajs, str):
            trajs = [trajs]

        # Check the dedicated container types before scanning for strings, so
        # ensembles and single trajectories are never iterated needlessly.
        if isinstance(trajs, TrajEnsemble):
            self.trajs = trajs
        elif isinstance(trajs, SingleTraj):
            self.trajs = trajs._gen_ensemble()
        elif all(isinstance(i, str) for i in trajs):
            self.trajs = TrajEnsemble(trajs, top)
        else:
            raise TypeError(
                f"trajs musst be str, list, TrajEnsemble, SingleTraj, or mda.AtomGroup. You supplied {type(trajs)}"
            )

        if cache_path:
            # With a cache directory the featurizer is created with
            # in_memory=False and the CVs are loaded using that directory.
            feat = Featurizer(self.trajs, in_memory=False)
            feat.add_list_of_feats("all")
            self.trajs.load_CVs(feat, directory=cache_path)
        else:
            feat = Featurizer(self.trajs)
            feat.add_list_of_feats("all")
            self.trajs.load_CVs(feat)

        # Use the data from self.trajs
        self.cartesians = self.trajs.all_cartesians
        self.central_cartesians = self.trajs.central_cartesians
        self.dihedrals = self.trajs.central_dihedrals
        self.sidedihedrals = self.trajs.side_dihedrals
        self.angles = self.trajs.central_angles
        self.lengths = self.trajs.central_distances

    def __iadd__(self, other):
        """Append the collective variables of ``other`` to this instance in place.

        Every attribute listed in ``_CV_ATTRIBUTES`` is concatenated with the
        attribute of the same name on ``other`` along axis 0.

        Args:
            other: An object providing the same collective-variable attributes.

        Returns:
            NewMolData: This instance, so that ``a += b`` rebinds ``a`` to itself.

        """
        for name in self._CV_ATTRIBUTES:
            merged = np.concatenate(
                [getattr(self, name), getattr(other, name)], axis=0
            )
            setattr(self, name, merged)
        return self

    def write_tfrecords(self, path=None):
        """Placeholder for writing tfrecord files; not implemented yet.

        Args:
            path: Currently ignored.

        Returns:
            None

        """
        # TODO: Add tfrecord capabilities (see the module docstring).
        pass