Coverage for encodermap/misc/xarray.py: 7%
266 statements
coverage.py v7.4.1, created at 2024-12-31 16:54 +0100
1# -*- coding: utf-8 -*-
2# encodermap/misc/xarray.py
3################################################################################
4# EncoderMap: A python library for dimensionality reduction.
5#
6# Copyright 2019-2024 University of Konstanz and the Authors
7#
8# Authors:
9# Kevin Sawade, Tobias Lemke
10#
11# Encodermap is free software: you can redistribute it and/or modify
12# it under the terms of the GNU Lesser General Public License as
13# published by the Free Software Foundation, either version 2.1
14# of the License, or (at your option) any later version.
15# This package is distributed in the hope that it will be useful to other
16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18# See the GNU Lesser General Public License for more details.
19#
20# See <http://www.gnu.org/licenses/>.
21################################################################################
22"""EncoderMap's xarray manipulation functions.
24EncoderMap uses xarray datasets to save CV data alongside trajectory data.
25These functions implement creation of such xarray datasets.
27"""
28################################################################################
29# Imports
30################################################################################
33# Future Imports at the top
34from __future__ import annotations
36# Standard Library Imports
37import itertools
38import warnings
40# Third Party Imports
41import numpy as np
42from optional_imports import _optional_import
44# Encodermap imports
45from encodermap.misc.misc import FEATURE_NAMES
48################################################################################
49# Optional Imports
50################################################################################
53xr = _optional_import("xarray")
56################################################################################
57# Typing
58################################################################################
61# Standard Library Imports
62from numbers import Number
63from typing import TYPE_CHECKING, Optional, Union, overload
66if TYPE_CHECKING:
67 # Third Party Imports
68 import xarray as xr
70 # Encodermap imports
71 from encodermap.loading.features import AnyFeature
72 from encodermap.loading.featurizer import Featurizer
73 from encodermap.trajinfo.info_single import SingleTraj
76################################################################################
77# Globals
78################################################################################
81__all__: list[str] = [
82 "construct_xarray_from_numpy",
83 "unpack_data_and_feature",
84]
87################################################################################
88# Functions
89################################################################################
92def _get_indexes_from_feat(f: "AnyFeature", traj: "SingleTraj") -> np.ndarray:
93 """Returns the indices of this feature.
95 This can be useful later to redo some analysis or use tensorflow
96 to implement the geometric operations in the compute graph.
98 Note:
99 Due to some PyEMMA legacy code, the indices are sometimes called indexes.
100 Furthermore, some PyEMMA features give the indices different names, like
101 'group_definitions'. Here, indices are integer values that align with
102 the atomic coordinates of a trajectory or ensemble of trajectories with
103 shared topology.
105 Args:
106 f (AnyFeature): The feature to extract the indices from. This has to
107 be a subclass of :class:`encodermap.loading.features.Feature`.
108 traj (SingleTraj): This argument has to be provided for the RMSD
109 features. We make it mandatory for the other feature types also
110 to keep the input consistent.
112 Returns:
113 np.ndarray: An integer array describing the atoms this feature uses
114 to compute the collective variables.
116 """
117 # Local Folder Imports
118 from ..loading.features import (
119 CustomFeature,
120 GroupCOMFeature,
121 MinRmsdFeature,
122 ResidueCOMFeature,
123 )
125 try:
126 return f.indexes
127 except AttributeError as e:
128 for key in f.__dir__():
129 if "inde" in key:
130 return np.asarray(f.__dict__[key])
131 if isinstance(f, (GroupCOMFeature, ResidueCOMFeature)):
132 return f.group_definitions
133 if isinstance(f, MinRmsdFeature):
134 if f.atom_indices is None:
135 return np.arange(traj.n_atoms)
136 else:
137 return f.atom_indices
138 if isinstance(f, CustomFeature):
139 descr = f.describe()
140 if any(["CustomFeature" in i for i in descr]):
141 return np.asarray(
142 [f"{d.split()[0]} INDEX {i}" for i, d in enumerate(descr)]
143 )
144 else:
145 raise Exception(
146 f"Can't decide on indexes for this custom feature:, "
147 f"{f.__class__.__name__=} {f.describe()[:2]=}"
148 ) from e
149 else:
150 raise e
153def _cast_to_int_maybe(a: np.ndarray) -> np.ndarray:
154 """Casts a np.array to int, if possible.
156 Args:
157 a (np.ndarray): The array.
159 Returns:
160 np.ndarray: The output, which can be int.
161 """
162 if not np.any(np.mod(a, 1)):
163 return a.astype(np.int32)
164 return a
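# Below is a small, hypothetical usage sketch of `_cast_to_int_maybe` (values
# chosen purely for illustration): arrays without any fractional part are cast
# to int32, all other arrays are returned unchanged.
#
#     >>> import numpy as np
#     >>> _cast_to_int_maybe(np.array([1.0, 2.0])).dtype
#     dtype('int32')
#     >>> _cast_to_int_maybe(np.array([0.5, 2.0])).dtype
#     dtype('float64')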
167def construct_xarray_from_numpy(
168 traj: SingleTraj,
169 data: np.ndarray,
170 name: str,
171 deg: bool = False,
172 labels: Optional[list[str]] = None,
173 check_n_frames: bool = False,
174) -> xr.DataArray:
175 """Constructs a `xarray.DataArray` from a numpy array.
177 Three cases are recognized:
178 * The input array in data has ndim == 2. This kind of feature/CV is a
179 per-frame feature, like the membership to clusters. Every frame of
180 every trajectory is assigned a single value (most often int values).
181 * The input array in data has ndim == 3: This is also a per-frame
182 feature/CV, but this time every frame is characterized by a series
183 of values. These values can be dihedral angles in the backbone
184 starting from the protein's N-terminus to the C-terminus, or
185 pairwise distance features between certain atoms. The xarray
186 DataArray constructed from this kind of data will have a label
187 dimension that will either contain generic labels like
188 'CUSTOM_FEATURE FEATURE 0' or labels defined by the featurizer,
189 such as 'SIDECHAIN ANGLE CHI1 OF RESIDUE 1LYS'.
190 * The input array in data has ndim == 4. Here, the same feature/CV is
191 duplicated for the protein's atoms. Besides the XYZ coordinates of
192 the atoms, no other CVs should fall into this case. The labels will be
193 2-dimensional with 'POSITION OF ATOM H1 IN RESIDUE 1LYS' in
194 dimension 0 and either 'X', 'Y' or 'Z' in dimension 1.
196 Args:
197 traj (em.SingleTraj): The trajectory we want to create the
198 `xarray.DataArray` for.
199 data (np.ndarray): The numpy array we want to create the data from.
200 Note that the data passed into this function should be expanded
201 by `np.expand_dims(a, axis=0)`, so as to add a new axis to the complete
202 data containing the trajectories of a trajectory ensemble.
203 name (str): The name of the feature. This can be chosen freely. Names
204 like 'central_angles', 'backbone_torsions' would make the most sense.
205 deg (bool): Whether provided data is in deg or radians.
206 labels (Optional[list]): If you have specific labels for your CVs in
207 mind, you can overwrite the generic 'CUSTOM_FEATURE FEATURE 0'
208 labels by providing a list for this argument. If None is provided,
209 generic names will be given to the features. Defaults to None.
210 check_n_frames (bool): Whether to check whether the number of frames in
211 the trajectory matches the length of the data in at least one
212 dimension. Defaults to False.
214 Returns:
215 xarray.DataArray: An `xarray.DataArray`.
217 Examples:
218 >>> import encodermap as em
219 >>> import numpy as np
220 >>> from encodermap.misc.xarray import construct_xarray_from_numpy
221 >>> # load file from RCSB and give it traj num to represent it in a
222 >>> # potential trajectory ensemble
223 >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)
224 >>> # single trajectory needs to be expanded into 'trajectory' axis
225 >>> z_coordinate = np.expand_dims(traj.xyz[:,:,0], 0)
226 >>> da = construct_xarray_from_numpy(traj, z_coordinate, 'z_coordinate')
227 >>> print(da.coords['Z_COORDINATE'].values[:2])
228 ['Z_COORDINATE FEATURE 0' 'Z_COORDINATE FEATURE 1']
229 >>> print(da.coords['traj_num'].values)
230 [1]
231 >>> print(da.attrs['time_units'])
232 ps
234 """
235 if check_n_frames:
236 if not any(s == traj.n_frames for s in data.shape):
237 raise Exception(
238 f"Can't add CV with name '{name}' to trajectory. The trajectory has "
239 f"{traj.n_frames} frames. The data has a shape of {data.shape} "
240 f"No dimension of data matches the frames of the trajectory."
241 )
242 if traj.backend == "no_load":
243 with_time = False
244 else:
245 with_time = True
246 if data.ndim == 2:
247 if labels is None:
248 labels = [f"{name.upper()} FEATURE"]
249 da, _ = make_frame_CV_dataarray(
250 labels, traj, name, data, deg=deg, with_time=with_time
251 )
252 elif data.ndim == 3:
253 if labels is None:
254 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])]
255 da, _ = make_dataarray(labels, traj, name, data, deg=deg, with_time=with_time)
256 elif data.ndim == 4:
257 if labels is None:
258 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-2])]
259 da = make_position_dataarray_from_numpy(
260 labels,
261 traj,
262 name,
263 data,
264 deg=deg,
265 with_time=with_time,
266 )
267 else:
268 raise Exception(
269 f"The provided data has a dimensionality of {data.ndim}, but only 2, 3 and 4 are supported."
270 )
271 return da
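# A hedged sketch of the ndim == 2 (per-frame feature) case described in the
# docstring above, complementing the ndim == 3 example given there. The
# cluster memberships are made up for illustration; only the shape handling
# is the point.
#
#     >>> import encodermap as em
#     >>> import numpy as np
#     >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)
#     >>> memberships = np.expand_dims(np.zeros(traj.n_frames, dtype=int), 0)
#     >>> da = construct_xarray_from_numpy(traj, memberships, 'cluster_membership')
#     >>> da.shape   # -> (1, traj.n_frames, 1)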
274def unpack_data_and_feature(
275 feat: Featurizer,
276 traj: SingleTraj,
277 input_data: np.ndarray,
278) -> xr.Dataset:
279 """Makes a `xarray.Dataset` from data and a featurizer.
281 Usually, if you add multiple features to a featurizer, they are
282 stacked along the feature axis. Let's say, you have a trajectory with 20 frames
283 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi).
284 If you then also add the end-to-end distance as a feature, the data returned by
285 the featurizer will have the shape (20, 7). This function separates the concatenated
286 data back into the individual features and returns them, together with their
287 indices, as an `xarray.Dataset`.
289 Args:
290 feat (encodermap.loading.Featurizer): An instance of the currently used `encodermap.loading.Featurizer`.
291 traj (encodermap.trajinfo.SingleTraj): An instance of `encodermap.SingleTraj`, that the data
292 in `input_data` was computed from.
293 input_data (np.ndarray): The data, as returned from PyEMMA.
295 Returns:
296 xarray.Dataset: An `xarray.Dataset` with all features in a nice format.
298 """
299 # Local Folder Imports
300 from ..trajinfo.trajinfo_utils import trajs_combine_attrs
302 # this needs to be done, because pyemma concatenates the data
303 # along the feature axis
304 indices = get_indices_by_feature_dim(feat, traj, input_data.shape)
306 DAs = {}
307 indexes = {}
308 for f, ind in zip(feat.features, indices):
309 data = input_data[:, ind]
310 if data.shape[-1] == 1 and f._dim != 1:
311 data = data.squeeze(-1)
313 # decide on the name
314 try:
315 name = FEATURE_NAMES[f.name]
316 except (KeyError, AttributeError):
317 if hasattr(f, "name"):
318 if isinstance(f.name, str):
319 name = f.name
320 if "mdtraj.trajectory" in f.name.lower():
321 f.name = f.__class__.__name__
322 name = f.__class__.__name__
323 else:
324 name = f.__class__.__name__
325 f.name = name
326 else:
327 name = f.__class__.__name__
328 if name == "CustomFeature":
329 name = f.describe()[0].split()[0]
330 f.name = name
331 assert data.shape[1] == f._dim
333 if data.shape[0] != len(traj):
334 if traj.index == (None,):
335 raise Exception(
336 "Shape of provided data does not fit traj. Traj "
337 f"has {traj.n_frames=} {len(traj)=}. Data has shape {data.shape}"
338 )
339 else:
340 data = data[traj.index].squeeze(0)
341 assert data.shape[0] == len(traj), f"{data.shape=}, {len(traj)=}"
342 if name in DAs:
343 name = (
344 name
345 + f"_{len(list(filter(lambda x: True if name in x else False, list(DAs.keys()))))}"
346 )
347 if hasattr(f, "deg"):
348 deg = f.deg
349 else:
350 deg = None
352 if (
353 f.name in ["AllCartesians", "CentralCartesians", "SideChainCartesians"]
354 or f.atom_feature
355 ):
356 data = data.reshape(len(traj), -1, 3)
357 data = np.expand_dims(data, axis=0)
358 if hasattr(feat, "indices_by_top"):
359 f.indexes = feat.indices_by_top[traj.top][f.name]
360 DAs[name], indexes[name + "_feature_indices"] = make_position_dataarray(
361 f.describe(), traj, name, data, deg=deg, feat=f
362 )
363 else:
364 data = np.expand_dims(data, axis=0)
365 if f._dim == 1:
366 DAs[name], indexes[name + "_feature_indices"] = make_frame_CV_dataarray(
367 f.describe(),
368 traj,
369 name,
370 data,
371 deg=deg,
372 feat=f,
373 )
374 else:
375 if hasattr(feat, "indices_by_top"):
376 f.indexes = feat.indices_by_top[traj.top][f.name]
377 DAs[name], indexes[name + "_feature_indices"] = make_dataarray(
378 f.describe(), traj, name, data, deg=deg, feat=f
379 )
380 if indexes[name + "_feature_indices"] is None:
381 del indexes[name + "_feature_indices"]
382 DAs_and_indxes = DAs | indexes
383 attrs = []
384 for i in DAs_and_indxes.values():
385 attrs.append(i.attrs)
386 try:
387 ds = xr.Dataset(DAs_and_indxes, attrs=trajs_combine_attrs(attrs))
388 except ValueError as e:
389 raise Exception(f"{DAs_and_indxes.keys()=}") from e
390 return ds
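# An illustration (hypothetical numbers) of the index bookkeeping this
# function relies on, matching the docstring's example of 20 frames with
# 6 Ramachandran angles plus one end-to-end distance: the concatenated
# (20, 7) array is split per feature before the individual DataArrays
# are built.
#
#     >>> import numpy as np
#     >>> input_data = np.random.random((20, 7))
#     >>> indices = [np.arange(0, 6), np.arange(6, 7)]
#     >>> [input_data[:, ind].shape for ind in indices]
#     [(20, 6), (20, 1)]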
393def make_frame_CV_dataarray(
394 labels: list[str],
395 traj: SingleTraj,
396 name: str,
397 data: np.ndarray,
398 deg: Union[None, bool],
399 with_time: bool = True,
400 frame_indices: Optional[np.ndarray] = None,
401 labels2: Optional[list[str]] = None,
402 feat: Optional["AnyFeature"] = None,
403) -> tuple[xr.DataArray, Union[None, xr.DataArray]]:
404 """Make a DataArray from a frame CV feature.
406 A normal feature yields multiple values per trajectory frame (e.g. the
407 backbone dihedral angles give 2 * n_residues features per frame). A frame
408 CV feature yields just a single value per frame. Examples are:
409 * The distance between a binding pocket and a ligand.
410 * The volume of the unitcell.
411 * Whether a molecule is in state 1 or 2, which could be described as a binary frame CV feature.
413 This function has additional logic. If `data` has a shape of (1, n_frames, 1), it
414 is left as is. If it has a shape of (1, n_frames), it will be expanded on the -1 axis to
415 shape (1, n_frames, 1).
417 Note:
418 Please make sure that the input data conforms to the nm, ps, rad coordinates.
420 Args:
421 labels (list[str]): The labels that specify the `CV_num` dimension. This
422 requires the expression `len(labels) == data.shape[2]` to be True. If you
423 build the DataArray from a feature, the `labels` argument will usually
424 be `feature.describe()`.
425 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
426 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
427 an xarray.DataArray always represents one feature of one trajectory.
428 A trajectory can have multiple features, which is represented as an
429 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
430 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This
431 dataset is created ad hoc, by merging the datasets of the trajectories along
432 the `traj_num` axis.
433 name (str): The name of the feature. This name will be used to group similar
434 features in the large `xarray.Dataset`s of trajectory ensembles. If you
435 construct this DataArray from a feature, you can either use `feature.name`,
436 or `feature.__class__.__name__`.
437 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with.
438 deg (Union[None, bool]): Whether the provided data is in deg or radians.
439 If None is provided, it will not be included in the attrs.
440 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of
441 the frames to the `frame_num` axis. Can be either True, or False, or a
442 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
443 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
444 This can come in handy if the trajectory is loaded out-of-memory and
445 distributed, in which case MDAnalysis is used to load data.
446 MDTraj never loads the trajectory coordinates. Thus, the attribute
447 `_original_frame_indices` in the trajectory will be empty. However, as
448 the distributed task has already assigned frames to the workers, we have
449 this number and just need to use it. If set to None, `_original_frame_indices`
450 will be used. Otherwise, the `np.ndarray` provided here will be used.
451 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
452 This can be especially useful for trajectory ensembles, which might differ
453 in a lot of places (i.e., single amino acid exchange). If the labels are
454 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
455 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
456 be done. Defaults to None.
458 Returns:
459 tuple[xr.DataArray, Union[None, xr.DataArray]]: A tuple of (data, indices), where indices may be None.
461 """
462 if data.ndim == 3:
463 pass
464 elif data.ndim == 2:
465 data = np.expand_dims(data, -1)
466 # data = _cast_to_int_maybe(data)
468 return make_dataarray(
469 labels, traj, name, data, deg, with_time, frame_indices, labels2, feat
470 )
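# A minimal sketch (made-up data) of the shape handling described in the
# docstring of `make_frame_CV_dataarray`: a (1, n_frames) array is expanded
# on the last axis to (1, n_frames, 1) before being passed on to
# `make_dataarray`.
#
#     >>> import numpy as np
#     >>> data = np.random.random((1, 100))      # one value per frame
#     >>> np.expand_dims(data, -1).shape
#     (1, 100, 1)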
473def make_position_dataarray_from_numpy(
474 atoms, traj, name, data, deg=None, with_time=True
475):
476 """Same as :func:`make_dataarray`, but with higher feature dimension."""
477 attrs = {
478 "length_units": "nm",
479 "time_units": "ps",
480 "full_path": traj.traj_file,
481 "topology_file": traj.top_file,
482 "feature_axis": "ATOM",
483 }
484 if deg is not None:
485 if deg:
486 attrs["angle_units"] = "deg"
487 else:
488 attrs["angle_units"] = "rad"
489 frame_indices = traj.id
490 if frame_indices.ndim > 1:
491 frame_indices = frame_indices[:, 1]
492 da = xr.DataArray(
493 data,
494 coords={
495 "traj_num": ("traj_num", np.asarray([traj.traj_num])),
496 "traj_name": ("traj_num", np.asarray([traj.basename])),
497 "frame_num": ("frame_num", frame_indices),
498 "ATOM": ("ATOM", np.asarray(atoms)),
499 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]),
500 },
501 dims=["traj_num", "frame_num", "ATOM", "COORDS"],
502 name=name,
503 attrs=attrs,
504 )
505 if with_time:
506 da = da.assign_coords(time=("frame_num", traj.time))
507 return da
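# Hedged sketch of the array layout `make_position_dataarray_from_numpy`
# expects: positions arrive as (1, n_frames, n_atoms, 3) and the returned
# DataArray uses the dims ("traj_num", "frame_num", "ATOM", "COORDS"). The
# `traj` object and the atom labels below are placeholders, not part of the
# library.
#
#     >>> import numpy as np
#     >>> data = np.random.random((1, 100, 50, 3))    # 100 frames, 50 atoms
#     >>> atoms = [f"ATOM {i}" for i in range(50)]
#     >>> # da = make_position_dataarray_from_numpy(atoms, traj, "positions", data)
#     >>> # da.dims -> ('traj_num', 'frame_num', 'ATOM', 'COORDS')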
510@overload
511def make_position_dataarray(
512 labels: list[str],
513 traj: SingleTraj,
514 name: str,
515 data: np.ndarray,
516 deg: Union[None, bool] = None,
517 with_time: bool = True,
518 frame_indices: Optional[np.ndarray] = None,
519 labels2: Optional[list[str]] = None,
520 feat: Optional["AnyFeature"] = None,
521) -> tuple[xr.DataArray, None]: ...
524@overload
525def make_position_dataarray(
526 labels: list[str],
527 traj: SingleTraj,
528 name: str,
529 data: np.ndarray,
530 deg: Union[None, bool] = None,
531 with_time: bool = True,
532 frame_indices: Optional[np.ndarray] = None,
533 labels2: Optional[list[str]] = None,
534 feat: Optional["AnyFeature"] = "AnyFeature",
535) -> tuple[xr.DataArray, xr.DataArray]: ...
538def make_position_dataarray(
539 labels: list[str],
540 traj: SingleTraj,
541 name: str,
542 data: np.ndarray,
543 deg: Union[None, bool] = None,
544 with_time: bool = True,
545 frame_indices: Optional[np.ndarray] = None,
546 labels2: Optional[list[str]] = None,
547 feat: Optional["AnyFeature"] = None,
548) -> tuple[xr.DataArray, Union[None, xr.DataArray]]:
549 """Creates DataArray belonging to cartesian positions.
551 Similar to `make_datarray`, but the shapes are even larger. As every atom
552 contributes 3 coordinates (x, y, z) to the data, the shape of the returned
553 DataArray is (1, no_of_frames, no_of_atoms_considered, 3).
555 Note:
556 Please make sure that the input data conforms to the nm, ps, rad coordinates.
558 Args:
559 labels (list[str]): The labels that specify the `CV_num` dimension. This
560 requires the expression `len(labels) == data.shape[2]` to be True. If you
561 build the DataArray from a feature, the `labels` argument will usually
562 be `feature.describe()`.
563 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
564 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
565 an xarray.DataArray always represents one feature of one trajectory.
566 A trajectory can have multiple features, which is represented as an
567 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
568 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This
569 dataset is created ad hoc, by merging the datasets of the trajectories along
570 the `traj_num` axis.
571 name (str): The name of the feature. This name will be used to group similar
572 features in the large `xarray.Dataset`s of trajectory ensembles. If you
573 construct this DataArray from a feature, you can either use `feature.name`,
574 or `feature.__class__.__name__`.
575 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with.
576 deg (bool): Whether the provided data is in deg or radians.
577 If None is provided, it will not be included in the attrs. Defaults to None.
578 with_time (Union[bool, np.ndarray]): Whether to add the time of
579 the frames to the `frame_num` axis. Can be either True, or False, or a
580 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
581 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
582 This can come in handy if the trajectory is loaded out-of-memory and
583 distributed, in which case MDAnalysis is used to load data.
584 MDTraj never loads the trajectory coordinates. Thus, the attribute
585 `_original_frame_indices` in the trajectory will be empty. However, as
586 the distributed task has already assigned frames to the workers, we have
587 this number and just need to use it. If set to None, `_original_frame_indices`
588 will be used. Otherwise, the `np.ndarray` provided here will be used.
589 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
590 This can be especially useful for trajectory ensembles, which might differ
591 in a lot of places (i.e., single amino acid exchange). If the labels are
592 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
593 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
594 be done. Defaults to None.
596 Returns:
597 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting dataarrays,
598 as a tuple of (data, indices) or (data, None), if indices were not
599 requested.
601 """
602 atom_axis: str = "ATOM"
603 if feat is not None:
604 if feat.__class__.__name__ == "SideChainCartesians":
605 atom_axis = "SIDEATOM"
606 if feat.__class__.__name__ == "AllCartesians":
607 atom_axis = "ALLATOM"
608 attrs = {
609 "length_units": "nm",
610 "time_units": "ps",
611 "full_path": traj.traj_file,
612 "topology_file": traj.top_file,
613 "feature_axis": atom_axis,
614 }
615 if feat is not None:
616 indices = _get_indexes_from_feat(feat, traj)
617 else:
618 indices = None
619 if deg is not None:
620 if deg:
621 attrs["angle_units"] = "deg"
622 else:
623 attrs["angle_units"] = "rad"
624 if frame_indices is None:
625 frame_indices = traj.id
626 if frame_indices.ndim > 1:
627 frame_indices = frame_indices[:, 1]
629 if labels2 is not None:
630 labels = labels2
631 else:
632 labels = [_[11:].lstrip(" ") for _ in labels[::3]]
634 coords = {
635 "traj_num": ("traj_num", np.asarray([traj.traj_num])),
636 "traj_name": ("traj_num", np.asarray([traj.basename])),
637 "frame_num": ("frame_num", frame_indices),
638 atom_axis: (atom_axis, np.asarray(labels)),
639 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]),
640 }
641 da = xr.DataArray(
642 data,
643 coords=coords,
644 dims=["traj_num", "frame_num", atom_axis, "COORDS"],
645 name=name.upper(),
646 attrs=attrs,
647 )
648 if indices is not None:
649 if len(indices) // 3 == data.shape[-2] and name.startswith("CustomFeature"):
650 warnings.warn(
651 f"Can't find good labels for the feature {name}. The "
652 f"data from the feature's transform has shape {data[0].shape}. "
653 f"The indices have indices length {len(indices)}. I will not include the "
654 f"indices of this feature in the dataset. However, the output data "
655 f"will be there."
656 )
657 indices = None
658 elif len(indices) != data.shape[-2]:
659 indices_copy = indices.copy()
660 indices = np.full((len(labels),), np.nan, float)
661 indices[: len(indices_copy)] = indices_copy
663 if indices is not None:
664 coords = {
665 "traj_num": ("traj_num", np.asarray([traj.traj_num])),
666 "traj_name": ("traj_num", np.asarray([traj.basename])),
667 atom_axis: (atom_axis, np.asarray(labels)),
668 }
669 ind_da = xr.DataArray(
670 np.expand_dims(indices, 0),
671 coords=coords,
672 dims=["traj_num", atom_axis],
673 name=name.upper(),
674 attrs=attrs | {"feature_axis": atom_axis},
675 )
676 else:
677 ind_da = None
678 else:
679 ind_da = None
680 if isinstance(with_time, bool):
681 if with_time:
682 da = da.assign_coords(time=("frame_num", traj.time))
683 else:
684 da = da.assign_coords(time=("frame_num", with_time))
685 return da, ind_da
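# A small sketch of how the per-coordinate labels are reduced to per-atom
# labels in `make_position_dataarray` (see `labels[::3]` above): every third
# entry is kept (one per atom instead of one per x/y/z component) and the
# first 11 characters are stripped. The label strings are hypothetical.
#
#     >>> labels = [
#     ...     "POSITION OF ATOM CA 1", "POSITION OF ATOM CA 1", "POSITION OF ATOM CA 1",
#     ...     "POSITION OF ATOM CB 2", "POSITION OF ATOM CB 2", "POSITION OF ATOM CB 2",
#     ... ]
#     >>> [l[11:].lstrip(" ") for l in labels[::3]]
#     ['ATOM CA 1', 'ATOM CB 2']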
688@overload
689def make_dataarray(
690 labels: list[str],
691 traj: SingleTraj,
692 name: str,
693 data: np.ndarray,
694 deg: Union[None, bool],
695 with_time: bool = True,
696 frame_indices: Optional[np.ndarray] = None,
697 labels2: Optional[list[str]] = None,
698 feat: Optional["AnyFeature"] = None,
699) -> tuple[xr.DataArray, None]: ...
702@overload
703def make_dataarray(
704 labels: list[str],
705 traj: SingleTraj,
706 name: str,
707 data: np.ndarray,
708 deg: Union[None, bool],
709 with_time: bool = True,
710 frame_indices: Optional[np.ndarray] = None,
711 labels2: Optional[list[str]] = None,
712 feat: Optional["AnyFeature"] = "AnyFeature",
713) -> tuple[xr.DataArray, xr.DataArray]: ...
716def make_dataarray(
717 labels: list[str],
718 traj: SingleTraj,
719 name: str,
720 data: np.ndarray,
721 deg: Union[None, bool],
722 with_time: bool = True,
723 frame_indices: Optional[np.ndarray] = None,
724 labels2: Optional[list[str]] = None,
725 feat: Optional["AnyFeature"] = None,
726) -> tuple[xr.DataArray, Union[None, xr.DataArray]]:
727 """Creates a DataArray belonging to a feature.
729 The shapes are a bit different from what most people might be used to. As
730 EncoderMap was meant to work with ensembles of trajectories, the data is usually
731 shaped as (traj_num, frame_num, CV_num), or even (traj_num, frame_num, atom_num, 3).
733 The `xarray.DataArray` that is returned by this function reflects this. As a
734 DataArray is attributed to a single trajectory, the first shape will always be 1.
735 So for a collective variable that describes n features, the shape of the returned
736 DataArray will be (1, n_frames, n). Combining multiple trajectories, the first number
737 can increase.
739 Note:
740 Please make sure that the input data conforms to the nm, ps, rad coordinates.
742 Args:
743 labels (list[str]): The labels that specify the `CV_num` dimension. This
744 requires the expression `len(labels) == data.shape[2]` to be True. If you
745 build the DataArray from a feature, the `labels` argument will usually
746 be `feature.describe()`.
747 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
748 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
749 an xarray.DataArray always represents one feature of one trajectory.
750 A trajectory can have multiple features, which is represented as an
751 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
752 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This
753 dataset is created ad hoc, by merging the datasets of the trajectories along
754 the `traj_num` axis.
755 name (str): The name of the feature. This name will be used to group similar
756 features in the large `xarray.Dataset`s of trajectory ensembles. If you
757 construct this DataArray from a feature, you can either use `feature.name`,
758 or `feature.__class__.__name__`.
759 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with.
760 deg (Optional[bool]): Whether provided data is in deg or radians. If None
761 is provided, the angle_units will not appear in the attributes.
762 Defaults to None.
763 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of
764 the frames to the `frame_num` axis. Can be either True, or False, or a
765 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
766 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
767 This can come in handy if the trajectory is loaded out-of-memory and
768 distributed, in which case MDAnalysis is used to load data.
769 MDTraj never loads the trajectory coordinates. Thus, the attribute
770 `_original_frame_indices` in the trajectory will be empty. However, as
771 the distributed task has already assigned frames to the workers, we have
772 this number and just need to use it. If set to None, `_original_frame_indices`
773 will be used. Otherwise, the `np.ndarray` provided here will be used.
774 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
775 This can be especially useful for trajectory ensembles, which might differ
776 in a lot of places (i.e., single amino acid exchange). If the labels are
777 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
778 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
779 be done. Defaults to None.
781 Returns:
782 tuple[xr.DataArray, Union[None, xr.DataArray]]: A tuple of (data, indices), where indices may be None.
784 """
785 if feat is not None:
786 indices = _get_indexes_from_feat(feat, traj)
787 else:
788 indices = None
789 attrs = {
790 "length_units": "nm",
791 "time_units": "ps",
792 "full_path": traj.traj_file,
793 "topology_file": traj.top_file,
794 "feature_axis": name.upper(),
795 }
796 if deg is not None:
797 if deg:
798 attrs["angle_units"] = "deg"
799 else:
800 attrs["angle_units"] = "rad"
802 if frame_indices is None:
803 frame_indices = traj.id
804 if frame_indices.ndim > 1:
805 frame_indices = frame_indices[:, 1]
807 if labels2 is not None:
808 labels = labels2
810 if callable(labels):
811 labels = [l for l in labels(traj.top)]
812 else:
813 if len(labels) > data.shape[-1]:
814 warnings.warn(
815 f"Provided labels {labels[:5]} are greater {len(labels)} than the "
816 f"data {data.shape[-1]}. I will use integers to label the feature axis."
817 )
818 labels = np.arange(data.shape[-1])
820 coords = {
821 "traj_num": ("traj_num", np.asarray([traj.traj_num])),
822 "traj_name": ("traj_num", np.asarray([traj.basename])),
823 "frame_num": ("frame_num", frame_indices),
824 name.upper(): np.asarray(labels),
825 }
827 da = xr.DataArray(
828 data,
829 coords=coords,
830 dims=["traj_num", "frame_num", name.upper()],
831 name=name,
832 attrs=attrs,
833 )
835 ind_da = xr.DataArray()
836 if indices is not None:
837 index_labels = np.asarray(labels)
838 coords = {
839 "traj_num": ("traj_num", np.asarray([traj.traj_num])),
840 "traj_name": ("traj_num", np.asarray([traj.basename])),
841 name.upper(): index_labels,
842 }
843 try:
844 indices = np.asarray(indices)
845 inhomogeneous_shape = False
846 except ValueError as e:
847 if "inhomogeneous" in str(e):
848 inhomogeneous_shape = True
849 else:
850 raise e
851 # if the shape of indices is inhomogeneous, we can't put them into a DataArray
852 if inhomogeneous_shape or any(
853 [i in feat.__class__.__name__.lower() for i in ["groupcom", "residuecom"]]
854 ):
855 warnings.warn(
856 f"The feature {name} will not produce a '{name}_feature_indices' "
857 f"xr.DataArray. Its indices contain either inhomogeneous shapes "
858 f"({indices}) or it is a center-of-mass (COM) feature. These "
859 f"features are excluded from providing indices. "
860 f"I will put a string representation of these "
861 f"indices into the DataArray's `attrs` dictionary. Thus, it "
862 f"can still be saved to disk and viewed, but it will be "
863 f"a string and not a sequence of integers."
864 )
865 da.attrs |= {f"{name}_feature_indices": str(indices)}
866 # special case: PyEMMA feature with cartesians
867 elif len(indices) == data.shape[-1] // 3 and "selection" in feat.name.lower():
868 indices = np.vstack([indices for i in range(3)]).flatten(order="F")
869 ind_da = xr.DataArray(
870 np.expand_dims(np.expand_dims(indices, 0), -1),
871 coords=coords
872 | {
873 "ATOM_NO": [0],
874 },
875 dims=["traj_num", name.upper(), "ATOM_NO"],
876 name=name.upper(),
877 attrs=attrs | {"feature_axis": name.upper()},
878 )
879 # special case: RMSD features
880 # rmsd features provide their alignment atoms
881 elif "rmsd" in feat.__class__.__name__.lower():
882 ind_da = xr.DataArray(
883 np.expand_dims(np.expand_dims(indices, 0), 0),
884 coords=coords
885 | {
886 "RMSD_ATOM_NO": indices,
887 },
888 dims=["traj_num", name.upper(), "RMSD_ATOM_NO"],
889 name=name.upper(),
890 attrs=attrs | {"feature_axis": name.upper()},
891 )
892 # special case: Align feature
893 elif "align" in feat.__class__.__name__.lower():
894 indices = np.vstack([indices for i in range(3)]).flatten(order="F")
895 ind_da = xr.DataArray(
896 np.expand_dims(np.expand_dims(indices, 0), -1),
897 coords=coords
898 | {
899 "ALIGN_ATOM_NO": [0],
900 },
901 dims=["traj_num", name.upper(), "ALIGN_ATOM_NO"],
902 name=name.upper(),
903 attrs=attrs | {"feature_axis": name.upper()},
904 )
905 elif feat.__class__.__name__ == "ResidueMinDistanceFeature":
906 # indices = np.hstack(indices)
907 # data = np.vstack([indices for i in range(3)]).flatten(order="F")
908 ind_da = xr.DataArray(
909 np.expand_dims(indices, 0),
910 coords=coords | {"RES_NO": [0, 1]},
911 dims=["traj_num", name.upper(), "RES_NO"],
912 name=name.upper(),
913 attrs=attrs | {"feature_axis": name.upper()},
914 )
915 elif len(indices) != data.shape[-1]:
916 if all(["COS" in i for i in index_labels[::2]]) and all(
917 ["SIN" in i for i in index_labels[1::2]]
918 ):
919 c = np.empty((indices.shape[0] * 2, 3), dtype=int)
920 c[0::2] = indices
921 c[1::2] = indices
922 indices = np.expand_dims(c, 0)
923 ind_da = xr.DataArray(
924 indices,
925 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])},
926 dims=["traj_num", name.upper(), "ATOM_NO"],
927 name=name.upper(),
928 attrs=attrs | {"feature_axis": name.upper()},
929 )
930 else:
931 raise Exception(
932 f"{feat=}\n\n"
933 f"{feat.name=}\n\n"
934 f"{inhomogeneous_shape=}\n\n"
935 f"{len(indices)=}\n\n"
936 f"{indices.shape=}\n\n"
937 f"{indices.ndim=}\n\n"
938 f"{data.shape=}\n\n"
939 f"{indices=}\n\n"
940 f"{data=}\n\n"
941 f"{index_labels=}\n\n"
942 f"{len(indices) == data.shape[-1] // 3=}\n\n"
943 f"{'selection' in feat.name.lower()=}"
944 )
945 # regular case indices and data are aligned
946 else:
947 indices = np.expand_dims(indices, 0)
948 if indices.ndim != data.ndim:
949 indices = np.expand_dims(indices, -1)
950 ind_da = xr.DataArray(
951 indices,
952 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])},
953 dims=["traj_num", name.upper(), "ATOM_NO"],
954 name=name.upper(),
955 attrs=attrs | {"feature_axis": name.upper()},
956 )
957 if ind_da.size > 1:
958 assert len(ind_da.coords[name.upper()]) == len(da.coords[name.upper()])
960 if isinstance(with_time, bool):
961 if with_time:
962 da = da.assign_coords(time=("frame_num", traj.time))
963 else:
964 da = da.assign_coords(time=("frame_num", with_time))
965 if ind_da.size == 1:
966 ind_da = None
967 return da, ind_da
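# Sketch (hypothetical inputs) of the regular case handled by `make_dataarray`:
# a feature with n values per frame yields a DataArray with the dims
# ("traj_num", "frame_num", NAME.upper()); when a feature is passed and its
# indices align with the data, a second DataArray with an "ATOM_NO" dimension
# is returned as well. The names below are placeholders.
#
#     >>> import numpy as np
#     >>> data = np.random.random((1, 100, 4))            # 100 frames, 4 CVs
#     >>> labels = [f"MY_CV FEATURE {i}" for i in range(4)]
#     >>> # da, ind_da = make_dataarray(labels, traj, "my_cv", data, deg=None)
#     >>> # da.dims -> ('traj_num', 'frame_num', 'MY_CV')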
970def get_indices_by_feature_dim(feat, traj, input_data_shape):
971 """Unpacks the concatenated features returned by PyEMMA.
973 Usually, if you add multiple features to a PyEMMA featurizer, they are
974 stacked along the feature axis. Let's say, you have a trajectory with 20 frames
975 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi).
976 If you then also add the end-to-end distance as a feature, the data returned by
977 PyEMMA will have the shape (20, 7). This function returns the correct indices,
978 so that iteration of `zip(Featurizer.active_features, indices)` will yield the
979 correct results.
981 Args:
982 feat (em.Featurizer): Instance of `encodermap.Featurizer`. The featurizer knows
983 how many entries every feature takes up in the feature space.
984 traj (encodermap.SingleTraj): An instance of `SingleTraj`. This is needed
985 for a single purpose: PyEMMA returns malformed data for .pdb files that
986 are loaded from the protein database
987 (i.e. `traj = em.SingleTraj('https://files.rcsb.org/view/1GHC.pdb')`).
988 This is prevented by providing the traj and doing a small check.
989 input_data_shape (tuple): The shape of the data that should conform to the slicing.
990 Also needed for some checks.
992 Returns:
993 list[np.ndarray]: List of `np.ndarray`s that correctly slice the data.
995 """
996 if len(feat.features) > 1:
997 indices = [0] + add_one_by_one([f._dim for f in feat.features])
998 # Since deprecating PyEMMA, we can remove this part
999 # if traj.extension == ".pdb" and _validate_uri(traj.traj_file):
1000 # # for internet pdb files we need to slice the data. this negates the next assert but meh.
1001 # raise Exception(
1002 # "For some reason `pyemma.coordinates.source` does not like working with internet pdb files."
1003 # )
1004 # else:
1005 if len(input_data_shape) <= 1:
1006 raise Exception(f"{feat=}, {traj=}, {input_data_shape=}")
1007 if not indices[-1] == input_data_shape[1]:
1008 raise Exception(
1009 f"The indices in the features do not match the input data shape: "
1010 f"{indices[-1]=} {input_data_shape=} "
1011 f"{sum([len(f.describe()) for f in feat.features])=}\n\n"
1012 f"{indices=}\n\n{input_data_shape=}"
1013 )
1014 indices = [np.arange(i, j) for i, j in zip(indices[:-1], indices[1:])]
1015 else:
1016 indices = [np.arange(0, feat.features[0]._dim)]
1017 return indices
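# Worked example (numbers taken from the docstring of `unpack_data_and_feature`)
# of the slicing computed above for two features with dims 6 and 1:
#
#     >>> import numpy as np
#     >>> dims = [6, 1]
#     >>> bounds = [0] + add_one_by_one(dims)      # [0, 6, 7]
#     >>> [np.arange(i, j) for i, j in zip(bounds[:-1], bounds[1:])]
#     [array([0, 1, 2, 3, 4, 5]), array([6])]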
1020def add_one_by_one(l: list[Number]) -> list[Number]:
1021 """Creates a new list from l with elements added one after another.
1023 Args:
1024 l (list): The input list.
1026 Returns:
1027 list: The output list.
1029 Example:
1030 >>> l = [0, 2, 4, 5, 7]
1031 >>> add_one_by_one(l)
1032 [0, 2, 6, 11, 18]
1034 """
1035 new_l = []
1036 cumsum = 0
1037 for elt in l:
1038 cumsum += elt
1039 new_l.append(cumsum)
1040 return new_l
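# For reference: for numeric inputs this produces the same result as a
# cumulative sum, e.g. via numpy.
#
#     >>> import numpy as np
#     >>> np.cumsum([0, 2, 4, 5, 7]).tolist()
#     [0, 2, 6, 11, 18]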