Coverage for encodermap/misc/xarray.py: 71%
156 statements
coverage.py v7.1.0, created at 2023-02-07 11:05 +0000
1# -*- coding: utf-8 -*-
2# encodermap/misc/xarray.py
3################################################################################
4# Encodermap: A python library for dimensionality reduction.
5#
6# Copyright 2019-2022 University of Konstanz and the Authors
7#
8# Authors:
9# Kevin Sawade, Tobias Lemke
10#
11# Encodermap is free software: you can redistribute it and/or modify
12# it under the terms of the GNU Lesser General Public License as
13# published by the Free Software Foundation, either version 2.1
14# of the License, or (at your option) any later version.
15# This package is distributed in the hope that it will be useful to other
16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18# See the GNU Lesser General Public License for more details.
19#
20# See <http://www.gnu.org/licenses/>.
21################################################################################
22################################################################################
23# Imports
24################################################################################
27from __future__ import annotations
29import numpy as np
31from .._optional_imports import _optional_import
32from .errors import BadError
33from .misc import FEATURE_NAMES, _validate_uri
35################################################################################
36# Optional Imports
37################################################################################
40xr = _optional_import("xarray")
43################################################################################
44# Typing
45################################################################################
48from typing import TYPE_CHECKING, Optional
50if TYPE_CHECKING:
51 import xarray as xr
53 from ..loading.featurizer import Featurizer
54 from ..trajinfo import SingleTraj
57################################################################################
58# Globals
59################################################################################
62__all__ = [
63 "construct_xarray_from_numpy",
64 "unpack_data_and_feature",
65]
68################################################################################
69# Functions
70################################################################################
73def construct_xarray_from_numpy(
74 traj: SingleTraj,
75 data: np.ndarray,
76 name: str,
77 labels: Optional[list[str]] = None,
78 check_n_frames: bool = False,
79) -> xr.DataArray:
80 """Constructs an xarray dataarray from a numpy array.
82 Three different cases are recognized:
83 * The input array in data has ndim == 2. This kind of feature/CV is a per-frame feature, like the membership
84 to clusters. Every frame of every trajectory is assigned a single value (most often int values).
85 * The input array in data has ndim == 3: This is also a per-frame feature/CV, but this time every frame
86 is characterized by a series of values. These values can be dihedral angles in the backbone starting from
87 the protein's N-terminus to the C-terminus, or pairwise distance features between certain atoms.
88 The xarray DataArray constructed from this kind of data will have a label dimension that will either
89 contain generic labels like 'CUSTOM_FEATURE FEATURE 0' or labels defined by the featurizer such as
90 'SIDECHAIN ANGLE CHI1 OF RESIDUE 1LYS'.
91 * The input array in data has ndim == 4. Here, the same feature/CV is duplicated for the protein's atoms.
92 Besides the XYZ coordinates of the atoms no other CVs should fall into this case. The labels will be
93 2-dimensional with 'POSITION OF ATOM H1 IN RESIDUE 1LYS' in dimension 0 and either 'X', 'Y' or 'Z' in
94 dimension 1.
96 Args:
97 traj (em.SingleTraj): The trajectory we want to create the xarray dataarray for.
98 data (np.ndarray): The numpy array we want to create the DataArray from. Note that the data passed into this
99 function should be expanded with np.expand_dims(a, axis=0), so as to add a new axis to the complete data
100 containing the trajectories of a trajectory ensemble.
101 name (str): The name of the feature. This can be chosen freely. Names like 'central_angles', 'backbone_torsions'
102 would make the most sense.
103 labels (Optional[list]): If you have specific labels for your CVs in mind, you can overwrite the
104 generic 'CUSTOM_FEATURE FEATURE 0' labels by providing a list for this argument. If None is provided,
105 generic names will be given to the features. Defaults to None.
106 check_n_frames (bool): Whether to check if the number of frames in the trajectory matches the length
107 of the data in at least one dimension. Defaults to False.
109 Returns:
110 xarray.DataArray: An `xarray.DataArray`.
112 Examples:
113 >>> import encodermap as em
114 >>> from encodermap.misc.xarray import construct_xarray_from_numpy
115 >>> # load file from RCSB and give it traj num to represent it in a potential trajectory ensemble
116 >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)
117 >>> # single trajectory needs to be expanded into 'trajectory' axis
118 >>> z_coordinate = np.expand_dims(traj.xyz[:,:,0], 0)
119 >>> da = construct_xarray_from_numpy(traj, z_coordinate, 'z_coordinate')
120 >>> print(da.coords['Z_COORDINATE'].values[:2])
121 ['Z_COORDINATE FEATURE 0' 'Z_COORDINATE FEATURE 1']
122 >>> print(da.coords['traj_num'].values)
123 [1]
124 >>> print(da.attrs['time_units'])
125 ps
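    The ndim == 4 case can be sketched in the same way with the atom positions
    themselves (a continuation of the example above, using only attributes
    already shown there):
    >>> positions = np.expand_dims(traj.xyz, 0)
    >>> da = construct_xarray_from_numpy(traj, positions, 'position')
    >>> print(da.dims)
    ('traj_num', 'frame_num', 'ATOM', 'COORDS')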
127 """
128 if check_n_frames:
129 assert any(s == traj.n_frames for s in data.shape), (
130 f"Data and traj misaligned. traj_frames: {traj.n_frames}, data.shape: {data.shape}, attr_name: {name}, {data}"
131 )
132 if traj.backend == "no_load":
133 with_time = False
134 else:
135 with_time = True
136 if data.ndim == 2:
137 if labels is None: 137 ↛ 139
138 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])]
139 da = make_frame_CV_dataarray(labels, traj, name, data, with_time=with_time)
140 elif data.ndim == 3:
141 if labels is None: 141 ↛ 143
142 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])]
143 da = make_dataarray(labels, traj, name, data, with_time=with_time)
144 elif data.ndim == 4: 144 ↛ 151
145 if labels is None: 145 ↛ 147
146 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-2])]
147 da = make_position_dataarray_from_numpy(
148 labels, traj, name, data, with_time=with_time
149 )
150 else:
151 raise Exception(
152 f"The provided data has a dimensionality of {data.ndim}, but only 2, 3 and 4 are supported."
153 )
154 return da
157def unpack_data_and_feature(
158 feat: Featurizer,
159 traj: SingleTraj,
160 input_data: np.ndarray,
161 put_indices_into_attrs: bool = True,
162) -> xr.Dataset:
163 """Makes a `xarray.Dataset` from data and a featurizer.
165 Usually, if you add multiple features to a PyEMMA featurizer, they are
166 stacked along the feature axis. Let's say you have a trajectory with 20 frames
167 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi).
168 If you then also add the end-to-end distance as a feature, the data returned by
169 PyEMMA will have the shape (20, 7). This function splits such stacked data
170 by feature again (using `get_indices_by_feature_dim`) and returns an
171 `xarray.Dataset` with one DataArray per feature.
173 Args:
174 feat (em.Featurizer): An instance of the currently used `encodermap.Featurizer`.
175 traj (em.SingleTraj): An instance of `encodermap.SingleTraj` from which the
176 data in `input_data` was computed.
177 input_data (np.ndarray): The data, as returned by PyEMMA.
178 put_indices_into_attrs (bool): Whether to put the indices into the attrs.
179 This needs to be False when ensembles are loaded, because the features
180 of the ensemble load function do not match the real indices that should be there.
182 Returns:
183 xarray.Dataset: An `xarray.Dataset` with all features in a nice format.
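    Example:
        A rough sketch of the intended call. How `feat` and `input_data` are
        created is omitted here, because it depends on the `encodermap.Featurizer`
        and PyEMMA APIs; the three objects are assumed to already exist and to
        belong together:

        >>> ds = unpack_data_and_feature(feat, traj, input_data)  # doctest: +SKIP
        >>> list(ds.data_vars)  # doctest: +SKIP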
185 """
186 # this needs to be done, because pyemma concatenates the data
187 # along the feature axis
188 indices = get_indices_by_feature_dim(feat, traj, input_data.shape)
190 DAs = {}
191 indexes = {}
192 for f, ind in zip(feat.features, indices):
193 data = input_data[:, ind]
194 try:
195 name = FEATURE_NAMES[f.name]
196 except (KeyError, AttributeError):
197 name = f.__class__.__name__
198 f.name = name
199 assert data.shape[1] == f._dim
200 if data.shape[0] != len(traj): 200 ↛ 201
201 if traj.index is None:
202 raise Exception(
203 "Shape of provided data does not fit traj. Traj "
204 f"has {traj.n_frames=} {len(traj)=}. Data has shape {data.shape}"
205 )
206 else:
207 data = data[traj.index]
208 assert data.shape[0] == len(traj)
209 if name in DAs: 209 ↛ 210
210 name = (
211 name
212 + f"_{len(list(filter(lambda x: True if name in x else False, list(DAs.keys()))))}"
213 )
214 if f.name in ["AllCartesians", "CentralCartesians", "SideChainCartesians"]:
215 data = data.reshape(len(traj), -1, 3)
216 data = np.expand_dims(data, axis=0)
217 DAs[name] = make_position_dataarray(f.describe(), traj, name, data)
218 else:
219 data = np.expand_dims(data, axis=0)
220 DAs[name] = make_dataarray(f.describe(), traj, name, data)
222 # the indices/indexes used to create this dataarray.
223 # This can be useful later, e.g. to redo some analysis
224 # or to use tensorflow for the geometric computations of these values.
225 # some PyEMMA features give the indexes different names, e.g. 'group_definitions'
226 try:
227 indexes[name] = f.indexes.tolist()
228 except AttributeError as e:
229 for key in f.__dir__():
230 if "inde" in key:
231 indexes[name] = f.__dict__[key]
232 if (
233 f.__class__.__name__ == "GroupCOMFeature"
234 or f.__class__.__name__ == "ResidueCOMFeature"
235 ):
236 indexes[name] = f.group_definitions
237 if f.__class__.__name__ == "MinRmsdFeature":
238 indexes[name] = f.atom_indices
239 if name not in indexes: 239 ↛ 240
240 raise e
241 ds = xr.Dataset(DAs)
242 if put_indices_into_attrs: 242 ↛ 244
243 ds = ds.assign_attrs(indexes)
244 return ds
247def construct_xarray_from_feat(feat, input_data=None):
248 raise Exception(
249 "Marked for deprecation, because featurizer returns its own dataset."
250 )
253def make_frame_CV_dataarray(
254 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None
255):
256 """Make a dataarray from a frame CV feature.
258 A normal feature yields multiple values per trajectory frame (e.g. the
259 backbone dihedral angles give 2 * n_residues features per frame). A frame
260 CV feature yields just a single value per frame. Examples are:
261 * The distance between a binding pocket and a ligand.
262 * The volume of the unitcell.
263 * Whether a molecule is in state 1 or 2, which can be described as a binary frame CV feature.
265 This method has additional logic. If `data` has a shape of (1, n_frames, 1) it
266 is left as is. If it has a shape of (1, n_frames), it will be expanded on the -1 axis to
267 shape (1, n_frames, 1).
269 Note:
270 Please make sure that the input data uses the units nm, ps and rad.
272 Args:
273 labels (list[str]): The labels that specify the `CV_num` dimension. This
274 requires the expression `len(labels) == data.shape[2]` to be True. If you
275 build the dataarray from a feature, the `labels` argument usually will
276 be `feature.describe()`.
277 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
278 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
279 an `xarray.DataArray` always represents one feature of one trajectory.
280 A trajectory can have multiple features, which is represented as an
281 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
282 A `TrajEnsemble` does not even have its own `xarray.Dataset`; this
283 dataset is created ad hoc by merging the datasets of its trajectories along
284 the `traj_num` axis.
285 name (str): The name of the feature. This name will be used to group similar
286 features in the large `xarray.Dataset`s of trajectory ensembles. If you
287 construct this dataarray from a feature, you can either use `feature.name`,
288 or `feature.__class__.__name__`.
289 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with.
290 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of
291 the frames to the `frame_num` axis. Can be either True, or False, or a
292 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
293 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
294 This can come in handy if the trajectory is loaded out-of-memory and
295 distributed, in which case MDAnalysis is used to load the data and
296 MDTraj never loads the trajectory coordinates. Thus, the attribute
297 `_original_frame_indices` in the trajectory will be empty. However, as
298 the distributed task has already assigned frames to the workers, we have
299 this number and just need to use it. If set to None, `_original_frame_indices`
300 will be used. Otherwise, the `np.ndarray` provided here will be used.
301 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
302 This can be especially useful for trajectory ensembles, which might differ
303 in a lot of places (e.g. a single amino acid exchange). If the labels are
304 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
305 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
306 be done. Defaults to None.
308 Returns:
309 xarray.DataArray: The resulting dataarray.
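    Example:
        A sketch, reusing the remote pdb file from the `construct_xarray_from_numpy`
        example (network access is required, so the lines are skipped in doctests):

        >>> import encodermap as em  # doctest: +SKIP
        >>> import numpy as np  # doctest: +SKIP
        >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)  # doctest: +SKIP
        >>> # one value per frame: shape (1, n_frames), expanded by this function to (1, n_frames, 1)
        >>> data = np.expand_dims(traj.xyz[:, 0, 2], 0)  # doctest: +SKIP
        >>> da = make_frame_CV_dataarray(['Z COORDINATE OF ATOM 0'], traj, 'frame_cv', data)  # doctest: +SKIP
        >>> # da now has dims ('traj_num', 'frame_num', 'FRAME_CV') with a single label in 'FRAME_CV'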
311 """
312 if data.ndim == 3: 312 ↛ 313
313 pass
314 elif data.ndim == 2: 314 ↛ 316
315 data = np.expand_dims(data, -1)
316 return make_dataarray(
317 labels,
318 traj,
319 name,
320 data,
321 with_time,
322 frame_indices,
323 labels2,
324 )
327def make_position_dataarray_from_numpy(atoms, traj, name, data, with_time=True):
 """Same as `make_position_dataarray`, but the `atoms` labels are used as-is instead of being parsed from `feature.describe()`."""
328 frame_indices = traj.id
329 if frame_indices.ndim > 1:
330 frame_indices = frame_indices[:, 1]
331 da = xr.DataArray(
332 data,
333 coords={
334 "traj_num": ("traj_num", [traj.traj_num]),
335 "traj_name": ("traj_num", [traj.basename]),
336 "frame_num": ("frame_num", frame_indices),
337 "ATOM": ("ATOM", atoms),
338 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"],
339 },
340 dims=["traj_num", "frame_num", "ATOM", "COORDS"],
341 name=name,
342 attrs={
343 "length_units": "nm",
344 "time_units": "ps",
345 "angle_units": "rad",
346 "full_path": traj.traj_file,
347 "topology_file": traj.top_file,
348 "feature_axis": "ATOM",
349 },
350 )
351 if with_time:
352 da = da.assign_coords(time=("frame_num", traj.time))
353 return da
356def make_position_dataarray(
357 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None
358):
359 """Creates dataarray belonging to cartesian positions.
361 Similar to `make_dataarray`, but the shapes are even larger. As every atom
362 contributes 3 coordinates (x, y, z) to the data, the shape of the returned
363 dataarray is (1, no_of_frames, no_of_atoms_considered, 3).
365 Note:
366 Please make sure that the input data uses the units nm, ps and rad.
368 Args:
369 labels (list[str]): The labels that specify the `CV_num` dimension. This
370 requires the expression `len(labels) == data.shape[2]` to be True. If you
371 build the dataarray from a feature, the `labels` argument usually will
372 be `feature.describe()`.
373 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
374 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
375 an `xarray.DataArray` always represents one feature of one trajectory.
376 A trajectory can have multiple features, which is represented as an
377 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
378 A `TrajEnsemble` does not even have its own `xarray.Dataset`; this
379 dataset is created ad hoc by merging the datasets of its trajectories along
380 the `traj_num` axis.
381 name (str): The name of the feature. This name will be used to group similar
382 features in the large `xarray.Dataset`s of trajectory ensembles. If you
383 construct this dataarray from a feature, you can either use `feature.name`,
384 or `feature.__class__.__name__`.
385 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with.
386 with_time (Union[bool, np.ndarray]): Whether to add the time of
387 the frames to the `frame_num` axis. Can be either True, or False, or a
388 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
389 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
390 This can come in handy if the trajectory is loaded out-of-memory and
391 distributed, in which case MDAnalysis is used to load the data and
392 MDTraj never loads the trajectory coordinates. Thus, the attribute
393 `_original_frame_indices` in the trajectory will be empty. However, as
394 the distributed task has already assigned frames to the workers, we have
395 this number and just need to use it. If set to None, `_original_frame_indices`
396 will be used. Otherwise, the `np.ndarray` provided here will be used.
397 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
398 This can be especially useful for trajectory ensembles, which might differ
399 in a lot of places (e.g. a single amino acid exchange). If the labels are
400 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
401 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
402 be done. Defaults to None.
404 Returns:
405 xarray.DataArray: The resulting dataarray.
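    Example:
        A sketch with made-up labels. The real labels would come from
        `feature.describe()` of a Cartesian-positions feature; only their layout
        (three entries per atom, sharing an 11-character prefix) matters here.
        Network access is required, so the lines are skipped in doctests:

        >>> import encodermap as em  # doctest: +SKIP
        >>> import numpy as np  # doctest: +SKIP
        >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)  # doctest: +SKIP
        >>> n_atoms = traj.xyz.shape[1]  # doctest: +SKIP
        >>> labels = [f"POSITION OF ATOM {i} {c}" for i in range(n_atoms) for c in "XYZ"]  # doctest: +SKIP
        >>> data = np.expand_dims(traj.xyz, 0)  # shape (1, n_frames, n_atoms, 3)  # doctest: +SKIP
        >>> da = make_position_dataarray(labels, traj, 'positions', data)  # doctest: +SKIP
        >>> # da has dims ('traj_num', 'frame_num', 'ATOM', 'COORDS')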
407 """
408 if frame_indices is None: 408 ↛ 418
409 frame_indices = traj.id
410 if frame_indices.ndim > 1:
411 frame_indices = frame_indices[:, 1]
412 # frame_indices = traj._original_frame_indices
413 # if not np.any(frame_indices):
414 # frame_indices = traj.id
415 # if frame_indices.ndim > 1:
416 # frame_indices = frame_indices[:, 1]
418 if labels2 is not None: 418 ↛ 419
419 labels = labels2
420 else:
421 labels = [_[11:].lstrip(" ") for _ in labels[::3]]
423 da = xr.DataArray(
424 data,
425 coords={
426 "traj_num": ("traj_num", [traj.traj_num]),
427 "traj_name": ("traj_num", [traj.basename]),
428 "frame_num": ("frame_num", frame_indices),
429 "ATOM": ("ATOM", labels),
430 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"],
431 },
432 dims=["traj_num", "frame_num", "ATOM", "COORDS"],
433 name=name.upper(),
434 attrs={
435 "length_units": "nm",
436 "time_units": "ps",
437 "angle_units": "rad",
438 "full_path": traj.traj_file,
439 "topology_file": traj.top_file,
440 "feature_axis": "ATOM",
441 },
442 )
443 if isinstance(with_time, bool): 443 ↛ 447
444 if with_time: 444 ↛ 448
445 da = da.assign_coords(time=("frame_num", traj.time))
446 else:
447 da = da.assign_coords(time=("frame_num", with_time))
448 return da
451def make_dataarray(
452 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None
453):
454 """Creates a dataarray belonging to a feature.
456 The shapes are a bit different from what most people might be used to. As
457 EncoderMap was meant to work with ensembles of trajectories, the data is usually
458 shaped as (traj_num, frame_num, CV_num), or even (traj_num, frame_num, atom_num, 3).
460 The `xarray.DataArray` that is returned by this function reflects this. As a
461 dataarray is attributed to a single trajectory, the first shape will always be 1.
462 So for a collective variable that describes n features, the shape of the returned
463 dataarray will be (1, n_frames, n). Combining multiple trajectories, the first number
464 can increase.
466 Note:
467 Please make sure that the input data uses the units nm, ps and rad.
469 Args:
470 labels (list[str]): The labels that specify the `CV_num` dimension. This
471 requires the expression `len(labels) == data.shape[2]` to be True. If you
472 build the dataarray from a feature, the `labels` argument usually will
473 be `feature.describe()`.
474 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory.
475 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap,
476 an `xarray.DataArray` always represents one feature of one trajectory.
477 A trajectory can have multiple features, which is represented as an
478 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used.
479 A `TrajEnsemble` does not even have its own `xarray.Dataset`; this
480 dataset is created ad hoc by merging the datasets of its trajectories along
481 the `traj_num` axis.
482 name (str): The name of the feature. This name will be used to group similar
483 features in the large `xarray.Dataset`s of trajectory ensembles. If you
484 construct this dataarray from a feature, you can either use `feature.name`,
485 or `feature.__class__.__name__`.
486 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with.
487 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of
488 the frames to the `frame_num` axis. Can be either True, or False, or a
489 `np.ndarray`, in which case the data in this array will be used. Defaults to True.
490 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk.
491 This can come in handy if the trajectory is loaded out-of-memory and
492 distributed, in which case MDAnalysis is used to load the data and
493 MDTraj never loads the trajectory coordinates. Thus, the attribute
494 `_original_frame_indices` in the trajectory will be empty. However, as
495 the distributed task has already assigned frames to the workers, we have
496 this number and just need to use it. If set to None, `_original_frame_indices`
497 will be used. Otherwise, the `np.ndarray` provided here will be used.
498 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`.
499 This can be especially useful for trajectory ensembles, which might differ
500 in a lot of places (e.g. a single amino acid exchange). If the labels are
501 too strict (LYS1 ATOM1 C), the data of two slightly different proteins
502 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can
503 be done. Defaults to None.
505 Returns:
506 xarray.DataArray: The resulting dataarray.
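    Example:
        A sketch, analogous to the `construct_xarray_from_numpy` example
        (network access is required, so the lines are skipped in doctests):

        >>> import encodermap as em  # doctest: +SKIP
        >>> import numpy as np  # doctest: +SKIP
        >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)  # doctest: +SKIP
        >>> data = np.expand_dims(traj.xyz[:, :, 2], 0)  # shape (1, n_frames, n_atoms)  # doctest: +SKIP
        >>> labels = [f"Z COORDINATE OF ATOM {i}" for i in range(data.shape[-1])]  # doctest: +SKIP
        >>> da = make_dataarray(labels, traj, 'z_coordinate', data)  # doctest: +SKIP
        >>> # da has dims ('traj_num', 'frame_num', 'Z_COORDINATE') and shape (1, n_frames, n_atoms)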
508 """
509 if frame_indices is None: 509 ↛ 519
510 frame_indices = traj.id
511 if frame_indices.ndim > 1:
512 frame_indices = frame_indices[:, 1]
513 # frame_indices = traj._original_frame_indices
514 # if not np.any(frame_indices):
515 # frame_indices = traj.id
516 # if frame_indices.ndim > 1:
517 # frame_indices = frame_indices[:, 1]
519 if labels2 is not None: 519 ↛ 520
520 labels = labels2
522 if callable(labels): 522 ↛ 523
523 labels = [l for l in labels(traj.top)]
524 else:
525 if len(labels) > data.shape[-1]:
526 labels = np.arange(data.shape[-1])
528 da = xr.DataArray(
529 data,
530 coords={
531 "traj_num": ("traj_num", [traj.traj_num]),
532 "traj_name": ("traj_num", [traj.basename]),
533 "frame_num": ("frame_num", frame_indices),
534 name.upper(): labels,
535 },
536 dims=["traj_num", "frame_num", name.upper()],
537 name=name,
538 attrs={
539 "length_units": "nm",
540 "time_units": "ps",
541 "angle_units": "rad",
542 "full_path": traj.traj_file,
543 "topology_file": traj.top_file,
544 "feature_axis": name.upper(),
545 },
546 )
547 if isinstance(with_time, bool): 547 ↛ 551
548 if with_time:
549 da = da.assign_coords(time=("frame_num", traj.time))
550 else:
551 da = da.assign_coords(time=("frame_num", with_time))
552 return da
555def make_ensemble_xarray(name, data):
 """Makes a DataArray for ensemble data, using generic frame numbers and labels instead of per-trajectory information."""
556 attrs = {"length_units": "nm", "time_units": "ps", "angle_units": "rad"}
557 if data.ndim == 2:
558 coords = {
559 "traj_num": ("traj_num", range(data.shape[0])),
560 "frame_num": ("frame_num", range(data.shape[1])),
561 name.upper(): (
562 "frame_num",
563 [f"{name.upper()} Frame Feature {i}" for i in range(data.shape[1])],
564 ),
565 }
566 dims = ["traj_num", "frame_num"]
567 elif data.ndim == 3:
568 coords = {
569 "traj_num": ("traj_num", range(data.shape[0])),
570 "frame_num": ("frame_num", range(data.shape[1])),
571 name.upper(): [f"{name.upper()} Feature {i}" for i in range(data.shape[2])],
572 }
573 dims = ["traj_num", "frame_num", name.upper()]
574 elif data.ndim == 4:
575 coords = {
576 "traj_num": ("traj_num", range(data.shape[0])),
577 "frame_num": ("frame_num", range(data.shape[1])),
578 "ATOM": ("ATOM", [f"ATOM {i}" for i in range(data.shape[2])]),
579 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"],
580 }
581 dims = ["traj_num", "frame_num", "ATOM", "COORDS"]
582 else:
583 raise Exception("Too high dimensional data for ensemble.")
584 da = xr.DataArray(data, coords=coords, dims=dims, name=name, attrs=attrs)
585 return da
588def get_indices_by_feature_dim(feat, traj, input_data_shape):
589 """Unpacks the concatenated features returned by PyEMMA.
591 Usually, if you add multiple features to a PyEMMA featurizer, they are
592 stacked along the feature axis. Let's say you have a trajectory with 20 frames
593 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi).
594 If you then also add the end-to-end distance as a feature, the data returned by
595 PyEMMA will have the shape (20, 7). This function returns the correct indices,
596 so that iteration of `zip(Featurizer.active_features, indices)` will yield the
597 correct results.
599 Args:
600 feat (em.Featurizer): Instance of `encodermap.Featurizer`. The featurizer knows
601 how many entries every feature takes up in the feature space.
602 traj (encodermap.SingleTraj): An instance of `SingleTraj`. This is needed
603 for a single purpose: PyEMMA returns malformed data for .pdb files that
604 are loaded from the Protein Data Bank
605 (e.g. `traj = em.SingleTraj('https://files.rcsb.org/view/1GHC.pdb')`).
606 This is prevented by providing the traj and doing a small check.
607 input_data_shape (tuple): The shape of the input data, which should conform to the slicing.
608 Also needed for some checks.
610 Returns:
611 list[np.ndarray]: List of `np.ndarray`s that correctly slice the data.
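    Example:
        A minimal, self-contained sketch of the slicing logic. The feature sizes
        6 and 1 are made up and mirror the Ramachandran + distance example above:

        >>> import numpy as np
        >>> data = np.zeros((20, 7))  # 6 Ramachandran angles + 1 end-to-end distance
        >>> edges = [0] + add_one_by_one([6, 1])
        >>> edges
        [0, 6, 7]
        >>> indices = [np.arange(i, j) for i, j in zip(edges[:-1], edges[1:])]
        >>> [data[:, ind].shape for ind in indices]
        [(20, 6), (20, 1)]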
613 """
614 if len(feat.features) > 1:
615 indices = [0] + add_one_by_one([f._dim for f in feat.features])
616 if traj.extension == ".pdb" and _validate_uri(traj.traj_file): 616 ↛ 618
617 # for internet pdb files we need to slice the data. this negates the next assert but meh.
618 raise BadError(
619 "For some reason pyemma.coordinates.source does not like working with internet pdb files."
620 )
621 else:
622 if not indices[-1] == input_data_shape[1]: 622 ↛ 623
623 for f in feat.features:
624 print(f.__class__)
625 print(f.describe())
626 print(len(f.describe()))
627 print(f._dim)
628 print(traj, indices, input_data_shape)
629 raise Exception
630 indices = [np.arange(i, j) for i, j in zip(indices[:-1], indices[1:])]
631 else:
632 indices = [np.arange(0, feat.features[0]._dim)]
633 return indices
636def add_one_by_one(l):
637 """Creates a new list from l with elements added one after another.
639 Args:
640 l (list): The input list.
642 Returns:
643 list: The output list.
645 Example:
646 >>> l = [0, 2, 4, 5, 7]
647 >>> add_one_by_one(l)
648 [0, 2, 6, 11, 18]
650 """
651 new_l = []
652 cumsum = 0
653 for elt in l:
654 cumsum += elt
655 new_l.append(cumsum)
656 return new_l