Coverage for encodermap/misc/xarray.py: 7%

266 statements  

coverage.py v7.4.1, created at 2024-12-31 16:54 +0100

1# -*- coding: utf-8 -*- 

2# encodermap/misc/xarray.py 

3################################################################################ 

4# EncoderMap: A python library for dimensionality reduction. 

5# 

6# Copyright 2019-2024 University of Konstanz and the Authors 

7# 

8# Authors: 

9# Kevin Sawade, Tobias Lemke 

10# 

11# Encodermap is free software: you can redistribute it and/or modify 

12# it under the terms of the GNU Lesser General Public License as 

13# published by the Free Software Foundation, either version 2.1 

14# of the License, or (at your option) any later version. 

15# This package is distributed in the hope that it will be useful to other 

16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the 

17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 

18# See the GNU Lesser General Public License for more details. 

19# 

20# See <http://www.gnu.org/licenses/>. 

21################################################################################ 

22"""EncoderMap's xarray manipulation functions. 

23 

24EncoderMap uses xarray datasets to save CV data alongside trajectory data. 

25These functions implement creation of such xarray datasets. 

26 

27""" 

28################################################################################ 

29# Imports 

30################################################################################ 

31 

32 

33# Future Imports at the top 

34from __future__ import annotations 

35 

36# Standard Library Imports 

37import itertools 

38import warnings 

39 

40# Third Party Imports 

41import numpy as np 

42from optional_imports import _optional_import 

43 

44# Encodermap imports 

45from encodermap.misc.misc import FEATURE_NAMES 

46 

47 

48################################################################################ 

49# Optional Imports 

50################################################################################ 

51 

52 

53xr = _optional_import("xarray") 

54 

55 

56################################################################################ 

57# Typing 

58################################################################################ 

59 

60 

61# Standard Library Imports 

62from numbers import Number 

63from typing import TYPE_CHECKING, Optional, Union, overload 

64 

65 

66if TYPE_CHECKING: 

67 # Third Party Imports 

68 import xarray as xr 

69 

70 # Encodermap imports 

71 from encodermap.loading.features import AnyFeature 

72 from encodermap.loading.featurizer import Featurizer 

73 from encodermap.trajinfo.info_single import SingleTraj 

74 

75 

76################################################################################ 

77# Globals 

78################################################################################ 

79 

80 

81__all__: list[str] = [ 

82 "construct_xarray_from_numpy", 

83 "unpack_data_and_feature", 

84] 

85 

86 

87################################################################################ 

88# Functions 

89################################################################################ 

90 

91 

92def _get_indexes_from_feat(f: "AnyFeature", traj: "SingleTraj") -> np.ndarray: 

93 """Returns the indices of this feature. 

94 

95 This can be useful later on to redo some analysis or to use tensorflow 

96 to implement the geometric operations in the compute graph. 

97 

98 Note: 

99 Due to some PyEMMA legacy code, the indices are sometimes called indexes. 

100 Furthermore, some PyEMMA features give the indices different names, like 

101 'group_definitions'. Here, indices are integer values that align with 

102 the atomic coordinates of a trajectory or ensemble of trajectories with 

103 shared topology. 

104 

105 Args: 

106 f (AnyFeature): The feature to extract the indices from. This has to 

107 be a subclass of :class:`encodermap.loading.features.Feature`. 

108 traj (SingleTraj): This argument has to be provided for the RMSD 

109 features. We make it mandatory for the other feature types also 

110 to keep the input consistent. 

111 

112 Returns: 

113 np.ndarray: An integer array describing the atoms this feature uses 

114 to compute the collective variables. 

115 

116 """ 

117 # Local Folder Imports 

118 from ..loading.features import ( 

119 CustomFeature, 

120 GroupCOMFeature, 

121 MinRmsdFeature, 

122 ResidueCOMFeature, 

123 ) 

124 

125 try: 

126 return f.indexes 

127 except AttributeError as e: 

128 for key in f.__dir__(): 

129 if "inde" in key: 

130 return np.asarray(f.__dict__[key]) 

131 if isinstance(f, (GroupCOMFeature, ResidueCOMFeature)): 

132 return f.group_definitions 

133 if isinstance(f, MinRmsdFeature): 

134 if f.atom_indices is None: 

135 return np.arange(traj.n_atoms) 

136 else: 

137 return f.atom_indices 

138 if isinstance(f, CustomFeature): 

139 descr = f.describe() 

140 if any(["CustomFeature" in i for i in descr]): 

141 return np.asarray( 

142 [f"{d.split()[0]} INDEX {i}" for i, d in enumerate(descr)] 

143 ) 

144 else: 

145 raise Exception( 

146 f"Can't decide on indexes for this custom feature:, " 

147 f"{f.__class__.__name__=} {f.describe()[:2]=}" 

148 ) from e 

149 else: 

150 raise e 

151 

152 

153def _cast_to_int_maybe(a: np.ndarray) -> np.ndarray: 

154 """Casts a np.array to int, if possible. 

155 

156 Args: 

157 a (np.ndarray): The array. 

158 

159 Returns: 

160 np.ndarray: The output, which can be int. 
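
Example (a small illustration of the check implemented below; the output
dtypes assume a 64-bit float input array):

>>> import numpy as np
>>> _cast_to_int_maybe(np.array([1.0, 2.0])).dtype
dtype('int32')
>>> _cast_to_int_maybe(np.array([1.5, 2.0])).dtype
dtype('float64')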

161 """ 

162 if not np.any(np.mod(a, 1)): 

163 return a.astype(np.int32) 

164 return a 

165 

166 

167def construct_xarray_from_numpy( 

168 traj: SingleTraj, 

169 data: np.ndarray, 

170 name: str, 

171 deg: bool = False, 

172 labels: Optional[list[str]] = None, 

173 check_n_frames: bool = False, 

174) -> xr.DataArray: 

175 """Constructs a `xarray.DataArray` from a numpy array. 

176 

177 Three cases are recognized: 

178 * The input array in data has ndim == 2. This kind of feature/CV is a 

179 per-frame feature, like the membership to clusters. Every frame of 

180 every trajectory is assigned a single value (most often int values). 

181 * The input array in data has ndim == 3: This is also a per-frame 

182 feature/CV, but this time every frame is characterized by a series 

183 of values. These values can be dihedral angles in the backbone 

184 starting from the protein's N-terminus to the C-terminus, or 

185 pairwise distance features between certain atoms. The xarray 

186 DataArray constructed from this kind of data will have a label 

187 dimension that will either contain generic labels like 

188 'CUSTOM_FEATURE FEATURE 0' or labels defined by the featurizer, 

189 such as 'SIDECHAIN ANGLE CHI1 OF RESIDUE 1LYS'. 

190 * The input array in data has ndim == 4. Here, the same feature/CV is 

191 duplicated for the protein's atoms. Besides the XYZ coordinates of 

192 the atoms, no other CVs should fall into this case. The labels will be 

193 2-dimensional with 'POSITION OF ATOM H1 IN RESIDUE 1LYS' in 

194 dimension 0 and either 'X', 'Y' or 'Z' in dimension 1. 

195 

196 Args: 

197 traj (em.SingleTraj): The trajectory we want to create the 

198 `xarray.DataArray` for. 

199 data (np.ndarray): The numpy array we want to create the data from. 

200 Note that the data passed into this function should be expanded 

201 by `np.expand_dims(a, axis=0)`, so as to add a new axis to the complete 

202 data containing the trajectories of a trajectory ensemble. 

203 name (str): The name of the feature. This can be chosen freely. Names 

204 like 'central_angles', 'backbone_torsions' would make the most sense. 

205 deg (bool): Whether the provided data is in degrees or radians. 

206 labels (Optional[list]): If you have specific labels for your CVs in 

207 mind, you can overwrite the generic 'CUSTOM_FEATURE FEATURE 0' 

208 labels by providing a list for this argument. If None is provided, 

209 generic names will be given to the features. Defaults to None. 

210 check_n_frames (bool): Whether to check that the number of frames in 

211 the trajectory matches the length of the data in at least one 

212 dimension. Defaults to False. 

213 

214 Returns: 

215 xarray.DataArray: An `xarray.DataArray`. 

216 

217 Examples: 

218 >>> import encodermap as em 

219 >>> import numpy as np 

220 >>> from encodermap.misc.xarray import construct_xarray_from_numpy 

221 >>> # load file from RCSB and give it traj num to represent it in a 

222 >>> # potential trajectory ensemble 

223 >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1) 

224 >>> # single trajectory needs to be expanded into 'trajectory' axis 

225 >>> z_coordinate = np.expand_dims(traj.xyz[:,:,0], 0) 

226 >>> da = construct_xarray_from_numpy(traj, z_coordinate, 'z_coordinate') 

227 >>> print(da.coords['Z_COORDINATE'].values[:2]) 

228 ['Z_COORDINATE FEATURE 0' 'Z_COORDINATE FEATURE 1'] 

229 >>> print(da.coords['traj_num'].values) 

230 [1] 

231 >>> print(da.attrs['time_units']) 

232 ps 
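
The ndim == 4 case works analogously. The following lines are a hedged
sketch (not part of the original example) that reuses the `traj` from
above and feeds it the cartesian coordinates:

>>> positions = np.expand_dims(traj.xyz, 0)
>>> da = construct_xarray_from_numpy(traj, positions, 'positions')
>>> print(da.dims)
('traj_num', 'frame_num', 'ATOM', 'COORDS')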

233 

234 """ 

235 if check_n_frames: 

236 if not any(s == traj.n_frames for s in data.shape): 

237 raise Exception( 

238 f"Can't add CV with name '{name}' to trajectory. The trajectory has " 

239 f"{traj.n_frames} frames. The data has a shape of {data.shape} " 

240 f"No dimension of data matches the frames of the trajectory." 

241 ) 

242 if traj.backend == "no_load": 

243 with_time = False 

244 else: 

245 with_time = True 

246 if data.ndim == 2: 

247 if labels is None: 

248 labels = [f"{name.upper()} FEATURE"] 

249 da, _ = make_frame_CV_dataarray( 

250 labels, traj, name, data, deg=deg, with_time=with_time 

251 ) 

252 elif data.ndim == 3: 

253 if labels is None: 

254 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])] 

255 da, _ = make_dataarray(labels, traj, name, data, deg=deg, with_time=with_time) 

256 elif data.ndim == 4: 

257 if labels is None: 

258 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-2])] 

259 da = make_position_dataarray_from_numpy( 

260 labels, 

261 traj, 

262 name, 

263 data, 

264 deg=deg, 

265 with_time=with_time, 

266 ) 

267 else: 

268 raise Exception( 

269 f"The provided data has a dimensionality of {data.ndim}, but only 2, 3 and 4 are supported." 

270 ) 

271 return da 

272 

273 

274def unpack_data_and_feature( 

275 feat: Featurizer, 

276 traj: SingleTraj, 

277 input_data: np.ndarray, 

278) -> xr.Dataset: 

279 """Makes a `xarray.Dataset` from data and a featurizer. 

280 

281 Usually, if you add multiple features to a featurizer, they are 

282 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

283 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

284 If you then also add the end-to-end distance as a feature, the data returned by 

285 the featurizer will have the shape (20, 7). This function unstacks the data again, 

286 so that the resulting Dataset contains one appropriately named DataArray 

287 per feature. 

288 

289 Args: 

290 feat (encodermap.loading.Featurizer): An instance of the currently used `encodermap.loading.Featurizer`. 

291 traj (encodermap.trajinfo.SingleTraj): An instance of `encodermap.SingleTraj`, that the data 

292 in `input_data` was computed from. 

293 input_data (np.ndarray): The data, as returned from PyEMMA. 

294 

295 Returns: 

296 xarray.Dataset: An `xarray.Dataset` with all features in a nice format. 
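
Example (a hypothetical sketch, skipped in doctests; `feat`, `traj` and
`input_data` are assumed to already be set up as described above):

>>> ds = unpack_data_and_feature(feat, traj, input_data)  # doctest: +SKIP
>>> # one DataArray per feature, plus the per-feature index arrays
>>> len(ds.data_vars) >= len(feat.features)  # doctest: +SKIP
True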

297 

298 """ 

299 # Local Folder Imports 

300 from ..trajinfo.trajinfo_utils import trajs_combine_attrs 

301 

302 # this needs to be done, because pyemma concatenates the data 

303 # along the feature axis 

304 indices = get_indices_by_feature_dim(feat, traj, input_data.shape) 

305 

306 DAs = {} 

307 indexes = {} 

308 for f, ind in zip(feat.features, indices): 

309 data = input_data[:, ind] 

310 if data.shape[-1] == 1 and f._dim != 1: 

311 data = data.squeeze(-1) 

312 

313 # decide on the name 

314 try: 

315 name = FEATURE_NAMES[f.name] 

316 except (KeyError, AttributeError): 

317 if hasattr(f, "name"): 

318 if isinstance(f.name, str): 

319 name = f.name 

320 if "mdtraj.trajectory" in f.name.lower(): 

321 f.name = f.__class__.__name__ 

322 name = f.__class__.__name__ 

323 else: 

324 name = f.__class__.__name__ 

325 f.name = name 

326 else: 

327 name = f.__class__.__name__ 

328 if name == "CustomFeature": 

329 name = f.describe()[0].split()[0] 

330 f.name = name 

331 assert data.shape[1] == f._dim 

332 

333 if data.shape[0] != len(traj): 

334 if traj.index == (None,): 

335 raise Exception( 

336 "Shape of provided data does not fit traj. Traj " 

337 f"has {traj.n_frames=} {len(traj)=}. Data has shape {data.shape}" 

338 ) 

339 else: 

340 data = data[traj.index].squeeze(0) 

341 assert data.shape[0] == len(traj), f"{data.shape=}, {len(traj)=}" 

342 if name in DAs: 

343 name = ( 

344 name 

345 + f"_{len(list(filter(lambda x: True if name in x else False, list(DAs.keys()))))}" 

346 ) 

347 if hasattr(f, "deg"): 

348 deg = f.deg 

349 else: 

350 deg = None 

351 

352 if ( 

353 f.name in ["AllCartesians", "CentralCartesians", "SideChainCartesians"] 

354 or f.atom_feature 

355 ): 

356 data = data.reshape(len(traj), -1, 3) 

357 data = np.expand_dims(data, axis=0) 

358 if hasattr(feat, "indices_by_top"): 

359 f.indexes = feat.indices_by_top[traj.top][f.name] 

360 DAs[name], indexes[name + "_feature_indices"] = make_position_dataarray( 

361 f.describe(), traj, name, data, deg=deg, feat=f 

362 ) 

363 else: 

364 data = np.expand_dims(data, axis=0) 

365 if f._dim == 1: 

366 DAs[name], indexes[name + "_feature_indices"] = make_frame_CV_dataarray( 

367 f.describe(), 

368 traj, 

369 name, 

370 data, 

371 deg=deg, 

372 feat=f, 

373 ) 

374 else: 

375 if hasattr(feat, "indices_by_top"): 

376 f.indexes = feat.indices_by_top[traj.top][f.name] 

377 DAs[name], indexes[name + "_feature_indices"] = make_dataarray( 

378 f.describe(), traj, name, data, deg=deg, feat=f 

379 ) 

380 if indexes[name + "_feature_indices"] is None: 

381 del indexes[name + "_feature_indices"] 

382 DAs_and_indxes = DAs | indexes 

383 attrs = [] 

384 for i in DAs_and_indxes.values(): 

385 attrs.append(i.attrs) 

386 try: 

387 ds = xr.Dataset(DAs_and_indxes, attrs=trajs_combine_attrs(attrs)) 

388 except ValueError as e: 

389 raise Exception(f"{DAs_and_indxes.keys()=}") from e 

390 return ds 

391 

392 

393def make_frame_CV_dataarray( 

394 labels: list[str], 

395 traj: SingleTraj, 

396 name: str, 

397 data: np.ndarray, 

398 deg: Union[None, bool], 

399 with_time: bool = True, 

400 frame_indices: Optional[np.ndarray] = None, 

401 labels2: Optional[list[str]] = None, 

402 feat: Optional["AnyFeature"] = None, 

403) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

404 """Make a DataArray from a frame CV feature. 

405 

406 A normal feature yields multiple values per trajectory frame (e.g. the 

407 backbone dihedral angles give 2 * n_residues features per frame). A frame 

408 CV feature is just a single value per frame. Examples are: 

409 * The distance between a binding pocket and a ligand. 

410 * The volume of the unitcell. 

411 * Whether a molecule is in state 1 or 2, which could be described as a binary frame CV feature. 

412 

413 This method has additional logic. If `data` has a shape of (1, n_frames, 1) it 

414 is left as is. If it has a shape of (1, n_frames), it will be expanded on the -1 axis to 

415 shape (1, n_frames, 1). 

416 

417 Note: 

418 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

419 

420 Args: 

421 labels (list[str]): The labels that specify the `CV_num` dimension. This 

422 requires the expression `len(labels) == data.shape[2]` to be True. If you 

423 build the DataArray from a feature, the `labels` argument usually will 

424 be `feature.describe()`. 

425 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

426 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

427 an xarray.DataArray always represents one feature of one trajectory. 

428 A trajectory can have multiple features, which is represented as an 

429 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

430 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

431 dataset is created ad hoc, by merging the datasets of the trajectories along 

432 the `traj_num` axis. 

433 name (str): The name of the feature. This name will be used to group similar 

434 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

435 construct this DataArray from a feature, you can either use `feature.name`, 

436 or `feature.__class__.__name__`. 

437 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

438 deg (Union[None, bool]): Whether the provided data is in deg or radians. 

439 If None is provided, it will not be included in the attrs. 

440 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

441 the frames to the `frame_num` axis. Can be either True, or False, or a 

442 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

443 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

444 This can come in handy if the trajectory is loaded out of memory and 

445 distributed, in which case MDAnalysis is used to load the data. 

446 MDTraj never loads the trajectory coordinates, so the attribute 

447 `_original_frame_indices` in the trajectory will be empty. However, as 

448 the distributed task has already assigned frames to the workers, we have 

449 this number and just need to use it. If set to None, `_original_frame_indices` 

450 will be used. Otherwise, the `np.ndarray` provided here will be used. 

451 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

452 This can be especially useful for trajectory ensembles, which might differ 

453 in a lot of places (i.e., single amino acid exchange). If the labels are 

454 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

455 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

456 be done. Defaults to None. 

457 

458 Returns: 

459 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting DataArray and the accompanying indices DataArray, or (DataArray, None) if no feature was provided. 
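
Example (a hedged sketch, skipped in doctests; `traj` is assumed to be a
loaded `SingleTraj` and the CV is a per-frame scalar such as a cluster
membership):

>>> import numpy as np  # doctest: +SKIP
>>> membership = np.zeros((1, traj.n_frames))  # doctest: +SKIP
>>> da, ind = make_frame_CV_dataarray(
...     ["MEMBERSHIP"], traj, "membership", membership, deg=None
... )  # doctest: +SKIP
>>> da.ndim  # the (1, n_frames) input was expanded to (1, n_frames, 1)  # doctest: +SKIP
3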

460 

461 """ 

462 if data.ndim == 3: 

463 pass 

464 elif data.ndim == 2: 

465 data = np.expand_dims(data, -1) 

466 # data = _cast_to_int_maybe(data) 

467 

468 return make_dataarray( 

469 labels, traj, name, data, deg, with_time, frame_indices, labels2, feat 

470 ) 

471 

472 

473def make_position_dataarray_from_numpy( 

474 atoms, traj, name, data, deg=None, with_time=True 

475): 

476 """Same as :func:`make_dataarray`, but with higher feature dimension.""" 

477 attrs = { 

478 "length_units": "nm", 

479 "time_units": "ps", 

480 "full_path": traj.traj_file, 

481 "topology_file": traj.top_file, 

482 "feature_axis": "ATOM", 

483 } 

484 if deg is not None: 

485 if deg: 

486 attrs["angle_units"] = "deg" 

487 else: 

488 attrs["angle_units"] = "rad" 

489 frame_indices = traj.id 

490 if frame_indices.ndim > 1: 

491 frame_indices = frame_indices[:, 1] 

492 da = xr.DataArray( 

493 data, 

494 coords={ 

495 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

496 "traj_name": ("traj_num", np.asarray([traj.basename])), 

497 "frame_num": ("frame_num", frame_indices), 

498 "ATOM": ("ATOM", np.asarray(atoms)), 

499 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]), 

500 }, 

501 dims=["traj_num", "frame_num", "ATOM", "COORDS"], 

502 name=name, 

503 attrs=attrs, 

504 ) 

505 if with_time: 

506 da = da.assign_coords(time=("frame_num", traj.time)) 

507 return da 

508 

509 

510@overload 

511def make_position_dataarray( 

512 labels: list[str], 

513 traj: SingleTraj, 

514 name: str, 

515 data: np.ndarray, 

516 deg: Union[None, bool] = None, 

517 with_time: bool = True, 

518 frame_indices: Optional[np.ndarray] = None, 

519 labels2: Optional[list[str]] = None, 

520 feat: Optional["AnyFeature"] = None, 

521) -> tuple[xr.DataArray, None]: ... 

522 

523 

524@overload 

525def make_position_dataarray( 

526 labels: list[str], 

527 traj: SingleTraj, 

528 name: str, 

529 data: np.ndarray, 

530 deg: Union[None, bool] = None, 

531 with_time: bool = True, 

532 frame_indices: Optional[np.ndarray] = None, 

533 labels2: Optional[list[str]] = None, 

534 feat: Optional["AnyFeature"] = "AnyFeature", 

535) -> tuple[xr.DataArray, xr.DataArray]: ... 

536 

537 

538def make_position_dataarray( 

539 labels: list[str], 

540 traj: SingleTraj, 

541 name: str, 

542 data: np.ndarray, 

543 deg: Union[None, bool] = None, 

544 with_time: bool = True, 

545 frame_indices: Optional[np.ndarray] = None, 

546 labels2: Optional[list[str]] = None, 

547 feat: Optional["AnyFeature"] = None, 

548) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

549 """Creates DataArray belonging to cartesian positions. 

550 

551 Similar to `make_dataarray`, but the shapes are even larger. As every atom 

552 contributes 3 coordinates (x, y, z) to the data, the shape of the returned 

553 DataArray is (1, no_of_frames, no_of_atoms_considered, 3). 

554 

555 Note: 

556 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

557 

558 Args: 

559 labels (list[str]): The labels that specify the `CV_num` dimension. This 

560 requires the expression `len(labels) == data.shape[2]` to be True. If you 

561 build the DataArray from a feature, the `labels` argument usually will 

562 be `feature.describe()`. 

563 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

564 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

565 an xarray.DataArray always represents one feature of one trajectory. 

566 A trajectory can have multiple features, which is represented as an 

567 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

568 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

569 dataset is created ad hoc, by merging the datasets of the trajectories along 

570 the `traj_num` axis. 

571 name (str): The name of the feature. This name will be used to group similar 

572 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

573 construct this DataArray from a feature, you can either use `feature.name`, 

574 or `feature.__class__.__name__`. 

575 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

576 deg (bool): Whether the provided data is in deg or radians. 

577 If None is provided, it will not be included in the attrs. Defaults to None. 

578 with_time (Union[bool, np.ndarray]): Whether to add the time of 

579 the frames to the `frame_num` axis. Can be either True, or False, or a 

580 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

581 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

582 This can come in handy if the trajectory is loaded out of memory and 

583 distributed, in which case MDAnalysis is used to load the data. 

584 MDTraj never loads the trajectory coordinates, so the attribute 

585 `_original_frame_indices` in the trajectory will be empty. However, as 

586 the distributed task has already assigned frames to the workers, we have 

587 this number and just need to use it. If set to None, `_original_frame_indices` 

588 will be used. Otherwise, the `np.ndarray` provided here will be used. 

589 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

590 This can be especially useful for trajectory ensembles, which might differ 

591 in a lot of places (i.e., single amino acid exchange). If the labels are 

592 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

593 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

594 be done. Defaults to None. 

595 

596 Returns: 

597 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting dataarrays, 

598 as a tuple of (data, indices) or (data, None), if indices were not 

599 requested. 
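
Example (a small sketch of how the per-coordinate labels from
`feature.describe()` are condensed to one label per atom; the label text
is taken from the module docstrings above and only serves as an
illustration):

>>> labels = ["POSITION OF ATOM H1 IN RESIDUE 1LYS"] * 3
>>> [l[11:].lstrip(" ") for l in labels[::3]]
['ATOM H1 IN RESIDUE 1LYS']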

600 

601 """ 

602 atom_axis: str = "ATOM" 

603 if feat is not None: 

604 if feat.__class__.__name__ == "SideChainCartesians": 

605 atom_axis = "SIDEATOM" 

606 if feat.__class__.__name__ == "AllCartesians": 

607 atom_axis = "ALLATOM" 

608 attrs = { 

609 "length_units": "nm", 

610 "time_units": "ps", 

611 "full_path": traj.traj_file, 

612 "topology_file": traj.top_file, 

613 "feature_axis": atom_axis, 

614 } 

615 if feat is not None: 

616 indices = _get_indexes_from_feat(feat, traj) 

617 else: 

618 indices = None 

619 if deg is not None: 

620 if deg: 

621 attrs["angle_units"] = "deg" 

622 else: 

623 attrs["angle_units"] = "rad" 

624 if frame_indices is None: 

625 frame_indices = traj.id 

626 if frame_indices.ndim > 1: 

627 frame_indices = frame_indices[:, 1] 

628 

629 if labels2 is not None: 

630 labels = labels2 

631 else: 

632 labels = [_[11:].lstrip(" ") for _ in labels[::3]] 

633 

634 coords = { 

635 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

636 "traj_name": ("traj_num", np.asarray([traj.basename])), 

637 "frame_num": ("frame_num", frame_indices), 

638 atom_axis: (atom_axis, np.asarray(labels)), 

639 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]), 

640 } 

641 da = xr.DataArray( 

642 data, 

643 coords=coords, 

644 dims=["traj_num", "frame_num", atom_axis, "COORDS"], 

645 name=name.upper(), 

646 attrs=attrs, 

647 ) 

648 if indices is not None: 

649 if len(indices) // 3 == data.shape[-2] and name.startswith("CustomFeature"): 

650 warnings.warn( 

651 f"Can't find good labels for the feature {name}. The " 

652 f"data from the feature's transform has shape {data[0].shape}. " 

653 f"The indices have indices length {len(indices)}. I will not include the " 

654 f"indices of this feature in the dataset. However, the output data " 

655 f"will be there." 

656 ) 

657 indices = None 

658 elif len(indices) != data.shape[-2]: 

659 indices_copy = indices.copy() 

660 indices = np.full((len(labels),), np.nan, float) 

661 indices[: len(indices_copy)] = indices_copy 

662 

663 if indices is not None: 

664 coords = { 

665 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

666 "traj_name": ("traj_num", np.asarray([traj.basename])), 

667 atom_axis: (atom_axis, np.asarray(labels)), 

668 } 

669 ind_da = xr.DataArray( 

670 np.expand_dims(indices, 0), 

671 coords=coords, 

672 dims=["traj_num", atom_axis], 

673 name=name.upper(), 

674 attrs=attrs | {"feature_axis": atom_axis}, 

675 ) 

676 else: 

677 ind_da = None 

678 else: 

679 ind_da = None 

680 if isinstance(with_time, bool): 

681 if with_time: 

682 da = da.assign_coords(time=("frame_num", traj.time)) 

683 else: 

684 da = da.assign_coords(time=("frame_num", with_time)) 

685 return da, ind_da 

686 

687 

688@overload 

689def make_dataarray( 

690 labels: list[str], 

691 traj: SingleTraj, 

692 name: str, 

693 data: np.ndarray, 

694 deg: Union[None, bool], 

695 with_time: bool = True, 

696 frame_indices: Optional[np.ndarray] = None, 

697 labels2: Optional[list[str]] = None, 

698 feat: Optional["AnyFeature"] = None, 

699) -> tuple[xr.DataArray, None]: ... 

700 

701 

702@overload 

703def make_dataarray( 

704 labels: list[str], 

705 traj: SingleTraj, 

706 name: str, 

707 data: np.ndarray, 

708 deg: Union[None, bool], 

709 with_time: bool = True, 

710 frame_indices: Optional[np.ndarray] = None, 

711 labels2: Optional[list[str]] = None, 

712 feat: Optional["AnyFeature"] = "AnyFeature", 

713) -> tuple[xr.DataArray, xr.DataArray]: ... 

714 

715 

716def make_dataarray( 

717 labels: list[str], 

718 traj: SingleTraj, 

719 name: str, 

720 data: np.ndarray, 

721 deg: Union[None, bool], 

722 with_time: bool = True, 

723 frame_indices: Optional[np.ndarray] = None, 

724 labels2: Optional[list[str]] = None, 

725 feat: Optional["AnyFeature"] = None, 

726) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

727 """Creates a DataArray belonging to a feature. 

728 

729 The shapes are a bit different from what most people might be used to. As 

730 EncoderMap was meant to work with ensembles of trajectories, the data is usually 

731 shaped as (traj_num, frame_num, CV_num), or even (traj_num, frame_num, atom_num, 3). 

732 

733 The `xarray.DataArray` that is returned by this function reflects this. As a 

734 DataArray is attributed to a single trajectory, the first shape will always be 1. 

735 So for a collective variable that describes n features, the shape of the returned 

736 DataArray will be (1, n_frames, n). When combining multiple trajectories, the first number 

737 can increase. 

738 

739 Note: 

740 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

741 

742 Args: 

743 labels (list[str]): The labels that specify the `CV_num` dimension. This 

744 requires the expression `len(labels) == data.shape[2]` to be True. If you 

745 build the DataArray from a feature, the `labels` argument usually will 

746 be `feature.describe()`. 

747 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

748 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

749 an xarray.DataArray always represents one feature of one trajectory. 

750 A trajectory can have multiple features, which is represented as an 

751 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

752 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

753 dataset is created ad hoc, by merging the datasets of the trajectories along 

754 the `traj_num` axis. 

755 name (str): The name of the feature. This name will be used to group similar 

756 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

757 construct this DataArray from a feature, you can either use `feature.name`, 

758 or `feature.__class__.__name__`. 

759 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

760 deg (Optional[bool]): Whether the provided data is in degrees or radians. If None 

761 is provided, the angle_units will not appear in the attributes. 

762 Defaults to None. 

763 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

764 the frames to the `frame_num` axis. Can be either True, or False, or a 

765 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

766 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

767 This can come in handy if the trajectory is loaded out of memory and 

768 distributed, in which case MDAnalysis is used to load the data. 

769 MDTraj never loads the trajectory coordinates, so the attribute 

770 `_original_frame_indices` in the trajectory will be empty. However, as 

771 the distributed task has already assigned frames to the workers, we have 

772 this number and just need to use it. If set to None, `_original_frame_indices` 

773 will be used. Otherwise, the `np.ndarray` provided here will be used. 

774 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

775 This can be especially useful for trajectory ensembles, which might differ 

776 in a lot of places (i.e., single amino acid exchange). If the labels are 

777 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

778 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

779 be done. Defaults to None. 

780 

781 Returns: 

782 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting DataArray and the accompanying indices DataArray, or (DataArray, None) if no feature was provided. 
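
Example (a hedged sketch, skipped in doctests; `traj` is assumed to be a
loaded `SingleTraj` and the data holds three values per frame):

>>> import numpy as np  # doctest: +SKIP
>>> data = np.expand_dims(np.random.random((traj.n_frames, 3)), 0)  # doctest: +SKIP
>>> da, ind = make_dataarray(["a", "b", "c"], traj, "my_cv", data, deg=None)  # doctest: +SKIP
>>> da.dims  # doctest: +SKIP
('traj_num', 'frame_num', 'MY_CV')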

783 

784 """ 

785 if feat is not None: 

786 indices = _get_indexes_from_feat(feat, traj) 

787 else: 

788 indices = None 

789 attrs = { 

790 "length_units": "nm", 

791 "time_units": "ps", 

792 "full_path": traj.traj_file, 

793 "topology_file": traj.top_file, 

794 "feature_axis": name.upper(), 

795 } 

796 if deg is not None: 

797 if deg: 

798 attrs["angle_units"] = "deg" 

799 else: 

800 attrs["angle_units"] = "rad" 

801 

802 if frame_indices is None: 

803 frame_indices = traj.id 

804 if frame_indices.ndim > 1: 

805 frame_indices = frame_indices[:, 1] 

806 

807 if labels2 is not None: 

808 labels = labels2 

809 

810 if callable(labels): 

811 labels = [l for l in labels(traj.top)] 

812 else: 

813 if len(labels) > data.shape[-1]: 

814 warnings.warn( 

815 f"Provided labels {labels[:5]} are greater {len(labels)} than the " 

816 f"data {data.shape[-1]}. I will use integers to label the feature axis." 

817 ) 

818 labels = np.arange(data.shape[-1]) 

819 

820 coords = { 

821 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

822 "traj_name": ("traj_num", np.asarray([traj.basename])), 

823 "frame_num": ("frame_num", frame_indices), 

824 name.upper(): np.asarray(labels), 

825 } 

826 

827 da = xr.DataArray( 

828 data, 

829 coords=coords, 

830 dims=["traj_num", "frame_num", name.upper()], 

831 name=name, 

832 attrs=attrs, 

833 ) 

834 

835 ind_da = xr.DataArray() 

836 if indices is not None: 

837 index_labels = np.asarray(labels) 

838 coords = { 

839 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

840 "traj_name": ("traj_num", np.asarray([traj.basename])), 

841 name.upper(): index_labels, 

842 } 

843 try: 

844 indices = np.asarray(indices) 

845 inhomogeneous_shape = False 

846 except ValueError as e: 

847 if "inhomogeneous" in str(e): 

848 inhomogeneous_shape = True 

849 else: 

850 raise e 

851 # if the shape of indices is inhomogeneous, we can't put them into a DataArray 

852 if inhomogeneous_shape or any( 

853 [i in feat.__class__.__name__.lower() for i in ["groupcom", "residuecom"]] 

854 ): 

855 warnings.warn( 

856 f"The feature {name} will not produce a '{name}_feature_indices' " 

857 f"xr.DataArray. Its indices contain either inhomogeneous shapes " 

858 f"({indices}) or it is a center-of-mass (COM) feature. These " 

859 f"features are excluded from providing indices. " 

860 f"I will put a string representation of these " 

861 f"indices into the DataArray's `attrs` dictionary. Thus, it " 

862 f"can still be saved to disk and viewed, but it will be " 

863 f"a string and not a sequence of integers." 

864 ) 

865 da.attrs |= {f"{name}_feature_indices": str(indices)} 

866 # special case: PyEMMA feature with cartesians 

867 elif len(indices) == data.shape[-1] // 3 and "selection" in feat.name.lower(): 

868 indices = np.vstack([indices for i in range(3)]).flatten(order="F") 

869 ind_da = xr.DataArray( 

870 np.expand_dims(np.expand_dims(indices, 0), -1), 

871 coords=coords 

872 | { 

873 "ATOM_NO": [0], 

874 }, 

875 dims=["traj_num", name.upper(), "ATOM_NO"], 

876 name=name.upper(), 

877 attrs=attrs | {"feature_axis": name.upper()}, 

878 ) 

879 # special case: RMSD features 

880 # rmsd features provide their alignment atoms 

881 elif "rmsd" in feat.__class__.__name__.lower(): 

882 ind_da = xr.DataArray( 

883 np.expand_dims(np.expand_dims(indices, 0), 0), 

884 coords=coords 

885 | { 

886 "RMSD_ATOM_NO": indices, 

887 }, 

888 dims=["traj_num", name.upper(), "RMSD_ATOM_NO"], 

889 name=name.upper(), 

890 attrs=attrs | {"feature_axis": name.upper()}, 

891 ) 

892 # special case: Align feature 

893 elif "align" in feat.__class__.__name__.lower(): 

894 indices = np.vstack([indices for i in range(3)]).flatten(order="F") 

895 ind_da = xr.DataArray( 

896 np.expand_dims(np.expand_dims(indices, 0), -1), 

897 coords=coords 

898 | { 

899 "ALIGN_ATOM_NO": [0], 

900 }, 

901 dims=["traj_num", name.upper(), "ALIGN_ATOM_NO"], 

902 name=name.upper(), 

903 attrs=attrs | {"feature_axis": name.upper()}, 

904 ) 

905 elif feat.__class__.__name__ == "ResidueMinDistanceFeature": 

906 # indices = np.hstack(indices) 

907 # data = np.vstack([indices for i in range(3)]).flatten(order="F") 

908 ind_da = xr.DataArray( 

909 np.expand_dims(indices, 0), 

910 coords=coords | {"RES_NO": [0, 1]}, 

911 dims=["traj_num", name.upper(), "RES_NO"], 

912 name=name.upper(), 

913 attrs=attrs | {"feature_axis": name.upper()}, 

914 ) 

915 elif len(indices) != data.shape[-1]: 

916 if all(["COS" in i for i in index_labels[::2]]) and all( 

917 ["SIN" in i for i in index_labels[1::2]] 

918 ): 

919 c = np.empty((indices.shape[0] * 2, 3), dtype=int) 

920 c[0::2] = indices 

921 c[1::2] = indices 

922 indices = np.expand_dims(c, 0) 

923 ind_da = xr.DataArray( 

924 indices, 

925 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])}, 

926 dims=["traj_num", name.upper(), "ATOM_NO"], 

927 name=name.upper(), 

928 attrs=attrs | {"feature_axis": name.upper()}, 

929 ) 

930 else: 

931 raise Exception( 

932 f"{feat=}\n\n" 

933 f"{feat.name=}\n\n" 

934 f"{inhomogeneous_shape=}\n\n" 

935 f"{len(indices)=}\n\n" 

936 f"{indices.shape=}\n\n" 

937 f"{indices.ndim=}\n\n" 

938 f"{data.shape=}\n\n" 

939 f"{indices=}\n\n" 

940 f"{data=}\n\n" 

941 f"{index_labels=}\n\n" 

942 f"{len(indices) == data.shape[-1] // 3=}\n\n" 

943 f"{'selection' in feat.name.lower()=}" 

944 ) 

945 # regular case indices and data are aligned 

946 else: 

947 indices = np.expand_dims(indices, 0) 

948 if indices.ndim != data.ndim: 

949 indices = np.expand_dims(indices, -1) 

950 ind_da = xr.DataArray( 

951 indices, 

952 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])}, 

953 dims=["traj_num", name.upper(), "ATOM_NO"], 

954 name=name.upper(), 

955 attrs=attrs | {"feature_axis": name.upper()}, 

956 ) 

957 if ind_da.size > 1: 

958 assert len(ind_da.coords[name.upper()]) == len(da.coords[name.upper()]) 

959 

960 if isinstance(with_time, bool): 

961 if with_time: 

962 da = da.assign_coords(time=("frame_num", traj.time)) 

963 else: 

964 da = da.assign_coords(time=("frame_num", with_time)) 

965 if ind_da.size == 1: 

966 ind_da = None 

967 return da, ind_da 

968 

969 

970def get_indices_by_feature_dim(feat, traj, input_data_shape): 

971 """Unpacks the concatenated features returned by PyEMMA. 

972 

973 Usually, if you add multiple features to a PyEMMA featurizer, they are 

974 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

975 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

976 If you then also add the end-to-end distance as a feature, the data returned by 

977 PyEMMA will have the shape (20, 7). This function returns the correct indices, 

978 so that iteration of `zip(Featurizer.active_features, indices)` will yield the 

979 correct results. 

980 

981 Args: 

982 feat (em.Featurizer): Instance of `encodermap.Featurizer`. The featurizer knows 

983 how many entries every feature takes up in the feature space. 

984 traj (encodermap.SingleTraj): An instance of `SingleTraj`. This is needed 

985 for a single purpose: PyEMMA returns malformed data for .pdb files that 

986 are loaded from the Protein Data Bank 

987 (i.e. `traj = em.SingleTraj('https://files.rcsb.org/view/1GHC.pdb')`). 

988 This is prevented by providing the traj and doing a small check. 

989 input_data_shape (tuple): The shape of the data that should conform to the slicing. 

990 Also needed for some checks. 

991 

992 Returns: 

993 list[np.ndarray]: List of `np.ndarray`s that correctly slice the data. 
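
Example (a minimal illustration of the index arithmetic used below,
assuming two features with dimensions 6 and 1, as in the docstring above):

>>> import numpy as np
>>> bounds = [0] + add_one_by_one([6, 1])
>>> bounds
[0, 6, 7]
>>> [np.arange(i, j) for i, j in zip(bounds[:-1], bounds[1:])]
[array([0, 1, 2, 3, 4, 5]), array([6])]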

994 

995 """ 

996 if len(feat.features) > 1: 

997 indices = [0] + add_one_by_one([f._dim for f in feat.features]) 

998 # Since deprecating PyEMMA, we can remove this part 

999 # if traj.extension == ".pdb" and _validate_uri(traj.traj_file): 

1000 # # for internet pdb files we need to slice the data. this negates the next assert but meh. 

1001 # raise Exception( 

1002 # "For some reason `pyemma.coordinates.source` does not like working with internet pdb files." 

1003 # ) 

1004 # else: 

1005 if len(input_data_shape) <= 1: 

1006 raise Exception(f"{feat=}, {traj=}, {input_data_shape=}") 

1007 if not indices[-1] == input_data_shape[1]: 

1008 raise Exception( 

1009 f"The indices in the features do not match the input data shape: " 

1010 f"{indices[-1]=} {input_data_shape=} " 

1011 f"{sum([len(f.describe()) for f in feat.features])=}\n\n" 

1012 f"{indices=}\n\n{input_data_shape=}" 

1013 ) 

1014 indices = [np.arange(i, j) for i, j in zip(indices[:-1], indices[1:])] 

1015 else: 

1016 indices = [np.arange(0, feat.features[0]._dim)] 

1017 return indices 

1018 

1019 

1020def add_one_by_one(l: list[Number]) -> list[Number]: 

1021 """Creates a new list from l with elements added one after another. 

1022 

1023 Args: 

1024 l (list): The input list. 

1025 

1026 Returns: 

1027 list: The output list. 

1028 

1029 Example: 

1030 >>> l = [0, 2, 4, 5, 7] 

1031 >>> add_one_by_one(l) 

1032 [0, 2, 6, 11, 18] 

1033 

1034 """ 

1035 new_l = [] 

1036 cumsum = 0 

1037 for elt in l: 

1038 cumsum += elt 

1039 new_l.append(cumsum) 

1040 return new_l