Coverage for encodermap/misc/xarray.py: 71%

156 statements  

coverage.py v7.1.0, created at 2023-02-07 11:05 +0000

1# -*- coding: utf-8 -*- 

2# encodermap/misc/xarray.py 

3################################################################################ 

4# Encodermap: A python library for dimensionality reduction. 

5# 

6# Copyright 2019-2022 University of Konstanz and the Authors 

7# 

8# Authors: 

9# Kevin Sawade, Tobias Lemke 

10# 

11# Encodermap is free software: you can redistribute it and/or modify 

12# it under the terms of the GNU Lesser General Public License as 

13# published by the Free Software Foundation, either version 2.1 

14# of the License, or (at your option) any later version. 

15# This package is distributed in the hope that it will be useful to other 

16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the 

17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 

18# See the GNU Lesser General Public License for more details. 

19# 

20# See <http://www.gnu.org/licenses/>. 

21################################################################################ 

22################################################################################ 

23# Imports 

24################################################################################ 

25 

26 

27from __future__ import annotations 

28 

29import numpy as np 

30 

31from .._optional_imports import _optional_import 

32from .errors import BadError 

33from .misc import FEATURE_NAMES, _validate_uri 

34 

35################################################################################ 

36# Optional Imports 

37################################################################################ 

38 

39 

40xr = _optional_import("xarray") 

41 

42 

43################################################################################ 

44# Typing 

45################################################################################ 

46 

47 

48from typing import TYPE_CHECKING, Optional 

49 

50if TYPE_CHECKING: 

51 import xarray as xr 

52 

53 from ..loading.featurizer import Featurizer 

54 from ..trajinfo import SingleTraj 

55 

56 

57################################################################################ 

58# Globals 

59################################################################################ 

60 

61 

62__all__ = [ 

63 "construct_xarray_from_numpy", 

64 "unpack_data_and_feature", 

65] 

66 

67 

68################################################################################ 

69# Functions 

70################################################################################ 

71 

72 

73def construct_xarray_from_numpy( 

74 traj: SingleTraj, 

75 data: np.ndarray, 

76 name: str, 

77 labels: Optional[list[str]] = None, 

78 check_n_frames: bool = False, 

79) -> xr.DataArray: 

80 """Constructs an xarray dataarray from a numpy array. 

81 

82 Three different cases are recognized: 

83 * The input array in data has ndim == 2. This kind of feature/CV is a per-frame feature, like the membership 

84 to clusters. Every frame of every trajectory is assigned a single value (most often int values). 

85 * The input array in data has ndim == 3: This is also a per-frame feature/CV, but this time every frame 

86 is characterized by a series of values. These values can be dihedral angles in the backbone starting from 

87 the protein's N-terminus to the C-terminus, or pairwise distance features between certain atoms. 

88 The xarray dataarray constructed from this kind of data will have a label dimension that will either 

89 contain generic labels like 'CUSTOM_FEATURE FEATURE 0' or labels defined by the featurizer such as 

90 'SIDECHAIN ANGLE CHI1 OF RESIDUE 1LYS'. 

91 * The input array in data has ndim == 4. Here, the same feature/CV is duplicated for the protein's atoms. 

92 Besides the XYZ coordinates of the atoms, no other CVs should fall into this case. The labels will be 

93 2-dimensional with 'POSITION OF ATOM H1 IN RESIDUE 1LYS' in dimension 0 and either 'X', 'Y' or 'Z' in 

94 dimension 1. 

95 

96 Args: 

97 traj (em.SingleTraj): The trajectory we want to create the xarray dataarray for. 

98 data (np.ndarray): The numpy array we want to create the data from. Note that the data passed into this 

99 function should be expanded with np.expand_dims(a, axis=0), so as to add a new axis to the complete data 

100 containing the trajectories of a trajectory ensemble. 

101 name (str): The name of the feature. This can be chosen freely. Names like 'central_angles', 'backbone_torsions' 

102 would make the most sense. 

103 labels (Optional[list]): If you have specific labels for your CVs in mind, you can overwrite the 

104 generic 'CUSTOM_FEATURE FEATURE 0' labels by providing a list for this argument. If None is provided, 

105 generic names will be given to the features. Defaults to None. 

106 check_n_frames (bool): Whether to check that the number of frames in the trajectory matches the length 

107 of the data in at least one dimension. Defaults to False. 

108 

109 Returns: 

110 xarray.Dataarray: An `xarray.Dataarray`. 

111 

112 Examples: 

113 >>> import encodermap as em 

114 >>> from encodermap.misc.xarray import construct_xarray_from_numpy 

115 >>> # load file from RCSB and give it a traj_num to represent it in a potential trajectory ensemble 

116 >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1) 

117 >>> # single trajectory needs to be expanded into 'trajectory' axis 

118 >>> z_coordinate = np.expand_dims(traj.xyz[:,:,0], 0) 

119 >>> da = construct_xarray_from_numpy(traj, z_coordinate, 'z_coordinate') 

120 >>> print(da.coords['Z_COORDINATE'].values[:2]) 

121 ['Z_COORDINATE FEATURE 0' 'Z_COORDINATE FEATURE 1'] 

122 >>> print(da.coords['traj_num'].values) 

123 [1] 

124 >>> print(da.attrs['time_units']) 

125 ps 

126 

127 """ 

128 if check_n_frames: 

129 assert any(s == traj.n_frames for s in data.shape), ( 

130 f"Data and traj misaligned. traj_frames: {traj.n_frames}, data.shape: {data.shape}, attr_name: {name}, {data}" 

131 ) 

132 if traj.backend == "no_load": 

133 with_time = False 

134 else: 

135 with_time = True 

136 if data.ndim == 2: 

137 if labels is None: 137 ↛ 139

138 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])] 

139 da = make_frame_CV_dataarray(labels, traj, name, data, with_time=with_time) 

140 elif data.ndim == 3: 

141 if labels is None: 141 ↛ 143

142 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])] 

143 da = make_dataarray(labels, traj, name, data, with_time=with_time) 

144 elif data.ndim == 4: 144 ↛ 151

145 if labels is None: 145 ↛ 147

146 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-2])] 

147 da = make_position_dataarray_from_numpy( 

148 labels, traj, name, data, with_time=with_time 

149 ) 

150 else: 

151 raise Exception( 

152 f"The provided data has a dimensionality of {data.ndim}, but only 2, 3 and 4 are supported." 

153 ) 

154 return da 

155 

156 
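As a small, hypothetical illustration of the ndim == 2 branch described in the docstring (per-frame features such as cluster memberships), the following sketch reuses the docstring's own RCSB example; the variable names are made up and the call assumes network access:

>>> import numpy as np
>>> import encodermap as em
>>> from encodermap.misc.xarray import construct_xarray_from_numpy
>>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1)
>>> # one integer value per frame, expanded into the 'trajectory' axis -> ndim == 2
>>> membership = np.expand_dims(np.zeros(traj.n_frames, dtype=int), 0)
>>> da = construct_xarray_from_numpy(traj, membership, 'cluster_membership')
>>> da.shape[0]
1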

157def unpack_data_and_feature( 

158 feat: Featurizer, 

159 traj: SingleTraj, 

160 input_data: np.ndarray, 

161 put_indices_into_attrs: bool = True, 

162) -> xr.Dataset: 

163 """Makes a `xarray.Dataset` from data and a featurizer. 

164 

165 Usually, if you add multiple features to a PyEMMA featurizer, they are 

166 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

167 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

168 If you then also add the end-to-end distance as a feature, the data returned by 

169 PyEMMA will have the shape (20, 7). This function unpacks that concatenated 

170 array feature by feature (using `get_indices_by_feature_dim`) and builds an 

171 `xarray.Dataset` with one DataArray per feature. 

172 

173 Args: 

174 feat (em.Featurizer): An instance of the currently used `encodermap.Featurizer`. 

175 traj (em.SingleTraj): An instance of `encodermap.SingleTraj` that the data 

176 in `input_data` was computed from. 

177 input_data (np.ndarray): The data, as returned from PyEMMA. 

178 put_indices_into_attrs (bool): Whether to put the indices into the attrs. 

179 This needs to be False when Ensembles are loaded, because the Features 

180 of the ensemble load function do not match the real indices that should be there. 

181 

182 Returns: 

183 xarray.Dataset: An `xarray.Dataset` with all features in a nice format. 

184 

185 """ 

186 # this needs to be done, because pyemma concatenates the data 

187 # along the feature axis 

188 indices = get_indices_by_feature_dim(feat, traj, input_data.shape) 

189 

190 DAs = {} 

191 indexes = {} 

192 for f, ind in zip(feat.features, indices): 

193 data = input_data[:, ind] 

194 try: 

195 name = FEATURE_NAMES[f.name] 

196 except (KeyError, AttributeError): 

197 name = f.__class__.__name__ 

198 f.name = name 

199 assert data.shape[1] == f._dim 

200 if data.shape[0] != len(traj): 200 ↛ 201

201 if traj.index is None: 

202 raise Exception( 

203 "Shape of provided data does not fit traj. Traj " 

204 f"has {traj.n_frames=} {len(traj)=}. Data has shape {data.shape}" 

205 ) 

206 else: 

207 data = data[traj.index] 

208 assert data.shape[0] == len(traj) 

209 if name in DAs: 209 ↛ 210

210 name = ( 

211 name 

212 + f"_{len(list(filter(lambda x: True if name in x else False, list(DAs.keys()))))}" 

213 ) 

214 if f.name in ["AllCartesians", "CentralCartesians", "SideChainCartesians"]: 

215 data = data.reshape(len(traj), -1, 3) 

216 data = np.expand_dims(data, axis=0) 

217 DAs[name] = make_position_dataarray(f.describe(), traj, name, data) 

218 else: 

219 data = np.expand_dims(data, axis=0) 

220 DAs[name] = make_dataarray(f.describe(), traj, name, data) 

221 

222 # the indices/indexes used to create this dataarray. 

223 # This can be useful later, e.g. to redo some analysis 

224 # or use tensorflow for the geometric computations of these values. 

225 # some PyEMMA features give the indexes different names, e.g. 'group_definitions' 

226 try: 

227 indexes[name] = f.indexes.tolist() 

228 except AttributeError as e: 

229 for key in f.__dir__(): 

230 if "inde" in key: 

231 indexes[name] = f.__dict__[key] 

232 if ( 

233 f.__class__.__name__ == "GroupCOMFeature" 

234 or f.__class__.__name__ == "ResidueCOMFeature" 

235 ): 

236 indexes[name] = f.group_definitions 

237 if f.__class__.__name__ == "MinRmsdFeature": 

238 indexes[name] = f.atom_indices 

239 if name not in indexes: 239 ↛ 240

240 raise e 

241 ds = xr.Dataset(DAs) 

242 if put_indices_into_attrs: 242 ↛ 244

243 ds = ds.assign_attrs(indexes) 

244 return ds 

245 

246 
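For illustration only, here is a rough sketch (using xarray directly, not this module) of the per-feature Dataset layout that `unpack_data_and_feature` assembles for the 20-frame, 3-residue example above; the feature names and sizes below are made up:

>>> import numpy as np
>>> import xarray as xr
>>> # hypothetical per-feature arrays: 6 Ramachandran angles and 1 end-to-end distance
>>> angles = xr.DataArray(np.zeros((1, 20, 6)), dims=["traj_num", "frame_num", "CENTRAL_DIHEDRALS"])
>>> dist = xr.DataArray(np.zeros((1, 20, 1)), dims=["traj_num", "frame_num", "DISTANCE"])
>>> ds = xr.Dataset({"central_dihedrals": angles, "distance": dist})
>>> sorted(ds.data_vars)
['central_dihedrals', 'distance']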

247def construct_xarray_from_feat(feat, input_data=None): 

248 raise Exception( 

249 "Marked for deprecation, because featurizer returns its own dataset." 

250 ) 

251 

252 

253def make_frame_CV_dataarray( 

254 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None 

255): 

256 """Make a dataarray from a frame CV feature. 

257 

258 A normal feature yields multiple values per trajectory frame (e.g. the 

259 backbone dihedral angles give 2 * n_residues features per frame). A frame 

260 CV feature is just a single value per frame of a trajectory. Examples are: 

261 * The distance between a binding pocket and a ligand. 

262 * The volume of the unitcell. 

263 * Whether a molecule is in state 1 or 2 could be described as a binary frame CV feature. 

264 

265 This method has additional logic. If `data` has a shape of (1, n_frames, 1) it 

266 is left as is. If it has a shape of (1, n_frames), it will be expanded on the -1 axis to 

267 shape (1, n_frames, 1). 

268 

269 Note: 

270 Please make sure that the input data uses nm, ps, and rad units. 

271 

272 Args: 

273 labels (list[str]): The labels that specify the `CV_num` dimension. This 

274 requires the expression `len(labels) == data.shape[2]` to be True. If you 

275 build the dataarray from a feature, the `labels` argument will usually 

276 be `feature.describe()`. 

277 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

278 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

279 an xarray.Dataarray always represents one feature of one trajectory. 

280 A trajectory can have multiple features, which is represented as an 

281 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

282 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

283 dataset is created ad hoc, by merging the datasets of the trajectories along 

284 the `traj_num` axis. 

285 name (str): The name of the feature. This name will be used to group similar 

286 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

287 construct this dataarray from a feature, you can either use `feature.name`, 

288 or `feature.__class__.__name__`. 

289 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with. 

290 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

291 the frames to the `frame_num` axis. Can be either True, or False, or a 

292 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

293 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

294 This can come in handy if the trajectory is loaded out of memory and 

295 distributed, in which case MDAnalysis is used to load the data. 

296 MDTraj never loads the trajectory coordinates, so the attribute 

297 `_original_frame_indices` in the trajectory will be empty. However, as 

298 the distributed task has already assigned frames to the workers, we have 

299 this number and just need to use it. If set to None, `_original_frame_indices` 

300 will be used. Otherwise, the `np.ndarray` provided here will be used. 

301 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

302 This can be especially useful for trajectory ensembles, which might differ 

303 in a lot of places (e.g. a single amino acid exchange). If the labels are 

304 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

305 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

306 be done. Defaults to None. 

307 

308 Returns: 

309 xarray.Dataarray: The resulting dataarray. 

310 

311 """ 

312 if data.ndim == 3: 312 ↛ 313

313 pass 

314 elif data.ndim == 2: 314 ↛ 316

315 data = np.expand_dims(data, -1) 

316 return make_dataarray( 

317 labels, 

318 traj, 

319 name, 

320 data, 

321 with_time, 

322 frame_indices, 

323 labels2, 

324 ) 

325 

326 
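A small numpy-only sketch of the shape handling described above (assumed sizes, for illustration): a frame CV of shape (1, n_frames) is expanded on the last axis before being handed on to `make_dataarray`:

>>> import numpy as np
>>> frame_cv = np.zeros((1, 20))        # one value per frame for a single trajectory
>>> np.expand_dims(frame_cv, -1).shape  # the expansion make_frame_CV_dataarray applies
(1, 20, 1)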

327def make_position_dataarray_from_numpy(atoms, traj, name, data, with_time=True): 

328 frame_indices = traj.id 

329 if frame_indices.ndim > 1: 

330 frame_indices = frame_indices[:, 1] 

331 da = xr.DataArray( 

332 data, 

333 coords={ 

334 "traj_num": ("traj_num", [traj.traj_num]), 

335 "traj_name": ("traj_num", [traj.basename]), 

336 "frame_num": ("frame_num", frame_indices), 

337 "ATOM": ("ATOM", atoms), 

338 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"], 

339 }, 

340 dims=["traj_num", "frame_num", "ATOM", "COORDS"], 

341 name=name, 

342 attrs={ 

343 "length_units": "nm", 

344 "time_units": "ps", 

345 "angle_units": "rad", 

346 "full_path": traj.traj_file, 

347 "topology_file": traj.top_file, 

348 "feature_axis": "ATOM", 

349 }, 

350 ) 

351 if with_time: 

352 da = da.assign_coords(time=("frame_num", traj.time)) 

353 return da 

354 

355 

356def make_position_dataarray( 

357 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None 

358): 

359 """Creates dataarray belonging to cartesian positions. 

360 

361 Similar to `make_dataarray`, but the shapes are even larger. As every atom 

362 contributes 3 coordinates (x, y, z) to the data, the shape of the returned 

363 dataarray is (1, no_of_frames, no_of_atoms_considered, 3). 

364 

365 Note: 

366 Please make sure that the input data uses nm, ps, and rad units. 

367 

368 Args: 

369 labels (list[str]): The labels that specify the `CV_num` dimension. This 

370 requires the expression `len(labels) == data.shape[2]` to be True. If you 

371 build the dataarray from a feature, the `labels` argument will usually 

372 be `feature.describe()`. 

373 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

374 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

375 an xarray.Dataarray always represents one feature of one trajectory. 

376 A trajectory can have multiple features, which is represented as an 

377 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

378 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

379 dataset is created ad hoc, by merging the datasets of the trajectories along 

380 the `traj_num` axis. 

381 name (str): The name of the feature. This name will be used to group similar 

382 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

383 construct this dataarray from a feature, you can either use `feature.name`, 

384 or `feature.__class__.__name__`. 

385 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with. 

386 with_time (Union[bool, np.ndarray]): Whether to add the time of 

387 the frames to the `frame_num` axis. Can be either True, or False, or a 

388 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

389 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

390 This can come in handy if the trajectory is loaded out of memory and 

391 distributed, in which case MDAnalysis is used to load the data. 

392 MDTraj never loads the trajectory coordinates, so the attribute 

393 `_original_frame_indices` in the trajectory will be empty. However, as 

394 the distributed task has already assigned frames to the workers, we have 

395 this number and just need to use it. If set to None, `_original_frame_indices` 

396 will be used. Otherwise, the `np.ndarray` provided here will be used. 

397 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

398 This can be especially useful for trajectory ensembles, which might differ 

399 in a lot of places (e.g. a single amino acid exchange). If the labels are 

400 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

401 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

402 be done. Defaults to None. 

403 

404 Returns: 

405 xarray.Dataarray: The resulting dataarray. 

406 

407 """ 

408 if frame_indices is None: 408 ↛ 418

409 frame_indices = traj.id 

410 if frame_indices.ndim > 1: 

411 frame_indices = frame_indices[:, 1] 

412 # frame_indices = traj._original_frame_indices 

413 # if not np.any(frame_indices): 

414 # frame_indices = traj.id 

415 # if frame_indices.ndim > 1: 

416 # frame_indices = frame_indices[:, 1] 

417 

418 if labels2 is not None: 418 ↛ 419

419 labels = labels2 

420 else: 

421 labels = [_[11:].lstrip(" ") for _ in labels[::3]] 

422 

423 da = xr.DataArray( 

424 data, 

425 coords={ 

426 "traj_num": ("traj_num", [traj.traj_num]), 

427 "traj_name": ("traj_num", [traj.basename]), 

428 "frame_num": ("frame_num", frame_indices), 

429 "ATOM": ("ATOM", labels), 

430 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"], 

431 }, 

432 dims=["traj_num", "frame_num", "ATOM", "COORDS"], 

433 name=name.upper(), 

434 attrs={ 

435 "length_units": "nm", 

436 "time_units": "ps", 

437 "angle_units": "rad", 

438 "full_path": traj.traj_file, 

439 "topology_file": traj.top_file, 

440 "feature_axis": "ATOM", 

441 }, 

442 ) 

443 if isinstance(with_time, bool): 443 ↛ 447

444 if with_time: 444 ↛ 448

445 da = da.assign_coords(time=("frame_num", traj.time)) 

446 else: 

447 da = da.assign_coords(time=("frame_num", with_time)) 

448 return da 

449 

450 
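The resulting layout can be sketched with xarray directly (illustrative shapes only, not a call to `make_position_dataarray`, which additionally needs a `SingleTraj` for its coordinates and attributes):

>>> import numpy as np
>>> import xarray as xr
>>> xyz = np.zeros((1, 20, 5, 3))  # one traj, 20 frames, 5 atoms, x/y/z
>>> da = xr.DataArray(
...     xyz,
...     dims=["traj_num", "frame_num", "ATOM", "COORDS"],
...     coords={"COORDS": ["POSITION X", "POSITION Y", "POSITION Z"]},
... )
>>> da.sizes["ATOM"], da.sizes["COORDS"]
(5, 3)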

451def make_dataarray( 

452 labels, traj, name, data, with_time=True, frame_indices=None, labels2=None 

453): 

454 """Creates a dataarray belonging to a feature. 

455 

456 The shapes are a bit different than what most people might be used to. As 

457 EncoderMap was meant to work with ensembles of trajectories, the data is usually 

458 shaped as (traj_num, frame_num, CV_num), or even (traj_num, frame_num, atom_num, 3). 

459 

460 The `xarray.Dataarray`, that is returned by this function reflects this. As a 

461 dataarray is attributed to a single trajectory, the first shape will always be 1. 

462 So for a collective variable, that describes n features, the shape of the returned 

463 dataarray will be (1, n_frames, n). Combining multiple trajectories, the first number 

464 can increase. 

465 

466 Note: 

467 Please make sure that the input data uses nm, ps, and rad units. 

468 

469 Args: 

470 labels (list[str]): The labels that specify the `CV_num` dimension. This 

471 requires the expression `len(labels) == data.shape[2]` to be True. If you 

472 build the dataarray from a feature, the `labels` argument will usually 

473 be `feature.describe()`. 

474 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

475 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

476 an xarray.Dataarray always represents one feature of one trajectory. 

477 A trajectory can have multiple features, which is represented as an 

478 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

479 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

480 dataset is created ad hoc, by merging the datasets of the trajectories along 

481 the `traj_num` axis. 

482 name (str): The name of the feature. This name will be used to group similar 

483 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

484 construct this dataarray from a feature, you can either use `feature.name`, 

485 or `feature.__class__.__name__`. 

486 data (Union[np.ndarray, dask.array]): The data to fill the dataarray with. 

487 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

488 the frames to the `frame_num` axis. Can be either True, or False, or a 

489 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

490 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

491 This can come in handy if the trajectory is loaded out of memory and 

492 distributed, in which case MDAnalysis is used to load the data. 

493 MDTraj never loads the trajectory coordinates, so the attribute 

494 `_original_frame_indices` in the trajectory will be empty. However, as 

495 the distributed task has already assigned frames to the workers, we have 

496 this number and just need to use it. If set to None, `_original_frame_indices` 

497 will be used. Otherwise, the `np.ndarray` provided here will be used. 

498 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

499 This can be especially useful for trajectory ensembles, which might differ 

500 in a lot of places (e.g. a single amino acid exchange). If the labels are 

501 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

502 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

503 be done. Defaults to None. 

504 

505 Returns: 

506 xarray.Dataarray: The resulting dataarray. 

507 

508 """ 

509 if frame_indices is None: 509 ↛ 519

510 frame_indices = traj.id 

511 if frame_indices.ndim > 1: 

512 frame_indices = frame_indices[:, 1] 

513 # frame_indices = traj._original_frame_indices 

514 # if not np.any(frame_indices): 

515 # frame_indices = traj.id 

516 # if frame_indices.ndim > 1: 

517 # frame_indices = frame_indices[:, 1] 

518 

519 if labels2 is not None: 519 ↛ 520

520 labels = labels2 

521 

522 if callable(labels): 522 ↛ 523

523 labels = [l for l in labels(traj.top)] 

524 else: 

525 if len(labels) > data.shape[-1]: 

526 labels = np.arange(data.shape[-1]) 

527 

528 da = xr.DataArray( 

529 data, 

530 coords={ 

531 "traj_num": ("traj_num", [traj.traj_num]), 

532 "traj_name": ("traj_num", [traj.basename]), 

533 "frame_num": ("frame_num", frame_indices), 

534 name.upper(): labels, 

535 }, 

536 dims=["traj_num", "frame_num", name.upper()], 

537 name=name, 

538 attrs={ 

539 "length_units": "nm", 

540 "time_units": "ps", 

541 "angle_units": "rad", 

542 "full_path": traj.traj_file, 

543 "topology_file": traj.top_file, 

544 "feature_axis": name.upper(), 

545 }, 

546 ) 

547 if isinstance(with_time, bool): 547 ↛ 551

548 if with_time: 

549 da = da.assign_coords(time=("frame_num", traj.time)) 

550 else: 

551 da = da.assign_coords(time=("frame_num", with_time)) 

552 return da 

553 

554 
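Again for illustration only, the (1, n_frames, CV_num) layout described above can be mimicked with plain xarray; the feature name and labels below are made up:

>>> import numpy as np
>>> import xarray as xr
>>> data = np.zeros((1, 20, 6))  # one traj, 20 frames, 6 collective variables
>>> labels = [f"CENTRAL_DIHEDRALS FEATURE {i}" for i in range(6)]
>>> da = xr.DataArray(
...     data,
...     dims=["traj_num", "frame_num", "CENTRAL_DIHEDRALS"],
...     coords={"CENTRAL_DIHEDRALS": labels},
...     attrs={"feature_axis": "CENTRAL_DIHEDRALS"},
... )
>>> da.shape
(1, 20, 6)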

555def make_ensemble_xarray(name, data): 

556 attrs = {"length_units": "nm", "time_units": "ps", "angle_units": "rad"} 

557 if data.ndim == 2: 

558 coords = { 

559 "traj_num": ("traj_num", range(data.shape[0])), 

560 "frame_num": ("frame_num", range(data.shape[1])), 

561 name.upper(): ( 

562 "frame_num", 

563 [f"{name.upper()} Frame Feature {i}" for i in range(data.shape[1])], 

564 ), 

565 } 

566 dims = ["traj_num", "frame_num"] 

567 elif data.ndim == 3: 

568 coords = { 

569 "traj_num": ("traj_num", range(data.shape[0])), 

570 "frame_num": ("frame_num", range(data.shape[1])), 

571 name.upper(): [f"{name.upper()} Feature {i}" for i in range(data.shape[2])], 

572 } 

573 dims = ["traj_num", "frame_num", name.upper()] 

574 elif data.ndim == 4: 

575 coords = { 

576 "traj_num": ("traj_num", range(data.shape[0])), 

577 "frame_num": ("frame_num", range(data.shape[1])), 

578 "ATOM": ("ATOM", [f"ATOM {i}" for i in range(data.shape[2])]), 

579 "COORDS": ["POSITION X", "POSITION Y", "POSITION Z"], 

580 } 

581 dims = ["traj_num", "frame_num", "ATOM", "COORDS"] 

582 else: 

583 raise Exception("Too high dimensional data for ensemble.") 

584 da = xr.DataArray(data, coords=coords, dims=dims, name=name, attrs=attrs) 

585 return da 

586 

587 
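A short sketch of the ndim == 3 branch of `make_ensemble_xarray`, assuming the module is importable under the path shown in the header and using made-up sizes:

>>> import numpy as np
>>> from encodermap.misc.xarray import make_ensemble_xarray
>>> da = make_ensemble_xarray("my_cv", np.zeros((3, 20, 6)))  # 3 trajs, 20 frames, 6 features
>>> da.dims
('traj_num', 'frame_num', 'MY_CV')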

588def get_indices_by_feature_dim(feat, traj, input_data_shape): 

589 """Unpacks the concatenated features returned by PyEMMA. 

590 

591 Usually, if you add multiple features to a PyEMMA featurizer, they are 

592 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

593 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

594 If you then also add the end-to-end distance as a feature, the data returned by 

595 PyEMMA will have the shape (20, 7). This function returns the correct indices, 

596 so that iterating over `zip(Featurizer.active_features, indices)` will yield the 

597 correct results. 

598 

599 Args: 

600 feat (em.Featurizer): Instance of `encodermap.Featurizer`. The featurizer knows 

601 how many entries every feature takes up in the feature space. 

602 traj (encodermap.SingleTraj): An instance of `SingleTraj`. This is needed 

603 for a single purpose: PyEMMA returns malformed data for .pdb files, that 

604 are loaded from the protein database 

605 (i.e. `traj = em.SingleTraj('https://files.rcsb.org/view/1GHC.pdb'`). 

606 This is prevented by providing the traj and do a small check. 

607 input_data_shape (tuple): The data, that should conform to the slicing. 

608 Also needed for some checks. 

609 

610 Returns: 

611 list[np.ndarray]: List of `np.ndarray`s that correctly slice the data. 

612 

613 """ 

614 if len(feat.features) > 1: 

615 indices = [0] + add_one_by_one([f._dim for f in feat.features]) 

616 if traj.extension == ".pdb" and _validate_uri(traj.traj_file): 616 ↛ 618line 616 didn't jump to line 618, because the condition on line 616 was never true

617 # for internet pdb files we need to slice the data. this negates the next assert but meh. 

618 raise BadError( 

619 "For some reason pyemma.coordinates.source does not like working with internet pdb files." 

620 ) 

621 else: 

622 if not indices[-1] == input_data_shape[1]: 622 ↛ 623

623 for f in feat.features: 

624 print(f.__class__) 

625 print(f.describe()) 

626 print(len(f.describe())) 

627 print(f._dim) 

628 print(traj, indices, input_data_shape) 

629 raise Exception 

630 indices = [np.arange(i, j) for i, j in zip(indices[:-1], indices[1:])] 

631 else: 

632 indices = [np.arange(0, feat.features[0]._dim)] 

633 return indices 

634 

635 
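To make the slicing step concrete, here is a numpy-only sketch with the docstring's numbers (6 Ramachandran angles plus 1 end-to-end distance, assumed values): the cumulative feature dimensions become boundaries, and each feature gets its own index array:

>>> import numpy as np
>>> bounds = [0, 6, 7]  # [0] + add_one_by_one([6, 1])
>>> [np.arange(i, j) for i, j in zip(bounds[:-1], bounds[1:])]
[array([0, 1, 2, 3, 4, 5]), array([6])]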

636def add_one_by_one(l): 

637 """Creates a new list from l with elements added one after another. 

638 

639 Args: 

640 l (list): The input list. 

641 

642 Returns: 

643 list: The output list. 

644 

645 Example: 

646 >>> l = [0, 2, 4, 5, 7] 

647 >>> add_one_by_one(l) 

648 [0, 2, 6, 11, 18] 

649 

650 """ 

651 new_l = [] 

652 cumsum = 0 

653 for elt in l: 

654 cumsum += elt 

655 new_l.append(cumsum) 

656 return new_l