Coverage for encodermap/misc/xarray.py: 7%

266 statements  

coverage.py v7.4.1, created at 2024-12-31 16:54 +0100

1# -*- coding: utf-8 -*- 

2# encodermap/misc/xarray.py 

3################################################################################ 

4# EncoderMap: A python library for dimensionality reduction. 

5# 

6# Copyright 2019-2024 University of Konstanz and the Authors 

7# 

8# Authors: 

9# Kevin Sawade, Tobias Lemke 

10# 

11# Encodermap is free software: you can redistribute it and/or modify 

12# it under the terms of the GNU Lesser General Public License as 

13# published by the Free Software Foundation, either version 2.1 

14# of the License, or (at your option) any later version. 

15# This package is distributed in the hope that it will be useful to other 

16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the 

17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 

18# See the GNU Lesser General Public License for more details. 

19# 

20# See <http://www.gnu.org/licenses/>. 

21################################################################################ 

22"""EncoderMap's xarray manipulation functions. 

23 

24EncoderMap uses xarray datasets to save CV data alongside trajectory data. 

25These functions implement creation of such xarray datasets. 

26 

27""" 

28################################################################################ 

29# Imports 

30################################################################################ 

31 

32 

33# Future Imports at the top 

34from __future__ import annotations 

35 

36# Standard Library Imports 

37import itertools 

38import warnings 

39 

40# Third Party Imports 

41import numpy as np 

42from optional_imports import _optional_import 

43 

44# Encodermap imports 

45from encodermap.misc.misc import FEATURE_NAMES 

46 

47 

48################################################################################ 

49# Optional Imports 

50################################################################################ 

51 

52 

53xr = _optional_import("xarray") 

54 

55 

56################################################################################ 

57# Typing 

58################################################################################ 

59 

60 

61# Standard Library Imports 

62from numbers import Number 

63from typing import TYPE_CHECKING, Optional, Union, overload 

64 

65 

66if TYPE_CHECKING: 

67 # Third Party Imports 

68 import xarray as xr 

69 

70 # Encodermap imports 

71 from encodermap.loading.features import AnyFeature 

72 from encodermap.loading.featurizer import Featurizer 

73 from encodermap.trajinfo.info_single import SingleTraj 

74 

75 

76################################################################################ 

77# Globals 

78################################################################################ 

79 

80 

81__all__: list[str] = [ 

82 "construct_xarray_from_numpy", 

83 "unpack_data_and_feature", 

84] 

85 

86 

87################################################################################ 

88# Functions 

89################################################################################ 

90 

91 

92def _get_indexes_from_feat(f: "AnyFeature", traj: "SingleTraj") -> np.ndarray: 

93 """Returns the indices of this feature. 

94 

95 This can be useful later on to redo some analysis or to use tensorflow 

96 to implement the geometric operations in the compute graph. 

97 

98 Note: 

99 Due to some PyEMMA legacy code, the indices are sometimes called indexes. 

100 Furthermore, some PyEMMA features give the indices different names, like 

101 'group_definitions'. Here, indices are integer values that align with 

102 the atomic coordinates of a trajectory or ensemble of trajectories with 

103 shared topology. 

104 

105 Args: 

106 f (AnyFeature): The feature to extract the indices from. This has to 

107 be a subclass of :class:`encodermap.loading.features.Feature`. 

108 traj (SingleTraj): This argument has to be provided for the RMSD 

109 features. We make it mandatory for the other feature types also 

110 to keep the input consistent. 

111 

112 Returns: 

113 np.ndarray: An integer array describing the atoms this feature uses 

114 to compute the collective variables. 

115 

116 """ 

117 # Local Folder Imports 

118 from ..loading.features import ( 

119 CustomFeature, 

120 GroupCOMFeature, 

121 MinRmsdFeature, 

122 ResidueCOMFeature, 

123 ) 

124 

125 try: 

126 return f.indexes 

127 except AttributeError as e: 

128 for key in f.__dir__(): 

129 if "inde" in key: 

130 return np.asarray(f.__dict__[key]) 

131 if isinstance(f, (GroupCOMFeature, ResidueCOMFeature)): 

132 return f.group_definitions 

133 if isinstance(f, MinRmsdFeature): 

134 if f.atom_indices is None: 

135 return np.arange(traj.n_atoms) 

136 else: 

137 return f.atom_indices 

138 if isinstance(f, CustomFeature): 

139 descr = f.describe() 

140 if any(["CustomFeature" in i for i in descr]): 

141 return np.asarray( 

142 [f"{d.split()[0]} INDEX {i}" for i, d in enumerate(descr)] 

143 ) 

144 else: 

145 raise Exception( 

146 f"Can't decide on indexes for this custom feature:, " 

147 f"{f.__class__.__name__=} {f.describe()[:2]=}" 

148 ) from e 

149 else: 

150 raise e 

151 

152 

153def _cast_to_int_maybe(a: np.ndarray) -> np.ndarray: 

154 """Casts a np.array to int, if possible. 

155 

156 Args: 

157 a (np.ndarray): The array. 

158 

159 Returns: 

160 np.ndarray: The output, which can be int. 
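
Example (a small illustration of the check implemented below; the output
dtypes assume a 64-bit float input array):

>>> import numpy as np
>>> _cast_to_int_maybe(np.array([1.0, 2.0])).dtype
dtype('int32')
>>> _cast_to_int_maybe(np.array([1.5, 2.0])).dtype
dtype('float64')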

161 """ 

162 if not np.any(np.mod(a, 1)): 

163 return a.astype(np.int32) 

164 return a 

165 

166 

167def construct_xarray_from_numpy( 

168 traj: SingleTraj, 

169 data: np.ndarray, 

170 name: str, 

171 deg: bool = False, 

172 labels: Optional[list[str]] = None, 

173 check_n_frames: bool = False, 

174) -> xr.DataArray: 

175 """Constructs a `xarray.DataArray` from a numpy array. 

176 

177 Three cases are recognized: 

178 * The input array in data has ndim == 2. This kind of feature/CV is a 

179 per-frame feature, like the membership to clusters. Every frame of 

180 every trajectory is assigned a single value (most often int values). 

181 * The input array in data has ndim == 3: This is also a per-frame 

182 feature/CV, but this time every frame is characterized by a series 

183 of values. These values can be dihedral angles in the backbone 

184 starting from the protein's N-terminus to the C-terminus, or 

185 pairwise distance features between certain atoms. The xarray 

186 DataArray constructed from this kind of data will have a label 

187 dimension that will either contain generic labels like 

188 'CUSTOM_FEATURE FEATURE 0' or labels defined by the featurizer, 

189 such as 'SIDECHAIN ANGLE CHI1 OF RESIDUE 1LYS'. 

190 * The input array in data has ndim == 4. Here, the same feature/CV is 

191 duplicated for the protein's atoms. Besides the XYZ coordinates of 

192 the atoms, no other CVs should fall into this case. The labels will be 

193 2-dimensional with 'POSITION OF ATOM H1 IN RESIDUE 1LYS' in 

194 dimension 0 and either 'X', 'Y' or 'Z' in dimension 1. 

195 

196 Args: 

197 traj (em.SingleTraj): The trajectory we want to create the 

198 `xarray.DataArray` for. 

199 data (np.ndarray): The numpy array we want to create the data from. 

200 Note that the data passed into this function should be expanded 

201 by `np.expand_dims(a, axis=0)`, so as to add a new axis to the complete 

202 data containing the trajectories of a trajectory ensemble. 

203 name (str): The name of the feature. This can be chosen freely. Names 

204 like 'central_angles', 'backbone_torsions' would make the most sense. 

205 deg (bool): Whether the provided data is in degrees or radians. 

206 labels (Optional[list]): If you have specific labels for your CVs in 

207 mind, you can overwrite the generic 'CUSTOM_FEATURE FEATURE 0' 

208 labels by providing a list for this argument. If None is provided, 

209 generic names will be given to the features. Defaults to None. 

210 check_n_frames (bool): Whether to check that the number of frames in 

211 the trajectory matches the length of the data in at least one 

212 dimension. Defaults to False. 

213 

214 Returns: 

215 xarray.DataArray: An `xarray.DataArray`. 

216 

217 Examples: 

218 >>> import encodermap as em 

219 >>> import numpy as np 

220 >>> from encodermap.misc.xarray import construct_xarray_from_numpy 

221 >>> # load file from RCSB and give it traj num to represent it in a 

222 >>> # potential trajectory ensemble 

223 >>> traj = em.load('https://files.rcsb.org/view/1GHC.pdb', traj_num=1) 

224 >>> # single trajectory needs to be expanded into 'trajectory' axis 

225 >>> z_coordinate = np.expand_dims(traj.xyz[:,:,0], 0) 

226 >>> da = construct_xarray_from_numpy(traj, z_coordinate, 'z_coordinate') 

227 >>> print(da.coords['Z_COORDINATE'].values[:2]) 

228 ['Z_COORDINATE FEATURE 0' 'Z_COORDINATE FEATURE 1'] 

229 >>> print(da.coords['traj_num'].values) 

230 [1] 

231 >>> print(da.attrs['time_units']) 

232 ps 
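
The ndim == 4 case works analogously. The following lines are a hedged
sketch (not part of the original example) that reuses the `traj` from
above and feeds it the cartesian coordinates:

>>> positions = np.expand_dims(traj.xyz, 0)
>>> da = construct_xarray_from_numpy(traj, positions, 'positions')
>>> print(da.dims)
('traj_num', 'frame_num', 'ATOM', 'COORDS')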

233 

234 """ 

235 if check_n_frames: 

236 if not any(s == traj.n_frames for s in data.shape): 

237 raise Exception( 

238 f"Can't add CV with name '{name}' to trajectory. The trajectory has " 

239 f"{traj.n_frames} frames. The data has a shape of {data.shape} " 

240 f"No dimension of data matches the frames of the trajectory." 

241 ) 

242 if traj.backend == "no_load": 

243 with_time = False 

244 else: 

245 with_time = True 

246 if data.ndim == 2: 

247 if labels is None: 

248 labels = [f"{name.upper()} FEATURE"] 

249 da, _ = make_frame_CV_dataarray( 

250 labels, traj, name, data, deg=deg, with_time=with_time 

251 ) 

252 elif data.ndim == 3: 

253 if labels is None: 

254 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-1])] 

255 da, _ = make_dataarray(labels, traj, name, data, deg=deg, with_time=with_time) 

256 elif data.ndim == 4: 

257 if labels is None: 

258 labels = [f"{name.upper()} FEATURE {i}" for i in range(data.shape[-2])] 

259 da = make_position_dataarray_from_numpy( 

260 labels, 

261 traj, 

262 name, 

263 data, 

264 deg=deg, 

265 with_time=with_time, 

266 ) 

267 else: 

268 raise Exception( 

269 f"The provided data has a dimensionality of {data.ndim}, but only 2, 3 and 4 are supported." 

270 ) 

271 return da 

272 

273 

274def unpack_data_and_feature( 

275 feat: Featurizer, 

276 traj: SingleTraj, 

277 input_data: np.ndarray, 

278) -> xr.Dataset: 

279 """Makes a `xarray.Dataset` from data and a featurizer. 

280 

281 Usually, if you add multiple features to a featurizer, they are 

282 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

283 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

284 If you then also add the end-to-end distance as a feature, the data returned by 

285 the featurizer will have the shape (20, 7). This function unstacks the data again, 

286 so that the resulting Dataset contains one appropriately named DataArray 

287 per feature. 

288 

289 Args: 

290 feat (encodermap.loading.Featurizer): An instance of the currently used `encodermap.loading.Featurizer`. 

291 traj (encodermap.trajinfo.SingleTraj): An instance of `encodermap.SingleTraj`, that the data 

292 in `input_data` was computed from. 

293 input_data (np.ndarray): The data, as returned from PyEMMA. 

294 

295 Returns: 

296 xarray.Dataset: An `xarray.Dataset` with all features in a nice format. 
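
Example (a hypothetical sketch, skipped in doctests; `feat`, `traj` and
`input_data` are assumed to already be set up as described above):

>>> ds = unpack_data_and_feature(feat, traj, input_data)  # doctest: +SKIP
>>> # one DataArray per feature, plus the per-feature index arrays
>>> len(ds.data_vars) >= len(feat.features)  # doctest: +SKIP
True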

297 

298 """ 

299 # Local Folder Imports 

300 from ..trajinfo.trajinfo_utils import trajs_combine_attrs 

301 

302 # this needs to be done, because pyemma concatenates the data 

303 # along the feature axis 

304 indices = get_indices_by_feature_dim(feat, traj, input_data.shape) 

305 

306 DAs = {} 

307 indexes = {} 

308 for f, ind in zip(feat.features, indices): 

309 data = input_data[:, ind] 

310 if data.shape[-1] == 1 and f._dim != 1: 

311 data = data.squeeze(-1) 

312 

313 # decide on the name 

314 try: 

315 name = FEATURE_NAMES[f.name] 

316 except (KeyError, AttributeError): 

317 if hasattr(f, "name"): 

318 if isinstance(f.name, str): 

319 name = f.name 

320 if "mdtraj.trajectory" in f.name.lower(): 

321 f.name = f.__class__.__name__ 

322 name = f.__class__.__name__ 

323 else: 

324 name = f.__class__.__name__ 

325 f.name = name 

326 else: 

327 name = f.__class__.__name__ 

328 if name == "CustomFeature": 

329 name = f.describe()[0].split()[0] 

330 f.name = name 

331 assert data.shape[1] == f._dim 

332 

333 if data.shape[0] != len(traj): 

334 if traj.index == (None,): 

335 raise Exception( 

336 "Shape of provided data does not fit traj. Traj " 

337 f"has {traj.n_frames=} {len(traj)=}. Data has shape {data.shape}" 

338 ) 

339 else: 

340 data = data[traj.index].squeeze(0) 

341 assert data.shape[0] == len(traj), f"{data.shape=}, {len(traj)=}" 

342 if name in DAs: 

343 name = ( 

344 name 

345 + f"_{len(list(filter(lambda x: True if name in x else False, list(DAs.keys()))))}" 

346 ) 

347 if hasattr(f, "deg"): 

348 deg = f.deg 

349 else: 

350 deg = None 

351 

352 if ( 

353 f.name in ["AllCartesians", "CentralCartesians", "SideChainCartesians"] 

354 or f.atom_feature 

355 ): 

356 data = data.reshape(len(traj), -1, 3) 

357 data = np.expand_dims(data, axis=0) 

358 if hasattr(feat, "indices_by_top"): 

359 f.indexes = feat.indices_by_top[traj.top][f.name] 

360 DAs[name], indexes[name + "_feature_indices"] = make_position_dataarray( 

361 f.describe(), traj, name, data, deg=deg, feat=f 

362 ) 

363 else: 

364 data = np.expand_dims(data, axis=0) 

365 if f._dim == 1: 

366 DAs[name], indexes[name + "_feature_indices"] = make_frame_CV_dataarray( 

367 f.describe(), 

368 traj, 

369 name, 

370 data, 

371 deg=deg, 

372 feat=f, 

373 ) 

374 else: 

375 if hasattr(feat, "indices_by_top"): 

376 f.indexes = feat.indices_by_top[traj.top][f.name] 

377 DAs[name], indexes[name + "_feature_indices"] = make_dataarray( 

378 f.describe(), traj, name, data, deg=deg, feat=f 

379 ) 

380 if indexes[name + "_feature_indices"] is None: 

381 del indexes[name + "_feature_indices"] 

382 DAs_and_indxes = DAs | indexes 

383 attrs = [] 

384 for i in DAs_and_indxes.values(): 

385 attrs.append(i.attrs) 

386 try: 

387 ds = xr.Dataset(DAs_and_indxes, attrs=trajs_combine_attrs(attrs)) 

388 except ValueError as e: 

389 raise Exception(f"{DAs_and_indxes.keys()=}") from e 

390 return ds 

391 

392 

393def make_frame_CV_dataarray( 

394 labels: list[str], 

395 traj: SingleTraj, 

396 name: str, 

397 data: np.ndarray, 

398 deg: Union[None, bool], 

399 with_time: bool = True, 

400 frame_indices: Optional[np.ndarray] = None, 

401 labels2: Optional[list[str]] = None, 

402 feat: Optional["AnyFeature"] = None, 

403) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

404 """Make a DataArray from a frame CV feature. 

405 

406 A normal feature yields multiple values per trajectory frame (e.g. the 

407 backbone dihedral angles give 2 * n_residues features per frame). A frame 

408 CV feature is just a single value per frame. Examples are: 

409 * The distance between a binding pocket and a ligand. 

410 * The volume of the unitcell. 

411 * Whether a molecule is in state 1 or 2, which could be described as a binary frame CV feature. 

412 

413 This method has additional logic. If `data` has a shape of (1, n_frames, 1) it 

414 is left as is. If it has a shape of (1, n_frames), it will be expanded on the -1 axis to 

415 shape (1, n_frames, 1). 

416 

417 Note: 

418 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

419 

420 Args: 

421 labels (list[str]): The labels that specify the `CV_num` dimension. This 

422 requires the expression `len(labels) == data.shape[2]` to be True. If you 

423 build the DataArray from a feature, the `labels` argument usually will 

424 be `feature.describe()`. 

425 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

426 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

427 an xarray.DataArray always represents one feature of one trajectory. 

428 A trajectory can have multiple features, which is represented as an 

429 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

430 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

431 dataset is created ad hoc, by merging the datasets of the trajectories along 

432 the `traj_num` axis. 

433 name (str): The name of the feature. This name will be used to group similar 

434 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

435 construct this DataArray from a feature, you can either use `feature.name`, 

436 or `feature.__class__.__name__`. 

437 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

438 deg (Union[None, bool]): Whether the provided data is in deg or radians. 

439 If None is provided, it will not be included in the attrs. 

440 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

441 the frames to the `frame_num` axis. Can be either True, or False, or a 

442 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

443 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

444 This can come in handy if the trajectory is loaded out of memory and 

445 distributed, in which case MDAnalysis is used to load the data. 

446 MDTraj never loads the trajectory coordinates, so the attribute 

447 `_original_frame_indices` in the trajectory will be empty. However, as 

448 the distributed task has already assigned frames to the workers, we have 

449 this number and just need to use it. If set to None, `_original_frame_indices` 

450 will be used. Otherwise, the `np.ndarray` provided here will be used. 

451 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

452 This can be especially useful for trajectory ensembles, which might differ 

453 in a lot of places (i.e., single amino acid exchange). If the labels are 

454 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

455 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

456 be done. Defaults to None. 

457 

458 Returns: 

459 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting DataArray and the accompanying indices DataArray, or (DataArray, None) if no feature was provided. 
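
Example (a hedged sketch, skipped in doctests; `traj` is assumed to be a
loaded `SingleTraj` and the CV is a per-frame scalar such as a cluster
membership):

>>> import numpy as np  # doctest: +SKIP
>>> membership = np.zeros((1, traj.n_frames))  # doctest: +SKIP
>>> da, ind = make_frame_CV_dataarray(
...     ["MEMBERSHIP"], traj, "membership", membership, deg=None
... )  # doctest: +SKIP
>>> da.ndim  # the (1, n_frames) input was expanded to (1, n_frames, 1)  # doctest: +SKIP
3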

460 

461 """ 

462 if data.ndim == 3: 

463 pass 

464 elif data.ndim == 2: 

465 data = np.expand_dims(data, -1) 

466 # data = _cast_to_int_maybe(data) 

467 

468 return make_dataarray( 

469 labels, traj, name, data, deg, with_time, frame_indices, labels2, feat 

470 ) 

471 

472 

473def make_position_dataarray_from_numpy( 

474 atoms, traj, name, data, deg=None, with_time=True 

475): 

476 """Same as :func:`make_dataarray`, but with higher feature dimension.""" 

477 attrs = { 

478 "length_units": "nm", 

479 "time_units": "ps", 

480 "full_path": traj.traj_file, 

481 "topology_file": traj.top_file, 

482 "feature_axis": "ATOM", 

483 } 

484 if deg is not None: 

485 if deg: 

486 attrs["angle_units"] = "deg" 

487 else: 

488 attrs["angle_units"] = "rad" 

489 frame_indices = traj.id 

490 if frame_indices.ndim > 1: 

491 frame_indices = frame_indices[:, 1] 

492 da = xr.DataArray( 

493 data, 

494 coords={ 

495 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

496 "traj_name": ("traj_num", np.asarray([traj.basename])), 

497 "frame_num": ("frame_num", frame_indices), 

498 "ATOM": ("ATOM", np.asarray(atoms)), 

499 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]), 

500 }, 

501 dims=["traj_num", "frame_num", "ATOM", "COORDS"], 

502 name=name, 

503 attrs=attrs, 

504 ) 

505 if with_time: 

506 da = da.assign_coords(time=("frame_num", traj.time)) 

507 return da 

508 

509 

510@overload 

511def make_position_dataarray( 

512 labels: list[str], 

513 traj: SingleTraj, 

514 name: str, 

515 data: np.ndarray, 

516 deg: Union[None, bool] = None, 

517 with_time: bool = True, 

518 frame_indices: Optional[np.ndarray] = None, 

519 labels2: Optional[list[str]] = None, 

520 feat: Optional["AnyFeature"] = None, 

521) -> tuple[xr.DataArray, None]: ... 

522 

523 

524@overload 

525def make_position_dataarray( 

526 labels: list[str], 

527 traj: SingleTraj, 

528 name: str, 

529 data: np.ndarray, 

530 deg: Union[None, bool] = None, 

531 with_time: bool = True, 

532 frame_indices: Optional[np.ndarray] = None, 

533 labels2: Optional[list[str]] = None, 

534 feat: Optional["AnyFeature"] = "AnyFeature", 

535) -> tuple[xr.DataArray, xr.DataArray]: ... 

536 

537 

538def make_position_dataarray( 

539 labels: list[str], 

540 traj: SingleTraj, 

541 name: str, 

542 data: np.ndarray, 

543 deg: Union[None, bool] = None, 

544 with_time: bool = True, 

545 frame_indices: Optional[np.ndarray] = None, 

546 labels2: Optional[list[str]] = None, 

547 feat: Optional["AnyFeature"] = None, 

548) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

549 """Creates DataArray belonging to cartesian positions. 

550 

551 Similar to `make_dataarray`, but the shapes are even larger. As every atom 

552 contributes 3 coordinates (x, y, z) to the data, the shape of the returned 

553 DataArray is (1, no_of_frames, no_of_atoms_considered, 3). 

554 

555 Note: 

556 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

557 

558 Args: 

559 labels (list[str]): The labels that specify the `CV_num` dimension. This 

560 requires the expression `len(labels) == data.shape[2]` to be True. If you 

561 build the DataArray from a feature, the `labels` argument usually will 

562 be `feature.describe()`. 

563 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

564 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

565 an xarray.DataArray always represents one feature of one trajectory. 

566 A trajectory can have multiple features, which is represented as an 

567 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

568 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

569 dataset is created ad hoc, by merging the datasets of the trajectories along 

570 the `traj_num` axis. 

571 name (str): The name of the feature. This name will be used to group similar 

572 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

573 construct this DataArray from a feature, you can either use `feature.name`, 

574 or `feature.__class__.__name__`. 

575 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

576 deg (bool): Whether the provided data is in deg or radians. 

577 If None is provided, it will not be included in the attrs. Defaults to None. 

578 with_time (Union[bool, np.ndarray]): Whether to add the time of 

579 the frames to the `frame_num` axis. Can be either True, or False, or a 

580 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

581 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

582 This can come in handy if the trajectory is loaded out of memory and 

583 distributed, in which case MDAnalysis is used to load the data. 

584 MDTraj never loads the trajectory coordinates, so the attribute 

585 `_original_frame_indices` in the trajectory will be empty. However, as 

586 the distributed task has already assigned frames to the workers, we have 

587 this number and just need to use it. If set to None, `_original_frame_indices` 

588 will be used. Otherwise, the `np.ndarray` provided here will be used. 

589 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

590 This can be especially useful for trajectory ensembles, which might differ 

591 in a lot of places (i.e., single amino acid exchange). If the labels are 

592 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

593 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

594 be done. Defaults to None. 

595 

596 Returns: 

597 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting dataarrays, 

598 as a tuple of (data, indices) or (data, None), if indices were not 

599 requested. 
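
Example (a small sketch of how the per-coordinate labels from
`feature.describe()` are condensed to one label per atom; the label text
is taken from the module docstrings above and only serves as an
illustration):

>>> labels = ["POSITION OF ATOM H1 IN RESIDUE 1LYS"] * 3
>>> [l[11:].lstrip(" ") for l in labels[::3]]
['ATOM H1 IN RESIDUE 1LYS']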

600 

601 """ 

602 atom_axis: str = "ATOM" 

603 if feat is not None: 

604 if feat.__class__.__name__ == "SideChainCartesians": 

605 atom_axis = "SIDEATOM" 

606 if feat.__class__.__name__ == "AllCartesians": 

607 atom_axis = "ALLATOM" 

608 attrs = { 

609 "length_units": "nm", 

610 "time_units": "ps", 

611 "full_path": traj.traj_file, 

612 "topology_file": traj.top_file, 

613 "feature_axis": atom_axis, 

614 } 

615 if feat is not None: 

616 indices = _get_indexes_from_feat(feat, traj) 

617 else: 

618 indices = None 

619 if deg is not None: 

620 if deg: 

621 attrs["angle_units"] = "deg" 

622 else: 

623 attrs["angle_units"] = "rad" 

624 if frame_indices is None: 

625 frame_indices = traj.id 

626 if frame_indices.ndim > 1: 

627 frame_indices = frame_indices[:, 1] 

628 

629 if labels2 is not None: 

630 labels = labels2 

631 else: 

632 labels = [_[11:].lstrip(" ") for _ in labels[::3]] 

633 

634 coords = { 

635 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

636 "traj_name": ("traj_num", np.asarray([traj.basename])), 

637 "frame_num": ("frame_num", frame_indices), 

638 atom_axis: (atom_axis, np.asarray(labels)), 

639 "COORDS": np.array(["POSITION X", "POSITION Y", "POSITION Z"]), 

640 } 

641 da = xr.DataArray( 

642 data, 

643 coords=coords, 

644 dims=["traj_num", "frame_num", atom_axis, "COORDS"], 

645 name=name.upper(), 

646 attrs=attrs, 

647 ) 

648 if indices is not None: 

649 if len(indices) // 3 == data.shape[-2] and name.startswith("CustomFeature"): 

650 warnings.warn( 

651 f"Can't find good labels for the feature {name}. The " 

652 f"data from the feature's transform has shape {data[0].shape}. " 

653 f"The indices have indices length {len(indices)}. I will not include the " 

654 f"indices of this feature in the dataset. However, the output data " 

655 f"will be there." 

656 ) 

657 indices = None 

658 elif len(indices) != data.shape[-2]: 

659 indices_copy = indices.copy() 

660 indices = np.full((len(labels),), np.nan, float) 

661 indices[: len(indices_copy)] = indices_copy 

662 

663 if indices is not None: 

664 coords = { 

665 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

666 "traj_name": ("traj_num", np.asarray([traj.basename])), 

667 atom_axis: (atom_axis, np.asarray(labels)), 

668 } 

669 ind_da = xr.DataArray( 

670 np.expand_dims(indices, 0), 

671 coords=coords, 

672 dims=["traj_num", atom_axis], 

673 name=name.upper(), 

674 attrs=attrs | {"feature_axis": atom_axis}, 

675 ) 

676 else: 

677 ind_da = None 

678 else: 

679 ind_da = None 

680 if isinstance(with_time, bool): 

681 if with_time: 

682 da = da.assign_coords(time=("frame_num", traj.time)) 

683 else: 

684 da = da.assign_coords(time=("frame_num", with_time)) 

685 return da, ind_da 

686 

687 

688@overload 

689def make_dataarray( 

690 labels: list[str], 

691 traj: SingleTraj, 

692 name: str, 

693 data: np.ndarray, 

694 deg: Union[None, bool], 

695 with_time: bool = True, 

696 frame_indices: Optional[np.ndarray] = None, 

697 labels2: Optional[list[str]] = None, 

698 feat: Optional["AnyFeature"] = None, 

699) -> tuple[xr.DataArray, None]: ... 

700 

701 

702@overload 

703def make_dataarray( 

704 labels: list[str], 

705 traj: SingleTraj, 

706 name: str, 

707 data: np.ndarray, 

708 deg: Union[None, bool], 

709 with_time: bool = True, 

710 frame_indices: Optional[np.ndarray] = None, 

711 labels2: Optional[list[str]] = None, 

712 feat: Optional["AnyFeature"] = "AnyFeature", 

713) -> tuple[xr.DataArray, xr.DataArray]: ... 

714 

715 

716def make_dataarray( 

717 labels: list[str], 

718 traj: SingleTraj, 

719 name: str, 

720 data: np.ndarray, 

721 deg: Union[None, bool], 

722 with_time: bool = True, 

723 frame_indices: Optional[np.ndarray] = None, 

724 labels2: Optional[list[str]] = None, 

725 feat: Optional["AnyFeature"] = None, 

726) -> tuple[xr.DataArray, Union[None, xr.DataArray]]: 

727 """Creates a DataArray belonging to a feature. 

728 

729 The shapes are a bit different from what most people might be used to. As 

730 EncoderMap was meant to work with ensembles of trajectories, the data is usually 

731 shaped as (traj_num, frame_num, CV_num), or even (traj_num, frame_num, atom_num, 3). 

732 

733 The `xarray.DataArray` that is returned by this function reflects this. As a 

734 DataArray is attributed to a single trajectory, the first shape will always be 1. 

735 So for a collective variable that describes n features, the shape of the returned 

736 DataArray will be (1, n_frames, n). When combining multiple trajectories, the first number 

737 can increase. 

738 

739 Note: 

740 Please make sure that the input data conforms to the nm, ps, rad unit conventions. 

741 

742 Args: 

743 labels (list[str]): The labels that specify the `CV_num` dimension. This 

744 requires the expression `len(labels) == data.shape[2]` to be True. If you 

745 build the DataArray from a feature, the `labels` argument usually will 

746 be `feature.describe()`. 

747 traj (encodermap.SingleTraj): An `encodermap.SingleTraj` trajectory. 

748 Why `SingleTraj` and not `TrajEnsemble`? That is because in EncoderMap, 

749 an xarray.DataArray always represents one feature of one trajectory. 

750 A trajectory can have multiple features, which is represented as an 

751 `xarray.Dataset`. For that, the function `unpack_data_and_feature` is used. 

752 The `TrajEnsemble` trajectory does not even have its own `xarray.Dataset`. This 

753 dataset is created ad hoc, by merging the datasets of the trajectories along 

754 the `traj_num` axis. 

755 name (str): The name of the feature. This name will be used to group similar 

756 features in the large `xarray.Dataset`s of trajectory ensembles. If you 

757 construct this DataArray from a feature, you can either use `feature.name`, 

758 or `feature.__class__.__name__`. 

759 data (Union[np.ndarray, dask.array]): The data to fill the DataArray with. 

760 deg (Optional[bool]): Whether the provided data is in degrees or radians. If None 

761 is provided, the angle_units will not appear in the attributes. 

762 Defaults to None. 

763 with_time (Optional[Union[bool, np.ndarray]]): Whether to add the time of 

764 the frames to the `frame_num` axis. Can be either True, or False, or a 

765 `np.ndarray`, in which case the data in this array will be used. Defaults to True. 

766 frame_indices (Optional[np.ndarray]): The indices of the trajectory on disk. 

767 This can come in handy if the trajectory is loaded out of memory and 

768 distributed, in which case MDAnalysis is used to load the data. 

769 MDTraj never loads the trajectory coordinates, so the attribute 

770 `_original_frame_indices` in the trajectory will be empty. However, as 

771 the distributed task has already assigned frames to the workers, we have 

772 this number and just need to use it. If set to None, `_original_frame_indices` 

773 will be used. Otherwise, the `np.ndarray` provided here will be used. 

774 labels2 (Optional[list[str]]): Optional labels to supersede the labels in `labels`. 

775 This can be especially useful for trajectory ensembles, which might differ 

776 in a lot of places (i.e., single amino acid exchange). If the labels are 

777 too strict (LYS1 ATOM1 C), the data of two slightly different proteins 

778 cannot be concatenated. If the labels are generic (AA1 ATOM1), this can 

779 be done. Defaults to None. 

780 

781 Returns: 

782 tuple[xr.DataArray, Union[None, xr.DataArray]]: The resulting DataArray and the accompanying indices DataArray, or (DataArray, None) if no feature was provided. 
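
Example (a hedged sketch, skipped in doctests; `traj` is assumed to be a
loaded `SingleTraj` and the data holds three values per frame):

>>> import numpy as np  # doctest: +SKIP
>>> data = np.expand_dims(np.random.random((traj.n_frames, 3)), 0)  # doctest: +SKIP
>>> da, ind = make_dataarray(["a", "b", "c"], traj, "my_cv", data, deg=None)  # doctest: +SKIP
>>> da.dims  # doctest: +SKIP
('traj_num', 'frame_num', 'MY_CV')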

783 

784 """ 

785 if feat is not None: 

786 indices = _get_indexes_from_feat(feat, traj) 

787 else: 

788 indices = None 

789 attrs = { 

790 "length_units": "nm", 

791 "time_units": "ps", 

792 "full_path": traj.traj_file, 

793 "topology_file": traj.top_file, 

794 "feature_axis": name.upper(), 

795 } 

796 if deg is not None: 

797 if deg: 

798 attrs["angle_units"] = "deg" 

799 else: 

800 attrs["angle_units"] = "rad" 

801 

802 if frame_indices is None: 

803 frame_indices = traj.id 

804 if frame_indices.ndim > 1: 

805 frame_indices = frame_indices[:, 1] 

806 

807 if labels2 is not None: 

808 labels = labels2 

809 

810 if callable(labels): 

811 labels = [l for l in labels(traj.top)] 

812 else: 

813 if len(labels) > data.shape[-1]: 

814 warnings.warn( 

815 f"Provided labels {labels[:5]} are greater {len(labels)} than the " 

816 f"data {data.shape[-1]}. I will use integers to label the feature axis." 

817 ) 

818 labels = np.arange(data.shape[-1]) 

819 

820 coords = { 

821 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

822 "traj_name": ("traj_num", np.asarray([traj.basename])), 

823 "frame_num": ("frame_num", frame_indices), 

824 name.upper(): np.asarray(labels), 

825 } 

826 

827 da = xr.DataArray( 

828 data, 

829 coords=coords, 

830 dims=["traj_num", "frame_num", name.upper()], 

831 name=name, 

832 attrs=attrs, 

833 ) 

834 

835 ind_da = xr.DataArray() 

836 if indices is not None: 

837 index_labels = np.asarray(labels) 

838 coords = { 

839 "traj_num": ("traj_num", np.asarray([traj.traj_num])), 

840 "traj_name": ("traj_num", np.asarray([traj.basename])), 

841 name.upper(): index_labels, 

842 } 

843 try: 

844 indices = np.asarray(indices) 

845 inhomogeneous_shape = False 

846 except ValueError as e: 

847 if "inhomogeneous" in str(e): 

848 inhomogeneous_shape = True 

849 else: 

850 raise e 

851 # if the shape of indices is inhomogeneous, we can't put them into a DataArray 

852 if inhomogeneous_shape or any( 

853 [i in feat.__class__.__name__.lower() for i in ["groupcom", "residuecom"]] 

854 ): 

855 warnings.warn( 

856 f"The feature {name} will not produce a '{name}_feature_indices' " 

857 f"xr.DataArray. Its indices contain either inhomogeneous shapes " 

858 f"({indices}) or it is a center-of-mass (COM) feature. These " 

859 f"features are excluded from providing indices. " 

860 f"I will put a string representation of these " 

861 f"indices into the DataArray's `attrs` dictionary. Thus, it " 

862 f"can still be saved to disk and viewed, but it will be " 

863 f"a string and not a sequence of integers." 

864 ) 

865 da.attrs |= {f"{name}_feature_indices": str(indices)} 

866 # special case: PyEMMA feature with cartesians 

867 elif len(indices) == data.shape[-1] // 3 and "selection" in feat.name.lower(): 

868 indices = np.vstack([indices for i in range(3)]).flatten(order="F") 

869 ind_da = xr.DataArray( 

870 np.expand_dims(np.expand_dims(indices, 0), -1), 

871 coords=coords 

872 | { 

873 "ATOM_NO": [0], 

874 }, 

875 dims=["traj_num", name.upper(), "ATOM_NO"], 

876 name=name.upper(), 

877 attrs=attrs | {"feature_axis": name.upper()}, 

878 ) 

879 # special case: RMSD features 

880 # rmsd features provide their alignment atoms 

881 elif "rmsd" in feat.__class__.__name__.lower(): 

882 ind_da = xr.DataArray( 

883 np.expand_dims(np.expand_dims(indices, 0), 0), 

884 coords=coords 

885 | { 

886 "RMSD_ATOM_NO": indices, 

887 }, 

888 dims=["traj_num", name.upper(), "RMSD_ATOM_NO"], 

889 name=name.upper(), 

890 attrs=attrs | {"feature_axis": name.upper()}, 

891 ) 

892 # special case: Align feature 

893 elif "align" in feat.__class__.__name__.lower(): 

894 indices = np.vstack([indices for i in range(3)]).flatten(order="F") 

895 ind_da = xr.DataArray( 

896 np.expand_dims(np.expand_dims(indices, 0), -1), 

897 coords=coords 

898 | { 

899 "ALIGN_ATOM_NO": [0], 

900 }, 

901 dims=["traj_num", name.upper(), "ALIGN_ATOM_NO"], 

902 name=name.upper(), 

903 attrs=attrs | {"feature_axis": name.upper()}, 

904 ) 

905 elif feat.__class__.__name__ == "ResidueMinDistanceFeature": 

906 # indices = np.hstack(indices) 

907 # data = np.vstack([indices for i in range(3)]).flatten(order="F") 

908 ind_da = xr.DataArray( 

909 np.expand_dims(indices, 0), 

910 coords=coords | {"RES_NO": [0, 1]}, 

911 dims=["traj_num", name.upper(), "RES_NO"], 

912 name=name.upper(), 

913 attrs=attrs | {"feature_axis": name.upper()}, 

914 ) 

915 elif len(indices) != data.shape[-1]: 

916 if all(["COS" in i for i in index_labels[::2]]) and all( 

917 ["SIN" in i for i in index_labels[1::2]] 

918 ): 

919 c = np.empty((indices.shape[0] * 2, 3), dtype=int) 

920 c[0::2] = indices 

921 c[1::2] = indices 

922 indices = np.expand_dims(c, 0) 

923 ind_da = xr.DataArray( 

924 indices, 

925 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])}, 

926 dims=["traj_num", name.upper(), "ATOM_NO"], 

927 name=name.upper(), 

928 attrs=attrs | {"feature_axis": name.upper()}, 

929 ) 

930 else: 

931 raise Exception( 

932 f"{feat=}\n\n" 

933 f"{feat.name=}\n\n" 

934 f"{inhomogeneous_shape=}\n\n" 

935 f"{len(indices)=}\n\n" 

936 f"{indices.shape=}\n\n" 

937 f"{indices.ndim=}\n\n" 

938 f"{data.shape=}\n\n" 

939 f"{indices=}\n\n" 

940 f"{data=}\n\n" 

941 f"{index_labels=}\n\n" 

942 f"{len(indices) == data.shape[-1] // 3=}\n\n" 

943 f"{'selection' in feat.name.lower()=}" 

944 ) 

945 # regular case indices and data are aligned 

946 else: 

947 indices = np.expand_dims(indices, 0) 

948 if indices.ndim != data.ndim: 

949 indices = np.expand_dims(indices, -1) 

950 ind_da = xr.DataArray( 

951 indices, 

952 coords=coords | {"ATOM_NO": np.arange(indices.shape[-1])}, 

953 dims=["traj_num", name.upper(), "ATOM_NO"], 

954 name=name.upper(), 

955 attrs=attrs | {"feature_axis": name.upper()}, 

956 ) 

957 if ind_da.size > 1: 

958 assert len(ind_da.coords[name.upper()]) == len(da.coords[name.upper()]) 

959 

960 if isinstance(with_time, bool): 

961 if with_time: 

962 da = da.assign_coords(time=("frame_num", traj.time)) 

963 else: 

964 da = da.assign_coords(time=("frame_num", with_time)) 

965 if ind_da.size == 1: 

966 ind_da = None 

967 return da, ind_da 

968 

969 

970def get_indices_by_feature_dim(feat, traj, input_data_shape): 

971 """Unpacks the concatenated features returned by PyEMMA. 

972 

973 Usually, if you add multiple features to a PyEMMA featurizer, they are 

974 stacked along the feature axis. Let's say, you have a trajectory with 20 frames 

975 and 3 residues. If you add the Ramachandran angles, you get 6 features (3xphi, 3xpsi). 

976 If you then also add the end-to-end distance as a feature, the data returned by 

977 PyEMMA will have the shape (20, 7). This function returns the correct indices, 

978 so that iteration of `zip(Featurizer.active_features, indices)` will yield the 

979 correct results. 

980 

981 Args: 

982 feat (em.Featurizer): Instance of `encodermap.Featurizer`. The featurizer knows 

983 how many entries every feature takes up in the feature space. 

984 traj (encodermap.SingleTraj): An instance of `SingleTraj`. This is needed 

985 for a single purpose: PyEMMA returns malformed data for .pdb files that 

986 are loaded from the Protein Data Bank 

987 (i.e. `traj = em.SingleTraj('https://files.rcsb.org/view/1GHC.pdb')`). 

988 This is prevented by providing the traj and doing a small check. 

989 input_data_shape (tuple): The shape of the data that should conform to the slicing. 

990 Also needed for some checks. 

991 

992 Returns: 

993 list[np.ndarray]: List of `np.ndarray`s that correctly slice the data. 
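
Example (a minimal illustration of the index arithmetic used below,
assuming two features with dimensions 6 and 1, as in the docstring above):

>>> import numpy as np
>>> bounds = [0] + add_one_by_one([6, 1])
>>> bounds
[0, 6, 7]
>>> [np.arange(i, j) for i, j in zip(bounds[:-1], bounds[1:])]
[array([0, 1, 2, 3, 4, 5]), array([6])]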

994 

995 """ 

996 if len(feat.features) > 1: 

997 indices = [0] + add_one_by_one([f._dim for f in feat.features]) 

998 # Since deprecating PyEMMA, we can remove this part 

999 # if traj.extension == ".pdb" and _validate_uri(traj.traj_file): 

1000 # # for internet pdb files we need to slice the data. this negates the next assert but meh. 

1001 # raise Exception( 

1002 # "For some reason `pyemma.coordinates.source` does not like working with internet pdb files." 

1003 # ) 

1004 # else: 

1005 if len(input_data_shape) <= 1: 

1006 raise Exception(f"{feat=}, {traj=}, {input_data_shape=}") 

1007 if not indices[-1] == input_data_shape[1]: 

1008 raise Exception( 

1009 f"The indices in the features do not match the input data shape: " 

1010 f"{indices[-1]=} {input_data_shape=} " 

1011 f"{sum([len(f.describe()) for f in feat.features])=}\n\n" 

1012 f"{indices=}\n\n{input_data_shape=}" 

1013 ) 

1014 indices = [np.arange(i, j) for i, j in zip(indices[:-1], indices[1:])] 

1015 else: 

1016 indices = [np.arange(0, feat.features[0]._dim)] 

1017 return indices 

1018 

1019 

1020def add_one_by_one(l: list[Number]) -> list[Number]: 

1021 """Creates a new list from l with elements added one after another. 

1022 

1023 Args: 

1024 l (list): The input list. 

1025 

1026 Returns: 

1027 list: The output list. 

1028 

1029 Example: 

1030 >>> l = [0, 2, 4, 5, 7] 

1031 >>> add_one_by_one(l) 

1032 [0, 2, 6, 11, 18] 

1033 

1034 """ 

1035 new_l = [] 

1036 cumsum = 0 

1037 for elt in l: 

1038 cumsum += elt 

1039 new_l.append(cumsum) 

1040 return new_l