
# -*- coding: utf-8 -*-
# encodermap/trajinfo/repository.py
################################################################################
# Encodermap: A python library for dimensionality reduction.
#
# Copyright 2019-2022 University of Konstanz and the Authors
#
# Authors:
# Kevin Sawade
#
# Encodermap is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
# This package is distributed in the hope that it will be useful to other
# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
################################################################################

22"""Python endpoint to download files from a webserver on the fly. 

23 

24Idea from Christoph Wehmeyer: https://github.com/markovmodel/mdshare 

25I liked his idea of the possibility to distribute MD data via a simple python 

26backend, but wanted to make it smaller. A simple `fetch()` should suffice. Also 

27I liked the yaml syntax and wanted to use it. 

28 

29References: 

30 @article{wehmeyer2018introduction, 

31 title={Introduction to Markov state modeling with the PyEMMA software [Article v1. 0]}, 

32 author={Wehmeyer, Christoph and Scherer, Martin K and Hempel, Tim and Husic, Brooke E and Olsson, Simon and No{\'e}, Frank}, 

33 journal={Living Journal of Computational Molecular Science}, 

34 volume={1}, 

35 number={1}, 

36 pages={5965}, 

37 year={2018} 

38 } 

39 
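Examples:
    A minimal, hedged sketch of the intended usage (the filename is taken
    from the `Repository` class docstring below):

    >>> import encodermap as em
    >>> repo = em.Repository()
    >>> files, directory = repo.fetch('PFFP_single.xtc')  # doctest: +SKIP
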

40""" 



##############################################################################
# Imports
##############################################################################


import errno
import fnmatch
import glob
import inspect
import os
import re
import sys
from itertools import chain
from operator import methodcaller

import requests

from .._optional_imports import _optional_import
from .hash_files import hash_files
from .info_all import TrajEnsemble
from .info_single import SingleTraj


##############################################################################
# Optional Imports
##############################################################################


yaml_load = _optional_import("yaml", "load")
yaml_dump = _optional_import("yaml", "dump")
Loader = _optional_import("yaml", "CLoader")
Dumper = _optional_import("yaml", "CDumper")
download_wrapper = _optional_import("mdshare", "utils.download_wrapper")
tarfile = _optional_import("tarfile")


##############################################################################
# Globals
##############################################################################


__all__ = ["Repository"]


##############################################################################
# Functions
##############################################################################


def gen_dict_extract(key, var):
    """Finds the values of a key anywhere in a nested dict.

    Adapted from hexerei software's solution for nested dicts.

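    Example:
        A minimal sketch with illustrative data (the keys are hypothetical,
        not taken from a repository catalogue):

        >>> d = {"traj": {"md5": "abc"}, "runs": [{"md5": "def"}]}
        >>> list(gen_dict_extract("md5", d))
        ['abc', 'def']
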

96 """ 

    # `iteritems` was removed in Python 3; use `items` so the walk actually runs
    if hasattr(var, "items"):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, dict):
                for result in gen_dict_extract(key, v):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    for result in gen_dict_extract(key, d):
                        yield result



def find_mime_type(d, mime_type):
    """Breadth-first search for a value in a nested dict, returning the key chain.

    Thanks to KobeJohn:
    https://stackoverflow.com/questions/22162321/search-for-a-value-in-a-nested-dictionary-python

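    Example:
        A minimal sketch with illustrative data; the returned list is the
        chain of keys leading to the first match:

        >>> d = {"proj": {"trajectory": {"file": "traj.xtc"}}}
        >>> find_mime_type(d, "traj.xtc")
        ['proj', 'trajectory', 'file']
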

115 """ 

    reverse_linked_q = list()
    reverse_linked_q.append((list(), d))
    while reverse_linked_q:
        this_key_chain, this_v = reverse_linked_q.pop()
        # finish search if found the mime type
        if this_v == mime_type:
            return this_key_chain
        # not found. keep searching
        # queue dicts for checking / ignore anything that's not a dict
        try:
            items = this_v.items()
        except AttributeError:
            continue  # this was not a nested dict. ignore it
        for k, v in items:
            reverse_linked_q.append((this_key_chain + [k], v))
    # if we haven't returned by this point, we've exhausted all the contents
    raise KeyError



def sizeof_fmt(num, suffix="B"):
    """Returns a human-readable string for a file size given in bytes.

    Thanks to Fred Cirera and Sridhar Ratnakumar:
    https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
    https://web.archive.org/web/20111010015624/http://blogmag.net/blog/read/38/Print_human_readable_file_size

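    Example:
        >>> sizeof_fmt(4096)
        '4.0KiB'
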

141 """ 

    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "Yi", suffix)



##############################################################################
# Classes
##############################################################################



class Repository:
    """Main class to work with repositories of MD data and download the data.

    This class handles the download of files from a repository source. All
    data are obtained from a .yaml file (default at data/repository.yaml), which
    contains trajectory and topology files organized in a readable manner.
    With this class, the repository.yaml file can be queried using unix-like file
    patterns. Files can be downloaded on the fly (files that already exist won't
    be downloaded again). Besides single files, full projects can be downloaded
    and rebuilt.

    Attributes:
        current_path (str): Path of the .py file containing this class.
            If no working directory is given (None), all files will be
            downloaded to a directory named 'data' (will be created), which
            will be placed in the directory of this .py file.
        url (str): The url of the current repo source.
        maintainer (str): The maintainer of the current repo source.
        files_dict (dict): A dictionary summarizing the files in this repo.
            Dict keys are built from `'project_name' + 'filetype'`. So for a
            project called 'protein_sim', possible keys are 'protein_sim_trajectory',
            'protein_sim_topology' and 'protein_sim_log'. The values of these keys
            are all str and give the actual filenames of the files. If 'protein_sim'
            was conducted with GROMACS, these files would be 'traj_comp.xtc',
            'confout.gro' and 'md.log'.
        files (list): Just a list of str of all downloadable files.
        data (dict): The main organization of the repository. This is the complete
            .yaml file as it was read and returned by pyyaml.

    Examples:
        >>> import encodermap as em
        >>> repo = em.Repository()
        >>> print(repo.search('*PFFP_sing*'))  # doctest: +SKIP
        {'PFFP_single_trajectory': 'PFFP_single.xtc', 'PFFP_single_topology': 'PFFP_single.gro', 'PFFP_single_input': 'PFFP.mdp', 'PFFP_single_log': 'PFFP.log'}
        >>> print(repo.url)
        http://134.34.112.158


190 """ 


    def __init__(
        self,
        repo_source="data/repository.yaml",
        checksum_file="data/repository.md5",
        ignore_checksums=False,
        debug=True,
    ):
        """Initialize the repository.

        Args:
            repo_source (str): The source .yaml file to build the repository from.
                Defaults to 'data/repository.yaml'.
            checksum_file (str): A file containing the md5 hash of the repository
                file. This ensures no one tampers with the repository.yaml file
                and injects malicious code. Defaults to 'data/repository.md5'.
            ignore_checksums (bool): If you want to ignore the checksum check of
                the repo_source file, set this to True. Can be useful during
                development, when the repository.yaml file undergoes a lot of
                changes. Defaults to False.
            debug (bool, optional): Whether to print debug info. Defaults to True.

        """

        # this will point to this file, no matter where it is (venv, etc.)
        self.current_path = os.path.split(inspect.getfile(inspect.currentframe()))[0]
        self.debug = debug

        # with that the source files can be defined
        repo_source = os.path.join(self.current_path, repo_source)
        checksum_file = os.path.join(self.current_path, checksum_file)

        # check the hash sum of the repo.yml file
        if checksum_file is not None and not ignore_checksums:
            with open(checksum_file, "r") as fh:
                if hash_files(repo_source)["repository.yaml"]["md5"] != fh.read():
                    raise RuntimeError(
                        "Checksums do not match, check your catalogue files!"
                    )

        # read the repo.yml file
        with open(repo_source, "r") as f:
            self.data = yaml_load(f, Loader=Loader)

        # define variables based on that
        self.url = self.data["url"]
        self.maintainer = (
            self.data["maintainer"]["name"] + ", " + self.data["maintainer"]["email"]
        )
        self.projects = self.data["projects"]
        self._connection = None
        self.files_dict = {}
        for dataset in self.datasets:
            for filetype in self.data[dataset]:
                if filetype == "credit":
                    continue
                self.files_dict[f"{dataset}_{filetype}"] = self.data[dataset][filetype][
                    "file"
                ]
        self.files = list(self.files_dict.values())


    @property
    def catalogue(self):
        """dict: Returns the underlying catalogue data."""
        return self.data

    def print_catalogue(self):
        """Prints the catalogue nicely formatted."""
        print(self.__str__())

260 @property 

261 def projects(self): 

262 """dict: A dictionary containing project names and their associated files. 

263 Projects are a larger collection of individual sims, that belong together. 

264 The project names are the dictionary's keys, the files are given as lists 

265 in the dict's values. 

266 
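        Example:
            An illustrative membership test; 'Tetrapeptides_Single' is the
            project used in the `load_project` example:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> 'Tetrapeptides_Single' in repo.projects  # doctest: +SKIP
            True
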

267 """ 

        return self._projects

    @projects.setter
    def projects(self, projects):
        self._projects = {}
        for item in self.data["projects"]:
            if isinstance(self.data["projects"][item], list):
                self._projects[item] = self.data["projects"][item]
            elif isinstance(self.data["projects"][item], dict):
                _ = []
                for key, value in self.data["projects"][item].items():
                    if key == "type":
                        continue
                    _.extend(value)
                self._projects[item] = _
            else:
                # report the type of the value, not of the (always str) key
                raise ValueError(
                    f"Wrong type in projects: {item}: "
                    f"{type(self.data['projects'][item])}"
                )


    @property
    def datasets(self):
        """set: A set of datasets in this repository.

        A dataset can either be characterized by a set of trajectory-, topology-,
        log- and input-files, or a dataset can be a .tar.gz container, which
        contains all necessary files.

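        Example:
            An illustrative membership test; 'PFFP_single' is the dataset
            behind the `search` example in the class docstring:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> 'PFFP_single' in repo.datasets  # doctest: +SKIP
            True
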

292 """ 

        return set(self.data.keys()).difference(
            set(["name", "url", "maintainer", "projects"])
        )


    def search(self, pattern):
        """Searches the repo's files with unix-like filename patterns.

        Both the keys of `self.files_dict` (e.g. 'PFFP_single_trajectory') and
        the filenames (e.g. 'PFFP_single.xtc') are matched against `pattern`.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of such patterns.

        Returns:
            dict: The matching `'project_name' + '_' + 'filetype'` keys mapped
                to their filenames.

        """
        out = {}
        if isinstance(pattern, list):
            _ = [self.search(i) for i in pattern]
            return dict(chain.from_iterable(map(methodcaller("items"), _)))
        for key, item in self.files_dict.items():
            if fnmatch.fnmatch(key, pattern) or fnmatch.fnmatch(item, pattern):
                out[key] = item
        return out


    def load_project(
        self,
        project,
        working_directory=None,
        overwrite=False,
        max_attempts=3,
        makdedir=False,
        progress_bar=True,
    ):
        """This will return `TrajEnsemble` / `SingleTraj` objects that are correctly formatted.

        This method allows one to directly rebuild projects from the repo source,
        using encodermap's own `SingleTraj` and `TrajEnsemble` classes.

        Args:
            project (str): The name of the project to be loaded. See
                `Repository.projects.keys()` for a list of projects.
            working_directory (Union[str, None], optional): Can be a string to a directory to save the
                files at. Can also be None. In that case `self.current_path` + `'/data'` will be used
                to save the files at, which is retrieved by `inspect.getfile(inspect.currentframe())`. If
                the files are already there and overwrite is False, the file paths are simply returned.
                Defaults to None.
            overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
            max_attempts (int, optional): Number of download attempts. Defaults to 3.
            makdedir (bool, optional): Whether to create `working_directory`, if it does not already
                exist. Defaults to False.
            progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar.
                Defaults to True.

        Returns:
            Union[encodermap.SingleTraj, encodermap.TrajEnsemble]: The project already loaded into encodermap's
                `SingleTraj` or `TrajEnsemble` classes.

        Examples:
            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> trajs = repo.load_project('Tetrapeptides_Single')
            >>> print(trajs)
            encodermap.TrajEnsemble object. Current backend is no_load. Containing 2 trajs. Common str is ['PFFP', 'FPPF']. Not containing any CVs.
            >>> print(trajs.n_trajs)
            2

        """

        if isinstance(self.data["projects"][project], dict):
            common_strings = list(
                filter(
                    lambda x: False if x == "type" else True,
                    list(self.data["projects"][project].keys()),
                )
            )
            traj_files = [
                file
                for cs in common_strings
                for file in self.data["projects"][project][cs][:-1]
            ]
            top_files = [
                self.data["projects"][project][cs][-1] for cs in common_strings
            ]
            if self.data["projects"][project]["type"] == "files":
                files, directory = self.fetch(
                    traj_files + top_files,
                    working_directory=working_directory,
                    overwrite=overwrite,
                    max_attempts=max_attempts,
                    makdedir=makdedir,
                    progress_bar=progress_bar,
                )
            elif self.data["projects"][project]["type"] == "container":
                pattern = project + ".tar.gz"
                files, directory = self.fetch(
                    pattern,
                    working_directory=working_directory,
                    overwrite=overwrite,
                    max_attempts=max_attempts,
                    makdedir=makdedir,
                    progress_bar=progress_bar,
                )
            else:
                raise Exception(
                    f"Unknown type of project: {self.data['projects'][project]['type']}. "
                    "`type` needs to be either 'files' or 'container'."
                )
            traj_files = [os.path.join(directory, i) for i in traj_files]
            top_files = [os.path.join(directory, i) for i in top_files]
            return TrajEnsemble(traj_files, top_files, common_str=common_strings)
        else:
            files, directory = self.fetch(
                self.projects[project],
                working_directory=working_directory,
                overwrite=overwrite,
                max_attempts=max_attempts,
                makdedir=makdedir,
                progress_bar=progress_bar,
            )
            return SingleTraj(files[0], files[1])


    def lookup(self, file):
        """Piece of code to allow some compatibility with mdshare.

        The complete `self.data` dictionary will be traversed to find
        `file` and its location in the `self.data` dictionary. This will be
        used to get the file's size and its md5 hash. The returned tuple also
        tells whether the file is a .tar.gz container or not. In the case of a
        container, the container needs to be extracted using tarfile.

        Args:
            file (str): The file to search for.

        Returns:
            tuple: A tuple containing the following:
                str: A string that is either 'containers' or 'index' (for normal files).
                dict: A dict of the form dict(file=filename, hash=filehash, size=filesize).

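        Example:
            An illustrative call; the hash and size values come from the
            catalogue file and are elided here:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> repo.lookup('PFFP_single.xtc')  # doctest: +SKIP
            ('index', {'file': 'PFFP_single.xtc', 'hash': ..., 'size': ...})
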

418 """ 

        simulation, filetype, _ = find_mime_type(self.data, file)
        out = dict(
            file=self.data[simulation][filetype]["file"],
            hash=self.data[simulation][filetype]["md5"],
            size=self.data[simulation][filetype]["size"],
        )
        if filetype == "container":
            return "containers", out
        else:
            return "index", out

    def _get_connection(self):
        """Also for compatibility with mdshare."""
        if self._connection is None:
            self._connection = requests.session()
        return self._connection


    @staticmethod
    def _split_proj_filetype(proj_filetype):
        """Splits a key of the `self.files_dict` dictionary into project and filetype.

        if proj_filetype.count("_") == 1:
            # return a tuple in both branches for a consistent return type
            return tuple(proj_filetype.split("_"))
        else:
            substrings = proj_filetype.split("_")
            return "_".join(substrings[:-1]), substrings[-1]


    def get_sizes(self, pattern):
        """Returns a list of file sizes for a given pattern.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of files (['traj_1.xtc', 'traj_2.xtc']).

        Returns:
            list: A list of file sizes in bytes.

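        Example:
            An illustrative query, reusing the pattern from the class
            docstring:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> sizes = repo.get_sizes('*PFFP_sing*')  # doctest: +SKIP
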

455 """ 

        sizes = []
        for proj_filetype, file in self.search(pattern).items():
            project, filetype = Repository._split_proj_filetype(proj_filetype)
            size = self.data[project][filetype]["size"]
            sizes.append(size)
        return sizes


    def stack(self, pattern):
        """Creates a stack to prepare for downloads.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of files (['traj_1.xtc', 'traj_2.xtc']).

        Returns:
            list: A list of dicts. Each dict contains filename, size and a boolean
                value telling whether the downloaded file needs to be extracted
                after downloading.

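        Example:
            An illustrative stack for a single file (the size is elided, as
            it comes from the catalogue):

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> repo.stack('PFFP_single.xtc')  # doctest: +SKIP
            [{'file': 'PFFP_single.xtc', 'size': ..., 'unpack': False}]
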

475 """ 

        stack = []
        sizes = self.get_sizes(pattern)
        for (proj_filetype, file), size in zip(self.search(pattern).items(), sizes):
            project, filetype = Repository._split_proj_filetype(proj_filetype)
            unpack = filetype == "container"
            stack.append(dict(file=file, size=size, unpack=unpack))
        return stack


    def fetch(
        self,
        remote_filenames,
        working_directory=None,
        overwrite=False,
        max_attempts=3,
        makdedir=False,
        progress_bar=True,
    ):
        """Fetches one or more files from `self.files`.

        Also displays a progress bar with the names of the files. Uses requests.

        Args:
            remote_filenames (Union[str, list]): The name(s) of the remote file(s).
                Check `self.files` for more info.
            working_directory (Union[str, None], optional): Can be a string to a directory to save the
                files at. Can also be None. In that case `self.current_path` + `'/data'` will be used
                to save the files at, which is retrieved by `inspect.getfile(inspect.currentframe())`. If
                the files are already there and overwrite is False, the file paths are simply returned.
                Defaults to None.
            overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
            max_attempts (int, optional): Number of download attempts. Defaults to 3.
            makdedir (bool, optional): Whether to create `working_directory`, if it does not already
                exist. Defaults to False.
            progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar.
                Defaults to True.

        Returns:
            tuple: A tuple containing the following:
                list: A list of files that have just been downloaded.
                str: A string leading to the directory the files have been downloaded to.

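        Example:
            An illustrative download of a single file from the class
            docstring's `search` example:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> files, directory = repo.fetch('PFFP_single.xtc')  # doctest: +SKIP
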

515 """ 

        # import progress-reporter
        try:
            import progress_reporter

            have_progress_reporter = True
        except ImportError:
            if self.debug:
                print(
                    "Downloading files without progress bar. Run `pip install progress-reporter` to use this feature."
                )
            have_progress_reporter = False

        # find files to download
        stack = self.stack(remote_filenames)

        # define the filenames
        if working_directory is None:
            working_directory = os.path.join(self.current_path, "data")
        if isinstance(working_directory, str):
            local_filenames = [
                os.path.join(working_directory, s["file"]) for s in stack
            ]
            if not os.path.isdir(working_directory):
                if makdedir:
                    os.makedirs(working_directory)
                else:
                    raise FileNotFoundError(
                        errno.ENOENT, os.strerror(errno.ENOENT), working_directory
                    )
        else:
            raise ValueError(
                f"Type of argument `working_directory` needs to be either `None` or `str`, you provided {type(working_directory)}."
            )

        # split .tar.gz containers and regular files
        # and check what already exists
        local_containers = [f for f in local_filenames if f.endswith(".tar.gz")]
        local_containers = list(map(lambda x: x.split(".")[0], local_containers))
        local_files = [f for f in local_filenames if not f.endswith(".tar.gz")]
        result = []
        if local_files:
            if all([os.path.isfile(lf) for lf in local_files]) and not overwrite:
                if self.debug:
                    print(
                        f"Files '{local_files}' already exist. Set `overwrite` to `True` to download them again."
                    )
                result.extend(local_files)
            elif any([os.path.isfile(lf) for lf in local_files]) and not overwrite:
                existing_files = glob.glob(os.path.join(working_directory, "*"))
                existing_files = [os.path.split(i)[-1] for i in existing_files]
                missing_files = list(
                    set([i["file"] for i in stack]).difference(set(existing_files))
                )
                result.extend(
                    [
                        os.path.join(working_directory, i)
                        for i in existing_files
                        if fnmatch.fnmatch(i, remote_filenames)
                    ]
                )
                if self.debug:
                    print(
                        f"{len(stack) - len(missing_files)} files already exist. I will only download '{missing_files}'. Set `overwrite` to `True` to download all files again."
                    )
                stack = list(
                    filter(
                        lambda x: True if x["file"] in missing_files else False, stack
                    )
                )
        if local_containers:
            if all([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
                if self.debug:
                    print(
                        f"Directories '{local_containers}' already exist. Set `overwrite` to `True` to download them again."
                    )
                result.extend(local_containers)
            elif any([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
                existing_directories = glob.glob(os.path.join(working_directory, "*/"))
                existing_directories = [
                    os.path.split(os.path.split(i)[0])[-1] for i in existing_directories
                ]
                missing_directories = list(
                    set([i["file"].split(".")[0] for i in stack]).difference(
                        set(existing_directories)
                    )
                )
                result.extend(
                    [
                        os.path.join(working_directory, i)
                        for i in existing_directories
                        if fnmatch.fnmatch(i + ".tar.gz", remote_filenames)
                    ]
                )
                if self.debug:
                    print(
                        f"{len(stack) - len(missing_directories)} directories already exist. I will only download '{missing_directories}'. Set `overwrite` to `True` to download all files again."
                    )
                stack = list(
                    filter(
                        lambda x: True
                        if x["file"].split(".")[0] in missing_directories
                        else False,
                        stack,
                    )
                )
        if len(result) == len(local_filenames):
            return result, working_directory


        # instantiate ProgressBars
        if have_progress_reporter and progress_bar:
            callbacks = []
            pg = progress_reporter.ProgressReporter_()
            total = sum(item["size"] for item in stack)

            def update(n, blk, stage):
                downloaded = n * blk
                inc = max(0, downloaded - pg._prog_rep_progressbars[stage].n)
                pg.update(inc, stage=stage)
                # total progress
                try:
                    pg.update(inc, stage=-1)
                except RuntimeError:
                    pass

            from functools import partial

            tqdm_args = dict(unit="B", file=sys.stdout, unit_scale=True, position=0)

            n_progress_bars = 0
            for stage, item in enumerate(stack):
                if working_directory is not None:
                    path = os.path.join(working_directory, item["file"])
                    if os.path.exists(path) and not overwrite:
                        callbacks.append(None)
                    else:
                        pg.register(
                            item["size"],
                            description=f'downloading {item["file"]}',
                            tqdm_args=tqdm_args,
                            stage=stage,
                        )
                        callbacks.append(partial(update, stage=stage))
                        n_progress_bars += 1
            if n_progress_bars > 1:
                pg.register(total, description="total", tqdm_args=tqdm_args, stage=-1)
        else:
            from unittest.mock import MagicMock

            pg = MagicMock()
            callbacks = [None] * len(stack)


        # download and unpack
        result = []
        with pg.context():
            for item, progress in zip(stack, callbacks):
                file = download_wrapper(
                    self,
                    item["file"],
                    working_directory=working_directory,
                    max_attempts=max_attempts,
                    force=overwrite,
                    callback=progress,
                )
                if item["unpack"]:
                    # renamed from `inspect` to avoid shadowing the module-level import
                    def members_at_root(tar_members):
                        # yield only archive members that sit at the archive
                        # root, together with their bare filenames
                        for member in tar_members:
                            path, filename = os.path.split(member.name)
                            if path == "":
                                yield member, filename

                    with tarfile.open(file, "r:gz") as fh:
                        members = []
                        for i, (member, filename) in enumerate(members_at_root(fh)):
                            members.append(member)
                            result.append(os.path.join(working_directory, filename))
                        fh.extractall(path=working_directory, members=members)
                    os.remove(file)
                    result.append(file.split(".")[0])


                else:
                    result.append(file)

        return result, working_directory


    def __str__(self):
        string = f"Repository: {self.url}\n"
        string += f"Maintainer: {self.maintainer}\n"
        for dataset in self.datasets:
            string += f"  Dataset: {dataset}\n"
            for filetype in self.data[dataset]:
                if filetype == "credit":
                    try:
                        string += f"    Author: {self.data[dataset][filetype]['author']}, {self.data[dataset][filetype]['email']}\n"
                    except KeyError:
                        string += (
                            f"    Author: {self.data[dataset][filetype]['author']}\n"
                        )
                    continue
                try:
                    substr = f"    {filetype.capitalize()} File: {self.data[dataset][filetype]['file']}"
                    string += f"{substr:<50}{sizeof_fmt(self.data[dataset][filetype]['size'])}\n"
                except KeyError:
                    print("Could not build summary string")
                    print(filetype)
                    print(type(filetype))
                    raise
        return string