
# -*- coding: utf-8 -*-
# encodermap/trajinfo/repository.py
################################################################################
# Encodermap: A python library for dimensionality reduction.
#
# Copyright 2019-2022 University of Konstanz and the Authors
#
# Authors:
# Kevin Sawade
#
# Encodermap is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as
# published by the Free Software Foundation, either version 2.1
# of the License, or (at your option) any later version.
# This package is distributed in the hope that it will be useful to other
# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
# See the GNU Lesser General Public License for more details.
#
# See <http://www.gnu.org/licenses/>.
################################################################################

22"""Python endpoint to download files from a webserver on the fly. 

23 

24Idea from Christoph Wehmeyer: https://github.com/markovmodel/mdshare 

25I liked his idea of the possibility to distribute MD data via a simple python 

26backend, but wanted to make it smaller. A simple `fetch()` should suffice. Also 

27I liked the yaml syntax and wanted to use it. 

28 

29References: 

30 @article{wehmeyer2018introduction, 

31 title={Introduction to Markov state modeling with the PyEMMA software [Article v1. 0]}, 

32 author={Wehmeyer, Christoph and Scherer, Martin K and Hempel, Tim and Husic, Brooke E and Olsson, Simon and No{\'e}, Frank}, 

33 journal={Living Journal of Computational Molecular Science}, 

34 volume={1}, 

35 number={1}, 

36 pages={5965}, 

37 year={2018} 

38 } 

39 
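Examples:
    A minimal, hedged sketch of the intended usage (the filename is taken
    from the `Repository` class docstring below):

    >>> import encodermap as em
    >>> repo = em.Repository()
    >>> files, directory = repo.fetch('PFFP_single.xtc')  # doctest: +SKIP
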

40""" 



##############################################################################
# Imports
##############################################################################


import errno
import fnmatch
import glob
import inspect
import os
import re
import sys
from itertools import chain
from operator import methodcaller

import requests

from .._optional_imports import _optional_import
from .hash_files import hash_files
from .info_all import TrajEnsemble
from .info_single import SingleTraj


##############################################################################
# Optional Imports
##############################################################################


yaml_load = _optional_import("yaml", "load")
yaml_dump = _optional_import("yaml", "dump")
Loader = _optional_import("yaml", "CLoader")
Dumper = _optional_import("yaml", "CDumper")
download_wrapper = _optional_import("mdshare", "utils.download_wrapper")
tarfile = _optional_import("tarfile")


##############################################################################
# Globals
##############################################################################


__all__ = ["Repository"]


##############################################################################
# Functions
##############################################################################


def gen_dict_extract(key, var):
    """Finds the values of a key anywhere in a nested dict.

    Adapted from hexerei software's solution for nested dicts.

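    Example:
        A minimal sketch with illustrative data (the keys are hypothetical,
        not taken from a repository catalogue):

        >>> d = {"traj": {"md5": "abc"}, "runs": [{"md5": "def"}]}
        >>> list(gen_dict_extract("md5", d))
        ['abc', 'def']
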

96 """ 

    # `iteritems` was removed in Python 3; use `items` so the walk actually runs
    if hasattr(var, "items"):
        for k, v in var.items():
            if k == key:
                yield v
            if isinstance(v, dict):
                for result in gen_dict_extract(key, v):
                    yield result
            elif isinstance(v, list):
                for d in v:
                    for result in gen_dict_extract(key, d):
                        yield result



def find_mime_type(d, mime_type):
    """Breadth-first search for a value in a nested dict, returning the key chain.

    Thanks to KobeJohn:
    https://stackoverflow.com/questions/22162321/search-for-a-value-in-a-nested-dictionary-python

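    Example:
        A minimal sketch with illustrative data; the returned list is the
        chain of keys leading to the first match:

        >>> d = {"proj": {"trajectory": {"file": "traj.xtc"}}}
        >>> find_mime_type(d, "traj.xtc")
        ['proj', 'trajectory', 'file']
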

115 """ 

    reverse_linked_q = list()
    reverse_linked_q.append((list(), d))
    while reverse_linked_q:
        this_key_chain, this_v = reverse_linked_q.pop()
        # finish search if found the mime type
        if this_v == mime_type:
            return this_key_chain
        # not found. keep searching
        # queue dicts for checking / ignore anything that's not a dict
        try:
            items = this_v.items()
        except AttributeError:
            continue  # this was not a nested dict. ignore it
        for k, v in items:
            reverse_linked_q.append((this_key_chain + [k], v))
    # if we haven't returned by this point, we've exhausted all the contents
    raise KeyError



def sizeof_fmt(num, suffix="B"):
    """Returns a human-readable string for a file size given in bytes.

    Thanks to Fred Cirera and Sridhar Ratnakumar:
    https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
    https://web.archive.org/web/20111010015624/http://blogmag.net/blog/read/38/Print_human_readable_file_size

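    Example:
        >>> sizeof_fmt(4096)
        '4.0KiB'
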

141 """ 

    for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, "Yi", suffix)



##############################################################################
# Classes
##############################################################################



class Repository:
    """Main class to work with repositories of MD data and download the data.

    This class handles the download of files from a repository source. All
    data are obtained from a .yaml file (default at data/repository.yaml), which
    contains trajectory and topology files organized in a readable manner.
    With this class, the repository.yaml file can be queried using unix-like file
    patterns. Files can be downloaded on the fly (files that already exist won't
    be downloaded again). Besides single files, full projects can be downloaded
    and rebuilt.

    Attributes:
        current_path (str): Path of the .py file containing this class.
            If no working directory is given (None), all files will be
            downloaded to a directory named 'data' (will be created), which
            will be placed in the directory of this .py file.
        url (str): The url of the current repo source.
        maintainer (str): The maintainer of the current repo source.
        files_dict (dict): A dictionary summarizing the files in this repo.
            Dict keys are built from `'project_name' + 'filetype'`. So for a
            project called 'protein_sim', possible keys are 'protein_sim_trajectory',
            'protein_sim_topology' and 'protein_sim_log'. The values of these keys
            are all str and give the actual filenames of the files. If 'protein_sim'
            was conducted with GROMACS, these files would be 'traj_comp.xtc',
            'confout.gro' and 'md.log'.
        files (list): Just a list of str of all downloadable files.
        data (dict): The main organization of the repository. This is the complete
            .yaml file as it was read and returned by pyyaml.

    Examples:
        >>> import encodermap as em
        >>> repo = em.Repository()
        >>> print(repo.search('*PFFP_sing*'))  # doctest: +SKIP
        {'PFFP_single_trajectory': 'PFFP_single.xtc', 'PFFP_single_topology': 'PFFP_single.gro', 'PFFP_single_input': 'PFFP.mdp', 'PFFP_single_log': 'PFFP.log'}
        >>> print(repo.url)
        http://134.34.112.158


190 """ 


    def __init__(
        self,
        repo_source="data/repository.yaml",
        checksum_file="data/repository.md5",
        ignore_checksums=False,
        debug=True,
    ):
        """Initialize the repository.

        Args:
            repo_source (str): The source .yaml file to build the repository from.
                Defaults to 'data/repository.yaml'.
            checksum_file (str): A file containing the md5 hash of the repository
                file. This ensures no one tampers with the repository.yaml file
                and injects malicious code. Defaults to 'data/repository.md5'.
            ignore_checksums (bool): If you want to ignore the checksum check of
                the repo_source file, set this to True. Can be useful during
                development, when the repository.yaml file undergoes a lot of
                changes. Defaults to False.
            debug (bool, optional): Whether to print debug info. Defaults to True.

        """

        # this will point to this file, no matter where it is (venv, etc.)
        self.current_path = os.path.split(inspect.getfile(inspect.currentframe()))[0]
        self.debug = debug

        # with that the source files can be defined
        repo_source = os.path.join(self.current_path, repo_source)
        checksum_file = os.path.join(self.current_path, checksum_file)

        # check the hash sum of the repo.yml file
        if checksum_file is not None and not ignore_checksums:
            with open(checksum_file, "r") as fh:
                if hash_files(repo_source)["repository.yaml"]["md5"] != fh.read():
                    raise RuntimeError(
                        "Checksums do not match, check your catalogue files!"
                    )

        # read the repo.yml file
        with open(repo_source, "r") as f:
            self.data = yaml_load(f, Loader=Loader)

        # define variables based on that
        self.url = self.data["url"]
        self.maintainer = (
            self.data["maintainer"]["name"] + ", " + self.data["maintainer"]["email"]
        )
        self.projects = self.data["projects"]
        self._connection = None
        self.files_dict = {}
        for dataset in self.datasets:
            for filetype in self.data[dataset]:
                if filetype == "credit":
                    continue
                self.files_dict[f"{dataset}_{filetype}"] = self.data[dataset][filetype][
                    "file"
                ]
        self.files = list(self.files_dict.values())


    @property
    def catalogue(self):
        """dict: Returns the underlying catalogue data."""
        return self.data

    def print_catalogue(self):
        """Prints the catalogue nicely formatted."""
        print(self.__str__())

260 @property 

261 def projects(self): 

262 """dict: A dictionary containing project names and their associated files. 

263 Projects are a larger collection of individual sims, that belong together. 

264 The project names are the dictionary's keys, the files are given as lists 

265 in the dict's values. 

266 
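        Example:
            An illustrative membership test; 'Tetrapeptides_Single' is the
            project used in the `load_project` example:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> 'Tetrapeptides_Single' in repo.projects  # doctest: +SKIP
            True
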

267 """ 

        return self._projects

    @projects.setter
    def projects(self, projects):
        self._projects = {}
        for item in self.data["projects"]:
            if isinstance(self.data["projects"][item], list):
                self._projects[item] = self.data["projects"][item]
            elif isinstance(self.data["projects"][item], dict):
                _ = []
                for key, value in self.data["projects"][item].items():
                    if key == "type":
                        continue
                    _.extend(value)
                self._projects[item] = _
            else:
                # report the type of the value, not of the (always str) key
                raise ValueError(
                    f"Wrong type in projects: {item}: "
                    f"{type(self.data['projects'][item])}"
                )


    @property
    def datasets(self):
        """set: A set of datasets in this repository.

        A dataset can either be characterized by a set of trajectory-, topology-,
        log- and input-files, or a dataset can be a .tar.gz container, which
        contains all necessary files.

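        Example:
            An illustrative membership test; 'PFFP_single' is the dataset
            behind the `search` example in the class docstring:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> 'PFFP_single' in repo.datasets  # doctest: +SKIP
            True
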

292 """ 

        return set(self.data.keys()).difference(
            set(["name", "url", "maintainer", "projects"])
        )


    def search(self, pattern):
        """Searches the repo's files with unix-like filename patterns.

        Both the keys of `self.files_dict` (e.g. 'PFFP_single_trajectory') and
        the filenames (e.g. 'PFFP_single.xtc') are matched against `pattern`.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of such patterns.

        Returns:
            dict: The matching `'project_name' + '_' + 'filetype'` keys mapped
                to their filenames.

        """
        out = {}
        if isinstance(pattern, list):
            _ = [self.search(i) for i in pattern]
            return dict(chain.from_iterable(map(methodcaller("items"), _)))
        for key, item in self.files_dict.items():
            if fnmatch.fnmatch(key, pattern) or fnmatch.fnmatch(item, pattern):
                out[key] = item
        return out


    def load_project(
        self,
        project,
        working_directory=None,
        overwrite=False,
        max_attempts=3,
        makdedir=False,
        progress_bar=True,
    ):
        """This will return `TrajEnsemble` / `SingleTraj` objects that are correctly formatted.

        This method allows one to directly rebuild projects from the repo source,
        using encodermap's own `SingleTraj` and `TrajEnsemble` classes.

        Args:
            project (str): The name of the project to be loaded. See
                `Repository.projects.keys()` for a list of projects.
            working_directory (Union[str, None], optional): Can be a string to a directory to save the
                files at. Can also be None. In that case `self.current_path` + `'/data'` will be used
                to save the files at, which is retrieved by `inspect.getfile(inspect.currentframe())`. If
                the files are already there and overwrite is False, the file paths are simply returned.
                Defaults to None.
            overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
            max_attempts (int, optional): Number of download attempts. Defaults to 3.
            makdedir (bool, optional): Whether to create `working_directory`, if it does not already
                exist. Defaults to False.
            progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar.
                Defaults to True.

        Returns:
            Union[encodermap.SingleTraj, encodermap.TrajEnsemble]: The project already loaded into encodermap's
                `SingleTraj` or `TrajEnsemble` classes.

        Examples:
            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> trajs = repo.load_project('Tetrapeptides_Single')
            >>> print(trajs)
            encodermap.TrajEnsemble object. Current backend is no_load. Containing 2 trajs. Common str is ['PFFP', 'FPPF']. Not containing any CVs.
            >>> print(trajs.n_trajs)
            2

        """

        if isinstance(self.data["projects"][project], dict):
            common_strings = list(
                filter(
                    lambda x: False if x == "type" else True,
                    list(self.data["projects"][project].keys()),
                )
            )
            traj_files = [
                file
                for cs in common_strings
                for file in self.data["projects"][project][cs][:-1]
            ]
            top_files = [
                self.data["projects"][project][cs][-1] for cs in common_strings
            ]
            if self.data["projects"][project]["type"] == "files":
                files, directory = self.fetch(
                    traj_files + top_files,
                    working_directory=working_directory,
                    overwrite=overwrite,
                    max_attempts=max_attempts,
                    makdedir=makdedir,
                    progress_bar=progress_bar,
                )
            elif self.data["projects"][project]["type"] == "container":
                pattern = project + ".tar.gz"
                files, directory = self.fetch(
                    pattern,
                    working_directory=working_directory,
                    overwrite=overwrite,
                    max_attempts=max_attempts,
                    makdedir=makdedir,
                    progress_bar=progress_bar,
                )
            else:
                raise Exception(
                    f"Unknown type of project: {self.data['projects'][project]['type']}. "
                    "`type` needs to be either 'files' or 'container'."
                )
            traj_files = [os.path.join(directory, i) for i in traj_files]
            top_files = [os.path.join(directory, i) for i in top_files]
            return TrajEnsemble(traj_files, top_files, common_str=common_strings)
        else:
            files, directory = self.fetch(
                self.projects[project],
                working_directory=working_directory,
                overwrite=overwrite,
                max_attempts=max_attempts,
                makdedir=makdedir,
                progress_bar=progress_bar,
            )
            return SingleTraj(files[0], files[1])


    def lookup(self, file):
        """Piece of code to allow some compatibility with mdshare.

        The complete `self.data` dictionary will be traversed to find
        `file` and its location in the `self.data` dictionary. This will be
        used to get the file's size and its md5 hash. The returned tuple also
        tells whether the file is a .tar.gz container or not. In the case of a
        container, the container needs to be extracted using tarfile.

        Args:
            file (str): The file to search for.

        Returns:
            tuple: A tuple containing the following:
                str: A string that is either 'containers' or 'index' (for normal files).
                dict: A dict of the form dict(file=filename, hash=filehash, size=filesize).

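        Example:
            An illustrative call; the hash and size values come from the
            catalogue file and are elided here:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> repo.lookup('PFFP_single.xtc')  # doctest: +SKIP
            ('index', {'file': 'PFFP_single.xtc', 'hash': ..., 'size': ...})
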

418 """ 

        simulation, filetype, _ = find_mime_type(self.data, file)
        out = dict(
            file=self.data[simulation][filetype]["file"],
            hash=self.data[simulation][filetype]["md5"],
            size=self.data[simulation][filetype]["size"],
        )
        if filetype == "container":
            return "containers", out
        else:
            return "index", out

    def _get_connection(self):
        """Also for compatibility with mdshare."""
        if self._connection is None:
            self._connection = requests.session()
        return self._connection


    @staticmethod
    def _split_proj_filetype(proj_filetype):
        """Splits a key of the `self.files_dict` dictionary into project and filetype.

        if proj_filetype.count("_") == 1:
            # return a tuple in both branches for a consistent return type
            return tuple(proj_filetype.split("_"))
        else:
            substrings = proj_filetype.split("_")
            return "_".join(substrings[:-1]), substrings[-1]


    def get_sizes(self, pattern):
        """Returns a list of file sizes for a given pattern.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of files (['traj_1.xtc', 'traj_2.xtc']).

        Returns:
            list: A list of file sizes in bytes.

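        Example:
            An illustrative query, reusing the pattern from the class
            docstring:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> sizes = repo.get_sizes('*PFFP_sing*')  # doctest: +SKIP
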

455 """ 

        sizes = []
        for proj_filetype, file in self.search(pattern).items():
            project, filetype = Repository._split_proj_filetype(proj_filetype)
            size = self.data[project][filetype]["size"]
            sizes.append(size)
        return sizes


    def stack(self, pattern):
        """Creates a stack to prepare for downloads.

        Args:
            pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
                list of files (['traj_1.xtc', 'traj_2.xtc']).

        Returns:
            list: A list of dicts. Each dict contains filename, size and a boolean
                value telling whether the downloaded file needs to be extracted
                after downloading.

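        Example:
            An illustrative stack for a single file (the size is elided, as
            it comes from the catalogue):

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> repo.stack('PFFP_single.xtc')  # doctest: +SKIP
            [{'file': 'PFFP_single.xtc', 'size': ..., 'unpack': False}]
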

475 """ 

        stack = []
        sizes = self.get_sizes(pattern)
        for (proj_filetype, file), size in zip(self.search(pattern).items(), sizes):
            project, filetype = Repository._split_proj_filetype(proj_filetype)
            unpack = filetype == "container"
            stack.append(dict(file=file, size=size, unpack=unpack))
        return stack


    def fetch(
        self,
        remote_filenames,
        working_directory=None,
        overwrite=False,
        max_attempts=3,
        makdedir=False,
        progress_bar=True,
    ):
        """Fetches one or more files from `self.files`.

        Also displays a progress bar with the names of the files. Uses requests.

        Args:
            remote_filenames (Union[str, list]): The name(s) of the remote file(s).
                Check `self.files` for more info.
            working_directory (Union[str, None], optional): Can be a string to a directory to save the
                files at. Can also be None. In that case `self.current_path` + `'/data'` will be used
                to save the files at, which is retrieved by `inspect.getfile(inspect.currentframe())`. If
                the files are already there and overwrite is False, the file paths are simply returned.
                Defaults to None.
            overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
            max_attempts (int, optional): Number of download attempts. Defaults to 3.
            makdedir (bool, optional): Whether to create `working_directory`, if it does not already
                exist. Defaults to False.
            progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar.
                Defaults to True.

        Returns:
            tuple: A tuple containing the following:
                list: A list of files that have just been downloaded.
                str: A string leading to the directory the files have been downloaded to.

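        Example:
            An illustrative download of a single file from the class
            docstring's `search` example:

            >>> import encodermap as em
            >>> repo = em.Repository()
            >>> files, directory = repo.fetch('PFFP_single.xtc')  # doctest: +SKIP
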

515 """ 

        # import progress-reporter
        try:
            import progress_reporter

            have_progress_reporter = True
        except ImportError:
            if self.debug:
                print(
                    "Downloading files without progress bar. Run `pip install progress-reporter` to use this feature."
                )
            have_progress_reporter = False

        # find files to download
        stack = self.stack(remote_filenames)

        # define the filenames
        if working_directory is None:
            working_directory = os.path.join(self.current_path, "data")
        if isinstance(working_directory, str):
            local_filenames = [
                os.path.join(working_directory, s["file"]) for s in stack
            ]
            if not os.path.isdir(working_directory):
                if makdedir:
                    os.makedirs(working_directory)
                else:
                    raise FileNotFoundError(
                        errno.ENOENT, os.strerror(errno.ENOENT), working_directory
                    )
        else:
            raise ValueError(
                f"Type of argument `working_directory` needs to be either `None` or `str`, you provided {type(working_directory)}."
            )

        # split .tar.gz containers and regular files
        # and check what already exists
        local_containers = [f for f in local_filenames if f.endswith(".tar.gz")]
        local_containers = list(map(lambda x: x.split(".")[0], local_containers))
        local_files = [f for f in local_filenames if not f.endswith(".tar.gz")]
        result = []
        if local_files:
            if all([os.path.isfile(lf) for lf in local_files]) and not overwrite:
                if self.debug:
                    print(
                        f"Files '{local_files}' already exist. Set `overwrite` to `True` to download them again."
                    )
                result.extend(local_files)
            elif any([os.path.isfile(lf) for lf in local_files]) and not overwrite:
                existing_files = glob.glob(os.path.join(working_directory, "*"))
                existing_files = [os.path.split(i)[-1] for i in existing_files]
                missing_files = list(
                    set([i["file"] for i in stack]).difference(set(existing_files))
                )
                result.extend(
                    [
                        os.path.join(working_directory, i)
                        for i in existing_files
                        if fnmatch.fnmatch(i, remote_filenames)
                    ]
                )
                if self.debug:
                    print(
                        f"{len(stack) - len(missing_files)} files already exist. I will only download '{missing_files}'. Set `overwrite` to `True` to download all files again."
                    )
                stack = list(
                    filter(
                        lambda x: True if x["file"] in missing_files else False, stack
                    )
                )
        if local_containers:
            if all([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
                if self.debug:
                    print(
                        f"Directories '{local_containers}' already exist. Set `overwrite` to `True` to download them again."
                    )
                result.extend(local_containers)
            elif any([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
                existing_directories = glob.glob(os.path.join(working_directory, "*/"))
                existing_directories = [
                    os.path.split(os.path.split(i)[0])[-1] for i in existing_directories
                ]
                missing_directories = list(
                    set([i["file"].split(".")[0] for i in stack]).difference(
                        set(existing_directories)
                    )
                )
                result.extend(
                    [
                        os.path.join(working_directory, i)
                        for i in existing_directories
                        if fnmatch.fnmatch(i + ".tar.gz", remote_filenames)
                    ]
                )
                if self.debug:
                    print(
                        f"{len(stack) - len(missing_directories)} directories already exist. I will only download '{missing_directories}'. Set `overwrite` to `True` to download all files again."
                    )
                stack = list(
                    filter(
                        lambda x: True
                        if x["file"].split(".")[0] in missing_directories
                        else False,
                        stack,
                    )
                )
        if len(result) == len(local_filenames):
            return result, working_directory


        # instantiate ProgressBars
        if have_progress_reporter and progress_bar:
            callbacks = []
            pg = progress_reporter.ProgressReporter_()
            total = sum(item["size"] for item in stack)

            def update(n, blk, stage):
                downloaded = n * blk
                inc = max(0, downloaded - pg._prog_rep_progressbars[stage].n)
                pg.update(inc, stage=stage)
                # total progress
                try:
                    pg.update(inc, stage=-1)
                except RuntimeError:
                    pass

            from functools import partial

            tqdm_args = dict(unit="B", file=sys.stdout, unit_scale=True, position=0)

            n_progress_bars = 0
            for stage, item in enumerate(stack):
                if working_directory is not None:
                    path = os.path.join(working_directory, item["file"])
                    if os.path.exists(path) and not overwrite:
                        callbacks.append(None)
                    else:
                        pg.register(
                            item["size"],
                            description=f'downloading {item["file"]}',
                            tqdm_args=tqdm_args,
                            stage=stage,
                        )
                        callbacks.append(partial(update, stage=stage))
                        n_progress_bars += 1
            if n_progress_bars > 1:
                pg.register(total, description="total", tqdm_args=tqdm_args, stage=-1)
        else:
            from unittest.mock import MagicMock

            pg = MagicMock()
            callbacks = [None] * len(stack)


        # download and unpack
        result = []
        with pg.context():
            for item, progress in zip(stack, callbacks):
                file = download_wrapper(
                    self,
                    item["file"],
                    working_directory=working_directory,
                    max_attempts=max_attempts,
                    force=overwrite,
                    callback=progress,
                )
                if item["unpack"]:
                    # renamed from `inspect` to avoid shadowing the module-level import
                    def members_at_root(tar_members):
                        # yield only archive members that sit at the archive
                        # root, together with their bare filenames
                        for member in tar_members:
                            path, filename = os.path.split(member.name)
                            if path == "":
                                yield member, filename

                    with tarfile.open(file, "r:gz") as fh:
                        members = []
                        for i, (member, filename) in enumerate(members_at_root(fh)):
                            members.append(member)
                            result.append(os.path.join(working_directory, filename))
                        fh.extractall(path=working_directory, members=members)
                    os.remove(file)
                    result.append(file.split(".")[0])


                else:
                    result.append(file)

        return result, working_directory


    def __str__(self):
        string = f"Repository: {self.url}\n"
        string += f"Maintainer: {self.maintainer}\n"
        for dataset in self.datasets:
            string += f"  Dataset: {dataset}\n"
            for filetype in self.data[dataset]:
                if filetype == "credit":
                    try:
                        string += f"    Author: {self.data[dataset][filetype]['author']}, {self.data[dataset][filetype]['email']}\n"
                    except KeyError:
                        string += (
                            f"    Author: {self.data[dataset][filetype]['author']}\n"
                        )
                    continue
                try:
                    substr = f"    {filetype.capitalize()} File: {self.data[dataset][filetype]['file']}"
                    string += f"{substr:<50}{sizeof_fmt(self.data[dataset][filetype]['size'])}\n"
                except KeyError:
                    print("Could not build summary string")
                    print(filetype)
                    print(type(filetype))
                    raise
        return string