Coverage for encodermap/trajinfo/repository.py: 11%
245 statements
coverage.py v7.1.0, created at 2023-02-07 11:05 +0000
1# -*- coding: utf-8 -*-
2# encodermap/trajinfo/repository.py
3################################################################################
4# Encodermap: A python library for dimensionality reduction.
5#
6# Copyright 2019-2022 University of Konstanz and the Authors
7#
8# Authors:
9# Kevin Sawade
10#
11# Encodermap is free software: you can redistribute it and/or modify
12# it under the terms of the GNU Lesser General Public License as
13# published by the Free Software Foundation, either version 2.1
14# of the License, or (at your option) any later version.
15# This package is distributed in the hope that it will be useful to other
16# researchers. IT DOES NOT COME WITH ANY WARRANTY WHATSOEVER; without even the
17# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18# See the GNU Lesser General Public License for more details.
19#
20# See <http://www.gnu.org/licenses/>.
21################################################################################
22"""Python endpoint to download files from a webserver on the fly.
24Idea from Christoph Wehmeyer: https://github.com/markovmodel/mdshare
25I liked his idea of distributing MD data via a simple Python backend, but
26wanted to make it smaller. A simple `fetch()` should suffice. I also liked
27the yaml syntax and wanted to use it.
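Example:
    A minimal sketch of the intended use (the file pattern below is
    illustrative, not an actual repository entry):

    >>> import encodermap as em
    >>> repo = em.Repository()
    >>> files, directory = repo.fetch('*.xtc', makdedir=True)  # doctest: +SKIP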
29References:
30 @article{wehmeyer2018introduction,
31 title={Introduction to Markov state modeling with the PyEMMA software [Article v1.0]},
32 author={Wehmeyer, Christoph and Scherer, Martin K and Hempel, Tim and Husic, Brooke E and Olsson, Simon and No{\'e}, Frank},
33 journal={Living Journal of Computational Molecular Science},
34 volume={1},
35 number={1},
36 pages={5965},
37 year={2018}
38 }
40"""
43##############################################################################
44# Imports
45##############################################################################
48import errno
49import fnmatch
50import glob
51import inspect
52import os
53import re
54import sys
55from itertools import chain
56from operator import methodcaller
58import requests
60from .._optional_imports import _optional_import
61from .hash_files import hash_files
62from .info_all import TrajEnsemble
63from .info_single import SingleTraj
65##############################################################################
66# Optional Imports
67##############################################################################
70yaml_load = _optional_import("yaml", "load")
71yaml_dump = _optional_import("yaml", "dump")
72Loader = _optional_import("yaml", "CLoader")
73Dumper = _optional_import("yaml", "CDumper")
74download_wrapper = _optional_import("mdshare", "utils.download_wrapper")
75tarfile = _optional_import("tarfile")
78##############################################################################
79# Globals
80##############################################################################
83__all__ = ["Repository"]
86##############################################################################
87# Functions
88##############################################################################
91def gen_dict_extract(key, var):
92 """Copied from hexerei software's solution for nested dicts:
94 Finds the value of a key anywhere in a nested dict.
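Example:
    Illustrative only; the nested dict below is made up:

    >>> d = {"a": {"b": {"file": "traj.xtc"}}, "c": [{"file": "top.gro"}]}
    >>> sorted(gen_dict_extract("file", d))
    ['top.gro', 'traj.xtc']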
96 """
97 if hasattr(var, "items"):
98 for k, v in var.items():
99 if k == key:
100 yield v
101 if isinstance(v, dict):
102 for result in gen_dict_extract(key, v):
103 yield result
104 elif isinstance(v, list):
105 for d in v:
106 for result in gen_dict_extract(key, d):
107 yield result
110def find_mime_type(d, mime_type):
111 """Thanks to KobeJohn
113 https://stackoverflow.com/questions/22162321/search-for-a-value-in-a-nested-dictionary-python
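Example:
    Illustrative only; the nested dict mimics the repository.yaml layout:

    >>> d = {"proj1": {"trajectory": {"file": "traj.xtc"}}}
    >>> find_mime_type(d, "traj.xtc")
    ['proj1', 'trajectory', 'file']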
115 """
116 reverse_linked_q = list()
117 reverse_linked_q.append((list(), d))
118 while reverse_linked_q:
119 this_key_chain, this_v = reverse_linked_q.pop()
120 # finish search if found the mime type
121 if this_v == mime_type:
122 return this_key_chain
123 # not found. keep searching
124 # queue dicts for checking / ignore anything that's not a dict
125 try:
126 items = this_v.items()
127 except AttributeError:
128 continue # this was not a nested dict. ignore it
129 for k, v in items:
130 reverse_linked_q.append((this_key_chain + [k], v))
131 # if we haven't returned by this point, we've exhausted all the contents
132 raise KeyError
135def sizeof_fmt(num, suffix="B"):
136 """Thanks to Fred Cirera and Sridhar Ratnakumar
138 https://stackoverflow.com/questions/1094841/get-human-readable-version-of-file-size
139 https://web.archive.org/web/20111010015624/http://blogmag.net/blog/read/38/Print_human_readable_file_size
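Example:
    The input value is arbitrary and only demonstrates the formatting:

    >>> sizeof_fmt(123456789)
    '117.7MiB'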
141 """
142 for unit in ["", "Ki", "Mi", "Gi", "Ti", "Pi", "Ei", "Zi"]:
143 if abs(num) < 1024.0:
144 return "%3.1f%s%s" % (num, unit, suffix)
145 num /= 1024.0
146 return "%.1f%s%s" % (num, "Yi", suffix)
149##############################################################################
150# Classes
151##############################################################################
154class Repository:
155 """Main Class to work with Repositories of MD data and download the data.
157 This class handles the download of files from a repository source. All
158 data are obtained from a .yaml file (default at data/repository.yaml), which
159 contains trajectory files and topology files organized in a readable manner.
160 With this class the repository.yaml file can be queried using unix-like file
161 patterns. Files can be downloaded on-the-fly (if they already exist, they won't
162 be downloaded again). Besides single files, full projects can be downloaded and rebuilt.
164 Attributes:
165 current_path (str): Path of the .py file containing this class.
166 If no working directory is given (None), all files will be
167 downloaded to a directory named 'data' (created if needed), which will
168 be placed in the directory of this .py file.
169 url (str): The url to the current repo source.
170 maintainer (str): The maintainer of the current repo source.
171 files_dict (dict): A dictionary summarizing the files in this repo.
172 dict keys are built from `'project_name' + 'filetype'`. So for a
173 project called 'protein_sim', possible keys are 'protein_sim_trajectory',
174 'protein_sim_topology', 'protein_sim_log'. The values of these keys
175 are all str and they give the actual filename of the files. If 'protein_sim'
176 was conducted with GROMACS, these files would be 'traj_comp.xtc', 'confout.gro'
177 and 'md.log'.
178 files (list): Just a list of str of all downloadable files.
179 data (dict): The main organization of the repository. This is the complete
180 .yaml file as it was read and returned by pyyaml.
182 Examples:
183 >>> import encodermap as em
184 >>> repo = em.Repository()
185 >>> print(repo.search('*PFFP_sing*')) # doctest: +SKIP
186 {'PFFP_single_trajectory': 'PFFP_single.xtc', 'PFFP_single_topology': 'PFFP_single.gro', 'PFFP_single_input': 'PFFP.mdp', 'PFFP_single_log': 'PFFP.log'}
187 >>> print(repo.url)
188 http://134.34.112.158
190 """
192 def __init__(
193 self,
194 repo_source="data/repository.yaml",
195 checksum_file="data/repository.md5",
196 ignore_checksums=False,
197 debug=True,
198 ):
199 """Initialize the repository,
201 Args:
202 repo_source (str): The source .yaml file to build the repository from.
203 Defaults to 'data/repository.yaml'.
204 checksum_file (str): A file containing the md5 hash of the repository file.
205 This ensures no one tampers with the repository.yaml file and injects
206 malicious code. Defaults to 'data/repository.md5'.
207 ignore_checksums (bool): If you want to ignore the checksum check of
208 the repo_source file, set this to True. Can be useful for
209 developing, when the repository.yaml file undergoes a lot of changes.
210 Defaults to False.
211 debug (bool, optional): Whether to print debug info. Defaults to True.
213 """
214 # this will point to this file, no matter where it is (venv, etc.)
215 self.current_path = os.path.split(inspect.getfile(inspect.currentframe()))[0]
216 self.debug = debug
218 # with that the source files can be defined
219 repo_source = os.path.join(self.current_path, repo_source)
220 checksum_file = os.path.join(self.current_path, checksum_file)
222 # check the hash sum of the repo.yml file
223 if checksum_file is not None and not ignore_checksums:
224 with open(checksum_file, "r") as fh:
225 if hash_files(repo_source)["repository.yaml"]["md5"] != fh.read():
226 raise RuntimeError(
227 "Checksums do not match, check your catalogue files!"
228 )
230 # read the repo.yml file
231 with open(repo_source, "r") as f:
232 self.data = yaml_load(f, Loader=Loader)
234 # define variables based on that
235 self.url = self.data["url"]
236 self.maintainer = (
237 self.data["maintainer"]["name"] + ", " + self.data["maintainer"]["email"]
238 )
239 self.projects = self.data["projects"]
240 self._connection = None
241 self.files_dict = {}
242 for dataset in self.datasets:
243 for filetype in self.data[dataset]:
244 if filetype == "credit":
245 continue
246 self.files_dict[f"{dataset}_{filetype}"] = self.data[dataset][filetype][
247 "file"
248 ]
249 self.files = list(self.files_dict.values())
251 @property
252 def catalogue(self):
253 """dict: Returns the underlying catalogue data."""
254 return self.data
256 def print_catalogue(self):
257 """Prints the catalogue nicely formatted."""
258 print(self.__str__())
260 @property
261 def projects(self):
262 """dict: A dictionary containing project names and their associated files.
263 Projects are larger collections of individual sims that belong together.
264 The project names are the dictionary's keys, the files are given as lists
265 in the dict's values.
267 """
268 return self._projects
270 @projects.setter
271 def projects(self, projects):
272 self._projects = {}
273 for item in self.data["projects"]:
274 if isinstance(self.data["projects"][item], list):
275 self._projects[item] = self.data["projects"][item]
276 elif isinstance(self.data["projects"][item], dict):
277 _ = []
278 for key, value in self.data["projects"][item].items():
279 if key == "type":
280 continue
281 _.extend(value)
282 self._projects[item] = _
283 else:
284 raise ValueError(f"Wrong type in projects: {item}: {type(self.data['projects'][item])}")
286 @property
287 def datasets(self):
288 """set: A set of datasets in this repository. A dataset can either be
289 characterized by a set of trajectory-, topology-, log- and input-file
290 or a dataset is a .tar.gz container, which contains all necessary files.
292 """
293 return set(self.data.keys()).difference(
294 set(["name", "url", "maintainer", "projects"])
295 )
297 def search(self, pattern):
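"""Searches the repository with a unix-like filename pattern.

Args:
    pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
        list of such patterns. Patterns are matched against both the keys
        of `self.files_dict` and the filenames themselves.

Returns:
    dict: A dict of matching `'project_name' + '_' + 'filetype'`: filename pairs.

Example:
    Illustrative pattern; output depends on the repository contents:

    >>> import encodermap as em
    >>> em.Repository().search('*PFFP_sing*')  # doctest: +SKIP

"""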
298 out = {}
299 if isinstance(pattern, list):
300 _ = [self.search(i) for i in pattern]
301 return dict(chain.from_iterable(map(methodcaller("items"), _)))
302 for key, item in self.files_dict.items():
303 if fnmatch.fnmatch(key, pattern) or fnmatch.fnmatch(item, pattern):
304 out[key] = item
305 return out
307 def load_project(
308 self,
309 project,
310 working_directory=None,
311 overwrite=False,
312 max_attempts=3,
313 makdedir=False,
314 progress_bar=True,
315 ):
316 """This will return `TrajEnsemble` / `SingleTraj` objects that are correctly formatted.
318 This method allows one to directly rebuild projects from the repo source,
319 using encodermap's own `SingleTraj` and `TrajEnsemble` classes.
321 Args:
322 project (str): The name of the project to be loaded. See
323 Repository.projects.keys() for a list of projects.
324 working_directory (Union[str, None], optional): Can be a string pointing to a directory to save the
325 files in. Can also be None. In that case `self.current_path` + `'/data'` (retrieved via
326 `inspect.getfile(inspect.currentframe())`) will be used to save the files. If
327 the files are already there and `overwrite` is False, the file paths are simply returned.
328 Defaults to None.
329 overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
330 max_attempts (int, optional): Number of download attempts. Defaults to 3.
331 makdedir (bool, optional): Whether to create `working_directory` if it does not already exist.
332 Defaults to False.
333 progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar. Defaults to True.
335 Returns:
336 Union[encodermap.SingleTraj, encodermap.TrajEnsemble]: The project already loaded into encodermap's
337 `SingleTraj` or `TrajEnsemble` classes.
339 Examples:
340 >>> import encodermap as em
341 >>> repo = em.Repository()
342 >>> trajs = repo.load_project('Tetrapeptides_Single')
343 >>> print(trajs)
344 encodermap.TrajEnsemble object. Current backend is no_load. Containing 2 trajs. Common str is ['PFFP', 'FPPF']. Not containing any CVs.
345 >>> print(trajs.n_trajs)
346 2
348 """
349 if isinstance(self.data["projects"][project], dict):
350 common_strings = list(
351 filter(
352 lambda x: False if x == "type" else True,
353 list(self.data["projects"][project].keys()),
354 )
355 )
356 traj_files = [
357 file
358 for cs in common_strings
359 for file in self.data["projects"][project][cs][:-1]
360 ]
361 top_files = [
362 self.data["projects"][project][cs][-1] for cs in common_strings
363 ]
364 if self.data["projects"][project]["type"] == "files":
365 files, directory = self.fetch(
366 traj_files + top_files,
367 working_directory=working_directory,
368 overwrite=overwrite,
369 max_attempts=max_attempts,
370 makdedir=makdedir,
371 progress_bar=progress_bar,
372 )
373 elif self.data["projects"][project]["type"] == "container":
374 pattern = project + ".tar.gz"
375 files, directory = self.fetch(
376 pattern,
377 working_directory=working_directory,
378 overwrite=overwrite,
379 max_attempts=max_attempts,
380 makdedir=makdedir,
381 progress_bar=progress_bar,
382 )
383 else:
384 raise Exception(
385 f"Unknown type of project: {self.data['projects'][project]['type']}. `type` needs to be either 'files' or 'container'."
386 )
387 traj_files = [os.path.join(directory, i) for i in traj_files]
388 top_files = [os.path.join(directory, i) for i in top_files]
389 return TrajEnsemble(traj_files, top_files, common_str=common_strings)
390 else:
391 files, directory = self.fetch(
392 self.projects[project],
393 working_directory=working_directory,
394 overwrite=overwrite,
395 max_attempts=max_attempts,
396 makdedir=makdedir,
397 progress_bar=progress_bar,
398 )
399 return SingleTraj(files[0], files[1])
401 def lookup(self, file):
402 """Piece of code to allow some compatibility to mdshare.
404 The complete `self.data` dictionary will be traversed to find
405 `file` and its location in the `self.data` dictionary. This will be
406 used to get the filesize and its md5 hash. The returned tuple also tells
407 whether the file is a .tar.gz container or not. In the case of a container,
408 the container needs to be extracted using tarfile.
410 Args:
411 file (str): The file to search for.
413 Returns:
414 tuple: A tuple containing the following:
415 str: A string that is either 'containers' or 'index' (for normal files).
416 dict: A dict of the form dict(file=filename, hash=filehash, size=filesize).
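Example:
    Hypothetical call; hash and size values are placeholders:

    >>> repo.lookup('PFFP_single.xtc')  # doctest: +SKIP
    ('index', {'file': 'PFFP_single.xtc', 'hash': '...', 'size': ...})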
418 """
419 simulation, filetype, _ = find_mime_type(self.data, file)
420 out = dict(
421 file=self.data[simulation][filetype]["file"],
422 hash=self.data[simulation][filetype]["md5"],
423 size=self.data[simulation][filetype]["size"],
424 )
425 if filetype == "container":
426 return "containers", out
427 else:
428 return "index", out
430 def _get_connection(self):
431 """Also compatibility with mdshare"""
432 if self._connection is None:
433 self._connection = requests.session()
434 return self._connection
436 @staticmethod
437 def _split_proj_filetype(proj_filetype):
438 """Splits the strings that index the self.datasets dictionary."""
439 if proj_filetype.count("_") == 1:
440 return proj_filetype.split("_")
441 else:
442 substrings = proj_filetype.split("_")
443 return "_".join(substrings[:-1]), substrings[-1]
445 def get_sizes(self, pattern):
446 """Returns a list of file-sizes of a given pattern.
448 Args:
449 pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
450 list of files (['traj_1.xtc', 'traj_2.xtc']).
452 Returns:
453 list: A list of filesizes in bytes.
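Example:
    Illustrative pattern; the returned sizes depend on the repository:

    >>> repo.get_sizes('*PFFP_sing*')  # doctest: +SKIP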
455 """
456 sizes = []
457 for proj_filetype, file in self.search(pattern).items():
458 project, filetype = Repository._split_proj_filetype(proj_filetype)
459 size = self.data[project][filetype]["size"]
460 sizes.append(size)
461 return sizes
463 def stack(self, pattern):
464 """Creates a stack to prepare for downloads.
466 Args:
467 pattern (Union[str, list]): A unix-like pattern ('traj*.xtc') or a
468 list of files (['traj_1.xtc', 'traj_2.xtc']).
470 Returns:
471 list: A list of dicts. Each dict contains filename, size and a boolean
472 value telling whether the downloaded file needs to be extracted
473 after downloading.
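Example:
    Illustrative pattern and placeholder values:

    >>> repo.stack('*PFFP_sing*')  # doctest: +SKIP
    [{'file': 'PFFP_single.xtc', 'size': ..., 'unpack': False}, ...]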
475 """
476 stack = []
477 sizes = self.get_sizes(pattern)
478 for (proj_filetype, file), size in zip(self.search(pattern).items(), sizes):
479 project, filetype = Repository._split_proj_filetype(proj_filetype)
480 unpack = filetype == "container"
481 stack.append(dict(file=file, size=size, unpack=unpack))
482 return stack
484 def fetch(
485 self,
486 remote_filenames,
487 working_directory=None,
488 overwrite=False,
489 max_attempts=3,
490 makdedir=False,
491 progress_bar=True,
492 ):
493 """This fetches a singular file from self.files.
495 Displays also progress bar with the name of the file. Uses requests.
497 Args:
498 remote_filename (str): The name of the remote file. Check `self.files` for more info.
499 working_directory (Union[str, None], optional): Can be a string pointing to a directory to save the
500 files in. Can also be None. In that case `self.current_path` + `'/data'` (retrieved via
501 `inspect.getfile(inspect.currentframe())`) will be used to save the files. If
502 the files are already there and `overwrite` is False, the file paths are simply returned.
503 Defaults to None.
504 overwrite (bool, optional): Whether to overwrite local files. Defaults to False.
505 max_attempts (int, optional): Number of download attempts. Defaults to 3.
506 makdedir (bool, optional): Whether to create `working_directory` if it does not already exist.
507 Defaults to False.
508 progress_bar (bool, optional): Uses the package progress-reporter to display a progress bar. Defaults to True.
510 Returns:
511 tuple: A tuple containing the following:
512 list: A list of files that have just been downloaded.
513 str: A string leading to the directory the files have been downloaded to.
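Example:
    A minimal sketch; the pattern and target directory are illustrative:

    >>> files, directory = repo.fetch('*PFFP_sing*', working_directory='data', makdedir=True)  # doctest: +SKIP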
515 """
516 # import progress-reporter
517 try:
518 import progress_reporter
520 have_progress_reporter = True
521 except ImportError:
522 if self.debug:
523 print(
524 "Downloading files without progress bar. Run `pip install progress-reporter` to use this feature."
525 )
526 have_progress_reporter = False
528 # find files to import
529 stack = self.stack(remote_filenames)
531 # define the filename
532 if working_directory is None:
533 working_directory = os.path.join(self.current_path, "data")
534 if isinstance(working_directory, str):
535 local_filenames = [
536 os.path.join(working_directory, s["file"]) for s in stack
537 ]
538 if not os.path.isdir(working_directory):
539 if makdedir:
540 os.makedirs(working_directory)
541 else:
542 raise FileNotFoundError(
543 errno.ENOENT, os.strerror(errno.ENOENT), working_directory
544 )
545 else:
546 raise ValueError(
547 f"Type of argument `working_directory` needs to be either `None` or `str`, you provided {type(working_directory)}"
548 )
550 # split .tar.gz and regular files
551 # and check what already exists
552 local_containers = [f for f in local_filenames if f.endswith(".tar.gz")]
553 local_containers = list(map(lambda x: x.split(".")[0], local_containers))
554 local_files = [f for f in local_filenames if not f.endswith(".tar.gz")]
555 result = []
556 if local_files:
557 if all([os.path.isfile(lf) for lf in local_files]) and not overwrite:
558 if self.debug:
559 print(
560 f"Files '{local_files}' already exists. Set `overwrite` to `True` to download them again."
561 )
562 result.extend(local_files)
563 elif any([os.path.isfile(lf) for lf in local_files]) and not overwrite:
564 existing_files = glob.glob(os.path.join(working_directory, "*"))
565 existing_files = [os.path.split(i)[-1] for i in existing_files]
566 missing_files = list(
567 set([i["file"] for i in stack]).difference(set(existing_files))
568 )
569 result.extend(
570 [
571 os.path.join(working_directory, i)
572 for i in existing_files
573 if fnmatch.fnmatch(i, remote_filenames)
574 ]
575 )
576 if self.debug:
577 print(
578 f"{len(stack) - len(missing_files)} Files already exist. I will only download '{missing_files}'. Set `overwrite` to `True` to download all files again."
579 )
580 stack = list(
581 filter(
582 lambda x: True if x["file"] in missing_files else False, stack
583 )
584 )
585 if local_containers:
586 if all([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
587 if self.debug:
588 print(
589 f"Directories '{local_containers}' already exists. Set `overwrite` to `True` to download them again."
590 )
591 result.extend(local_containers)
592 elif any([os.path.isdir(lc) for lc in local_containers]) and not overwrite:
593 existing_directories = glob.glob(os.path.join(working_directory, "*/"))
594 existing_directories = [
595 os.path.split(os.path.split(i)[0])[-1] for i in existing_directories
596 ]
597 missing_directories = list(
598 set([i["file"].split(".")[0] for i in stack]).difference(
599 set(existing_directories)
600 )
601 )
602 result.extend(
603 [
604 os.path.join(working_directory, i)
605 for i in existing_directories
606 if fnmatch.fnmatch(i + ".tar.gz", remote_filenames)
607 ]
608 )
609 if self.debug:
610 print(
611 f"{len(stack) - len(missing_directories)} Directories already exist. I will only download '{missing_directories}'. Set `overwrite` to `True` to download all files again."
612 )
613 stack = list(
614 filter(
615 lambda x: True
616 if x["file"].split(".")[0] in missing_directories
617 else False,
618 stack,
619 )
620 )
621 if len(result) == len(local_filenames):
622 return result, working_directory
624 # instantiate ProgressBars
625 if have_progress_reporter and progress_bar:
626 callbacks = []
627 pg = progress_reporter.ProgressReporter_()
628 total = sum(item["size"] for item in stack)
630 def update(n, blk, stage):
631 downloaded = n * blk
632 inc = max(0, downloaded - pg._prog_rep_progressbars[stage].n)
633 pg.update(inc, stage=stage)
634 # total progress
635 try:
636 pg.update(inc, stage=-1)
637 except RuntimeError:
638 pass
640 from functools import partial
642 tqdm_args = dict(unit="B", file=sys.stdout, unit_scale=True, position=0)
644 n_progress_bars = 0
645 for stage, item in enumerate(stack):
646 if working_directory is not None:
647 path = os.path.join(working_directory, item["file"])
648 if os.path.exists(path) and not overwrite:
649 callbacks.append(None)
650 else:
651 pg.register(
652 item["size"],
653 description=f'downloading {item["file"]}',
654 tqdm_args=tqdm_args,
655 stage=stage,
656 )
657 callbacks.append(partial(update, stage=stage))
658 n_progress_bars += 1
659 if n_progress_bars > 1:
660 pg.register(total, description="total", tqdm_args=tqdm_args, stage=-1)
661 else:
662 from unittest.mock import MagicMock
664 pg = MagicMock()
665 callbacks = [None] * len(stack)
667 # download and unpack
668 result = []
669 with pg.context():
670 for item, progress in zip(stack, callbacks):
671 file = download_wrapper(
672 self,
673 item["file"],
674 working_directory=working_directory,
675 max_attempts=max_attempts,
676 force=overwrite,
677 callback=progress,
678 )
679 if item["unpack"]:
681 def inspect(members):
682 for member in members:
683 path, filename = os.path.split(member.name)
684 if path == "":
685 yield member, filename
687 with tarfile.open(file, "r:gz") as fh:
688 members = []
689 for i, (member, filename) in enumerate(inspect(fh)):
690 members.append(member)
691 result.append(os.path.join(working_directory, filename))
692 fh.extractall(path=working_directory, members=members)
693 os.remove(file)
694 result.append(file.split(".")[0])
712 else:
713 result.append(file)
715 return result, working_directory
717 def __str__(self):
718 string = f"Repository: {self.url}\n"
719 string += f"Maintainer: {self.maintainer}\n"
720 for dataset in self.datasets:
721 string += f" Dataset: {dataset}\n"
722 for filetype in self.data[dataset]:
723 if filetype == "credit":
724 try:
725 string += f" Author: {self.data[dataset][filetype]['author']}, {self.data[dataset][filetype]['email']}\n"
726 except KeyError:
727 string += (
728 f" Author: {self.data[dataset][filetype]['author']}\n"
729 )
730 continue
731 try:
732 substr = f" {filetype.capitalize()} File: {self.data[dataset][filetype]['file']}"
733 string += f"{substr:<50}{sizeof_fmt(self.data[dataset][filetype]['size'])}\n"
734 except KeyError:
735 print("Could not build summary string")
736 print(filetype)
737 print(type(filetype))
738 raise
739 return string