Source code for ncempy.io.emd

"""
This module provides an interface to the Berkeley EMD file format.

See https://emdatasets.com/ for more details.

* It fully supports v0.2 for reading and writing

Note
----
General users:
    Use the simplified emd.emdReader() function to load the data and meta
    data as a python dictionary.

Advanced users and developers:
    Access the file internals through the emd.fileDEMD() class.

"""

from pathlib import Path
import datetime

import numpy as np
import h5py


[docs]class fileEMD:
    """Class to represent Berkeley EMD files.

    Implemented for spec 0.2 using the recommended layout for metadata.

    Attributes
    ----------
    file_name : str
        The name of the file
    file_path : pathlib.Path
        A pathlib.Path object for the open file
    file_hdl : h5py.File
        The h5py file handle which provides direct access to the underlying h5py file structure.
    version : tuple
        A 2-tuple of the major and minor EMD version as ints.
    data : h5py.Group
        A link to the data group in the EMD file.
    microscope : h5py.Group
        A link to the microscope EMD group with metadata.
    sample : h5py.Group
        A link to the sample EMD group with metadata.
    user : h5py.Group
        A link to the user EMD group with metadata.
    comments : h5py.Group
        A link to the comments in this EMD file. A time stamp is included with the comment.
    list_emds : list
        A list of the EMD groups in the file.

    Examples
    --------
    Open an Berkeley EMD file using the *advanced* interface. See the emdReader function for a more convenient
    way to load the data. We show how to list the available data sets, load a 3D data set and plot the first image.
    >> from ncempy.io import emd
    >> with emd.fileEMD('filename.emd') as emd1:
    >>     [print(dataGroup.name) for dataGroup in emd1.list_emds] # print all available EMD datasets
    >>     data1, dims1 = emd1.get_emdgroup(0) # load the first full data array and dimension information
    """

    def __init__(self, filename, readonly=True):
        """Init opening/creating the file.

        Parameters
        ----------
        filename : str or pathlib.Path or file object
            The EMD file to open.
        readonly : bool, default True
            Set to False to allow writing to the file.

        """

        # necessary declarations in case something goes bad
        self.file_hdl = None

        # convenience handles to access the data in the emd file, everything can as well be accessed using the file_hdl
        self.version = None
        self.data = None
        self.microscope = None
        self.sample = None
        self.user = None
        self.comments = None
        self.list_emds = []  # list of HDF5 groups with emd_data_type type 1

        if hasattr(filename, 'read'):
            try:
                self.file_path = Path(filename.name)
                self.file_name = self.file_path.name
            except AttributeError:
                self.file_path = None
                self.file_name = None
        else:
            # check filename type, change to pathlib.Path
            if isinstance(filename, str):
                filename = Path(filename)
            elif isinstance(filename, Path):
                pass
            else:
                raise TypeError('Filename is supposed to be a string or pathlib.Path or file object')
            self.file_path = filename
            self.file_name = self.file_path.name

        # try opening the file
        if readonly:
            try:
                self.file_hdl = h5py.File(filename, 'r')
            except:
                print('Error opening file for readonly: "{}"'.format(filename))
                raise
        else:
            try:
                self.file_hdl = h5py.File(filename, 'a')
            except:
                print('Error opening file for read/write: "{}"'.format(filename))
                raise

        # if we got a working file
        if self.file_hdl:
            # check version information
            if 'version_major' in self.file_hdl.attrs and 'version_minor' in self.file_hdl.attrs:
                # read version information
                self.version = (self.file_hdl.attrs['version_major'], self.file_hdl.attrs['version_minor'])
                # compare to implementation
                if not self.version == (0, 2):
                    print('WARNING: You are reading a version {}.{} EMD file, '
                          'this implementation assumes version 0.2!'.format(self.version[0], self.version[1]))
            else:
                # set version information
                if not readonly:
                    self.file_hdl.attrs['version_major'] = 0
                    self.file_hdl.attrs['version_minor'] = 2

            # check for data group
            if 'data' not in self.file_hdl:
                if not readonly:
                    self.data = self.file_hdl.create_group('data')
            else:
                self.data = self.file_hdl['data']

            # check for data group
            if 'microscope' not in self.file_hdl:
                if not readonly:
                    self.microscope = self.file_hdl.create_group('microscope')
            else:
                self.microscope = self.file_hdl['microscope']

            # check for data group
            if 'sample' not in self.file_hdl:
                if not readonly:
                    self.sample = self.file_hdl.create_group('sample')
            else:
                self.sample = self.file_hdl['sample']

            # check for data group
            if 'user' not in self.file_hdl:
                if not readonly:
                    self.user = self.file_hdl.create_group('user')
            else:
                self.user = self.file_hdl['user']

            # check for data group
            if 'comments' not in self.file_hdl:
                if not readonly:
                    self.comments = self.file_hdl.create_group('comments')
            else:
                self.comments = self.file_hdl['comments']

            # find emd_data_type groups in the file
            self.list_emds = self.find_emdgroups(self.file_hdl)

    def __del__(self):
        """Destructor for EMD file object.

        """
        # close the file
        # if(not self.file_hdl.closed):
        self.file_hdl.close()

    def __enter__(self):
        """Implement python's with statement

        """
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Implement python's with statement
        and close the file via __del__()
        """
        self.__del__()
        return None

[docs]    def find_emdgroups(self, parent):
        """Find all emd_data_type groups within the group parent and return a list of references to their HDF5 groups.

        Parameters
        ----------
            parent: h5py.Group
                Handle to the parent group.

        Returns
        -------
            : list
                A list of h5py.Group handles to children groups
                being emd_data_type groups.

        """

        emds0 = []

        # recursive function to run and retrieve groups with emd_group_type set to 1
        def proc_group(group, emds):
            # take a look at each item in the group
            for item in group:
                # check if group
                if group.get(item, getclass=True) == h5py.Group:
                    item = group.get(item)
                    # check if emd_group_type
                    if 'emd_group_type' in item.attrs and 'data' in item.keys():
                        if item.attrs['emd_group_type'] == 1:
                            emds.append(item)
                    # process subgroups
                    proc_group(item, emds)

        # run
        proc_group(parent, emds0)

        return emds0

[docs]    def get_emddims(self, group):
        """Get the emdtype dimensions saved in in group.

        Parameters
        ----------
            group: h5py.Group
                Reference to the emdtype HDF5 group.

        Returns
        -------
            : tuple
                List of dimension vectors plus labels and units.

        """
        # get the dims
        dims = []
        for ii in range(group['data'].ndim):
            dim = group['dim{}'.format(ii + 1)]
            # save them as (vector, name, units)

            if 'name' in dim.attrs:
                if isinstance(dim.attrs['name'], np.ndarray):
                    name = dim.attrs['name'][0]
                else:
                    name = dim.attrs['name']
            else:
                name = 'dim{}'.format(ii + 1)

            if 'units' in dim.attrs:
                if isinstance(dim.attrs['units'], np.ndarray):
                    units = dim.attrs['units'][0]
                else:
                    units = dim.attrs['units']
            else:
                units = 'pixels'

            # Handle bytes objects by decoding them to strings
            # If something goes wrong, the original attribute is left as-is
            try:
                if isinstance(name, bytes):
                    name = name.decode('utf-8')
                if isinstance(units, bytes):
                    units = units.decode('utf-8')
            except:
                pass

            dims.append((dim[:], name, units))

        dims = tuple(dims)
        return dims

[docs]    def get_emdgroup(self, group):
        """Get the emd data saved in the requested group.

        Note
        ____
        The memmap keyword has been removed. Please use get_memmap().

        Parameters
        ----------
            group: h5py._hl.group.Group or int
                Reference to the HDF5 group to load. If int is used then the item corresponding to self.list_emds
                is loaded

        Returns
        -------
            : tuple or None
                None or tuple containing:
                    : np.ndarray
                        The data of the emdtype group.
                    : list
                        List of [0] dimension vectors, [1] labels and [2] units.
        """

        # check input
        if not isinstance(group, h5py.Group):
            if isinstance(group, int):
                try:
                    group = self.list_emds[group]
                except IndexError:
                    print('group does not exist')
                    return
            else:
                raise TypeError('group needs to refer to a valid HDF5 group!')

        if 'emd_group_type' not in group.attrs:
            raise TypeError('group is not a emd_group_type group!')
        if not group.attrs['emd_group_type'] == 1:
            raise TypeError('group is not a emd_group_type group!')

        # retrieve data and dims
        try:
            data = group['data'][:]
            dims = self.get_emddims(group)

            return data, dims
        except:
            # if something goes wrong, return None
            print('Content of "{}" does not seem to be in emd specified shape'.format(group.name))

            return None

[docs]    def get_memmap(self, group):
        """ Opens a new fileEMD object and returns the requested EMD group. This prevents the file from being closed.
        The original HDF5 file is left open. The file pointer is lost, but it will be closed once the EMD group
        is deleted or removed. See also get_emdgroup().


        Parameters
        ----------
        group: h5py._hl.group.Group or int
                Reference to the HDF5 group to load. If int is used then the item corresponding to self.list_emds
                is loaded

        """
        f = h5py.File(self.file_hdl.filename, 'r')

        # check input
        if not isinstance(group, h5py.Group):
            if isinstance(group, int):
                try:
                    group = self.list_emds[group]
                except IndexError:
                    print('group does not exist')
                    return
            else:
                raise TypeError('group needs to refer to a valid HDF5 group!')
        data = f[group.name + '/data']
        return data, self.get_emddims(group)

[docs]    def write_dim(self, label, dim, parent):
        """Auxiliary function to write a dim dataset to parent.

        Input is not checked for sanity, so handle exceptions in call.

        Parameters
        ----------
            label: str
                Label for dataset, usually dim1, dim2, dimN.
            dim: tuple
                Tuple containing (data, name, units).
            parent: h5py.Group
                HDF5 handle to parent group.

        Returns
        -------
            : h5py.Group
                HDF5 dataset handle referencing this dim.

        """

        try:
            dset = parent.create_dataset(label, data=dim[0])
            dset.attrs['name'] = np.string_(dim[1])
            dset.attrs['units'] = np.string_(dim[2])
        except:
            raise RuntimeError('Error during writing dim dataset')

        return dset

[docs]    def put_emdgroup(self, label, data, dims, parent=None, overwrite=False, **kwargs):
        """Put an emdtype dataset into the EMD file.

        Parameters
        ----------
            label: str
                Label for the emdtype group containing the dataset.
            data: np.ndarray
                Numpy array containing the data.
            dims: tuple
                Tuple containing the necessary dims as ((vec, name, units), (vec, name, units), ...)
            parent: h5py.Group or None
                Parent for the emdtype group, if None it will be written to /data.
            overwrite: bool
                Set to force overwriting entry in EMD file.
            **kwargs: various
                Keyword arguments to be passed to h5py.create_dataset(), e.g. for compression.

        Returns
        -------
            : h5py.Group or None
                Group referencing this emdtype dataset or None if failed.
        """

        # check input
        if not isinstance(label, str):
            raise TypeError('label needs to be string!')

        if not isinstance(data, np.ndarray):
            raise TypeError('data needs to be a numpy.ndarray!')

        try:
            assert len(dims) == len(data.shape)
            for i in range(len(dims)):
                assert len(dims[i]) == 3
                assert dims[i][0].shape[0] == data.shape[i]
        except:
            raise TypeError('Something wrong with the provided dims')

        # write stuff to HDF5

        # create group
        try:
            if parent:
                if label in parent:
                    if overwrite:
                        print('overwriting "{}" in "{}"'.format(label, parent.name))
                        del parent[label]
                    else:
                        print('"{}" already exists in "{}"'.format(label, parent.name))
                        raise RuntimeError('"{}" already exists in "{}"'.format(label, parent.name))
                grp = parent.create_group(label)

            else:
                if label in self.data:
                    if overwrite:
                        print('overwriting "{}" in "{}"'.format(label, self.data.name))
                        del self.data[label]
                    else:
                        print('"{}" already exists in "{}"'.format(label, self.data.name))
                        raise RuntimeError('"{}" already exists in "{}"'.format(label, self.data.name))

                grp = self.data.create_group(label)

            # add attribute
            grp.attrs['emd_group_type'] = 1

            # create dataset
            _ = grp.create_dataset('data', data=data, **kwargs)

            # create dim datasets
            for i in range(len(dims)):
                self.write_dim('dim{}'.format(i + 1), dims[i], grp)

            # update emds list
            self.list_emds = self.find_emdgroups(self.file_hdl)

            return grp

        except:
            print('Something went wrong trying to write the dataset.')

            return None

[docs]    def put_comment(self, msg, timestamp=None):
        """Create a comment in the EMD file.

        If timestamp already exists, the msg is appended to existing comment.

        Parameters
        ----------
            msg: str
                String of the message to save.
            timestamp: str/None
                Timestamp used as the key, defaults to the current UTC time.

        """

        # check input
        if not isinstance(msg, str):
            raise TypeError('msg needs to be a string!')

        # create timestamp if missing
        if not timestamp:
            timestamp = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S (UTC)')
        else:
            # try to convert given timestamp to string
            try:
                timestamp = str(timestamp)
            except:
                raise

        # write comment
        if timestamp in self.comments.attrs:
            # append to existing
            self.comments.attrs[timestamp] += np.string_('\n' + msg)

        else:
            # create new entry
            self.comments.attrs[timestamp] = np.string_(msg)
        

[docs]def defaultDims(data, pixel_size=None):
    """ A helper function that can generate a properly setup dim tuple
    with default values to allow quick writing of EMD files without
    the need to create these dim vectors manually.

    Parameters
    ----------
        data : ndarray
            The data that will be written to the EMD file. This is used to get the number of dims and their shape

        pixel_size : tuple, optional
            A tuple of pixel sizes. Must have same length as the number of dimensions of data.

    Returns
    -------
        dims: list
            A properly formatted tuple of dim vectors used as input
            to emd.emdFile.put_emdgroup()
    """
    if isinstance(pixel_size, np.ndarray):
        raise Exception('pixel_size must be a tuple')

    num = data.ndim

    if not pixel_size:
        pixel_size = (1,) * num

    if len(pixel_size) != data.ndim:
        raise ValueError('pixel_size and data dimensions must match')

    dims = []
    for ii in range(num):
        curDim = [np.linspace(0, data.shape[ii] - 1, data.shape[ii]) * pixel_size[ii],
                  'dim{}'.format(ii+1), 'unit{}'.format(ii+1)]
        dims.append(curDim)

    return dims


[docs]def emdReader(filename, dsetNum=0):
    """ A simple helper function to read in the data and metadata 
    in a structured format similar to the other ncempy readers.

    Note
    ----
        Note fully implemented yet. Work in progress.

    Parameters
    ----------
        filename : str or pathlib.Path
            The path to the file as a string.
        dsetNum : int
            The index of the data set to load.
            
    Returns
    -------
        : dict
            Data and metadata as a dictionary similar to other ncempy readers.

    Example
    -------
        Simply load all data and metadata from a data set in an EMD Velox file
            >> import ncempy.io as nio
            >> emd0 = nio.emd.emdReader('filename.emd', dsetNum = 0)

    """
    with fileEMD(filename, readonly=True) as emd0:
        d, dims = emd0.get_emdgroup(dsetNum)  # memmap must be false. File is closed
        out = {'data': d, 'filename': filename, 'pixelSize': []}

        # Get the group name
        name = emd0.list_emds[dsetNum].name.split('/')[-1]
        out['name'] = name

        for dim in dims:
            try:
                d = dim[0][1] - dim[0][0]
            except:
                d = 0
            out['pixelSize'].append(d)
        out['pixelUnit'] = [aa[2] for aa in dims]
        out['pixelName'] = [aa[1] for aa in dims]
        return out


[docs]def emdWriter(filename, data, pixel_size=None, overwrite=False):
    """ Simple method to write data to a file formatted as an EMD v0.2. The only possible metadata to write is the pixel
    size for each dimension. Use the emd.fileEMD() class for more complex operations. The file must not already exist.

    Parameters
    ----------
    filename : str or pathlib.Path
        The path and file name to write the data to.
    data : ndarray
        The data as an ndarray to write to the file.
    pixel_size : tuple
        A tuple with the same length as the number of dims in data. len(pixel_size) == data.ndim
    overwrite : boolean
        If file exists, overwrite it.
    """
    if isinstance(filename, str):
        filename = Path(filename)

    if pixel_size:
        try:
            assert len(pixel_size) == data.ndim
        except ValueError:
            raise ValueError('pixel_size length must match the number of dimensions of data.')

    # Delete the file if it exists and overwrite is true
    if filename.exists() and overwrite:
        filename.unlink()

    if not filename.exists():
        with fileEMD(filename, readonly=False) as emd0:
            # Setup the dims for the EMD file. Pixel size is set to 1 by default for each dimension
            dims0 = defaultDims(data, pixel_size=pixel_size)
            # Write the data to he emd file.
            emd0.put_emdgroup('converted', data, dims0)
    else:
        raise FileExistsError