"""
This module provides an interface to the Berkeley EMD file format.
See https://emdatasets.com/ for more details.
* It fully supports v0.2 for reading and writing
Note
----
General users:
Use the simplified emd.emdReader() function to load the data and meta
data as a python dictionary.
Advanced users and developers:
Access the file internals through the emd.fileEMD() class.
"""
from pathlib import Path
import datetime
import numpy as np
import h5py
class fileEMD:
    """Class to represent Berkeley EMD files.

    Implemented for spec 0.2 using the recommended layout for metadata.

    Attributes
    ----------
    file_name : str
        The name of the file.
    file_path : pathlib.Path
        A pathlib.Path object for the open file.
    file_hdl : h5py.File
        The h5py file handle which provides direct access to the underlying
        h5py file structure.
    version : tuple
        A 2-tuple of the major and minor EMD version as read from the file
        attributes.
    data : h5py.Group
        A link to the data group in the EMD file.
    microscope : h5py.Group
        A link to the microscope EMD group with metadata.
    sample : h5py.Group
        A link to the sample EMD group with metadata.
    user : h5py.Group
        A link to the user EMD group with metadata.
    comments : h5py.Group
        A link to the comments in this EMD file.
    list_emds : list
        A list of the EMD groups in the file.

    Examples
    --------
    Open a Berkeley EMD file using the *advanced* interface. See the
    emdReader function for a more convenient way to load the data. This
    lists the available data sets and loads the first one.

    >>> from ncempy.io import emd
    >>> with emd.fileEMD('filename.emd') as emd1:
    ...     for dataGroup in emd1.list_emds:
    ...         print(dataGroup.name)  # print all available EMD datasets
    ...     data1, dims1 = emd1.get_emdgroup(0)  # first data array and dims
    """

    # Root-level groups that are linked to (or created, when writable) on
    # opening; each one is exposed as an attribute of the same name.
    _ROOT_GROUPS = ('data', 'microscope', 'sample', 'user', 'comments')

    def __init__(self, filename, readonly=True):
        """Init opening/creating the file.

        Parameters
        ----------
        filename : str or pathlib.Path or file object
            The EMD file to open.
        readonly : bool, default True
            Set to False to allow writing to the file.

        Raises
        ------
        TypeError
            If filename is not a str, a pathlib.Path or an object with a
            ``read`` attribute.
        """
        # necessary declarations in case something goes bad; __del__ relies
        # on file_hdl existing even when opening fails
        self.file_hdl = None

        # convenience handles to access the data in the emd file, everything
        # can as well be accessed using the file_hdl
        self.version = None
        self.data = None
        self.microscope = None
        self.sample = None
        self.user = None
        self.comments = None
        self.list_emds = []  # list of HDF5 groups with emd_group_type 1

        if hasattr(filename, 'read'):
            # file-like object: a usable name is optional
            try:
                self.file_path = Path(filename.name)
                self.file_name = self.file_path.name
            except (AttributeError, TypeError):
                self.file_path = None
                self.file_name = None
        else:
            if isinstance(filename, str):
                filename = Path(filename)
            elif not isinstance(filename, Path):
                raise TypeError('Filename is supposed to be a string or pathlib.Path or file object')
            self.file_path = filename
            self.file_name = self.file_path.name

        # try opening the file
        if readonly:
            mode, mode_desc = 'r', 'readonly'
        else:
            mode, mode_desc = 'a', 'read/write'
        try:
            self.file_hdl = h5py.File(filename, mode)
        except Exception:
            print('Error opening file for {}: "{}"'.format(mode_desc, filename))
            raise

        # if we got a working file
        if self.file_hdl:
            attrs = self.file_hdl.attrs
            if 'version_major' in attrs and 'version_minor' in attrs:
                # read version information and compare to implementation
                self.version = (attrs['version_major'], attrs['version_minor'])
                if not self.version == (0, 2):
                    print('WARNING: You are reading a version {}.{} EMD file, '
                          'this implementation assumes version 0.2!'.format(self.version[0], self.version[1]))
            elif not readonly:
                # stamp a file without version information as v0.2
                attrs['version_major'] = 0
                attrs['version_minor'] = 2

            # link to the root groups; missing ones are only created when
            # the file is writable, otherwise the attribute stays None
            for grp_name in self._ROOT_GROUPS:
                if grp_name in self.file_hdl:
                    setattr(self, grp_name, self.file_hdl[grp_name])
                elif not readonly:
                    setattr(self, grp_name, self.file_hdl.create_group(grp_name))

            # find emd_group_type groups in the file
            self.list_emds = self.find_emdgroups(self.file_hdl)

    def __del__(self):
        """Destructor for EMD file object. Closes the file if it was opened."""
        # file_hdl is None (or missing) when opening failed; nothing to close
        hdl = getattr(self, 'file_hdl', None)
        if hdl is not None:
            hdl.close()

    def __enter__(self):
        """Implement python's with statement."""
        return self

    def __exit__(self, exception_type, exception_value, traceback):
        """Implement python's with statement and close the file via __del__()."""
        self.__del__()
        return None

    def find_emdgroups(self, parent):
        """Find all emd_data_type groups within the group parent and return
        a list of references to their HDF5 groups.

        Parameters
        ----------
        parent : h5py.Group
            Handle to the parent group.

        Returns
        -------
        : list
            A list of h5py.Group handles to children groups being
            emd_data_type groups (``emd_group_type`` attribute equal to 1 and
            a ``data`` member), in depth-first order.
        """
        found = []

        def proc_group(group):
            # take a look at each item in the group
            for key in group:
                # only groups are of interest; datasets are skipped
                if group.get(key, getclass=True) == h5py.Group:
                    child = group.get(key)
                    if 'emd_group_type' in child.attrs and 'data' in child.keys():
                        if child.attrs['emd_group_type'] == 1:
                            found.append(child)
                    # process subgroups
                    proc_group(child)

        proc_group(parent)
        return found

    @staticmethod
    def _read_text_attr(attrs, key, default):
        """Return attribute ``key`` as text, or ``default`` if it is absent."""
        if key not in attrs:
            return default
        value = attrs[key]
        # array-valued attributes: use the first element
        if isinstance(value, np.ndarray):
            value = value[0]
        # Handle bytes objects by decoding them to strings.
        # If decoding fails, the original attribute is left as-is.
        if isinstance(value, bytes):
            try:
                value = value.decode('utf-8')
            except UnicodeDecodeError:
                pass
        return value

    def get_emddims(self, group):
        """Get the emdtype dimensions saved in group.

        Parameters
        ----------
        group : h5py.Group
            Reference to the emdtype HDF5 group.

        Returns
        -------
        : tuple
            One ``(vector, name, units)`` tuple per dimension of
            ``group['data']``. A missing name defaults to ``'dimN'`` and
            missing units default to ``'pixels'``.
        """
        dims = []
        for ii in range(group['data'].ndim):
            label = 'dim{}'.format(ii + 1)
            dim = group[label]
            name = self._read_text_attr(dim.attrs, 'name', label)
            units = self._read_text_attr(dim.attrs, 'units', 'pixels')
            # save them as (vector, name, units)
            dims.append((dim[:], name, units))
        return tuple(dims)

    def _resolve_group(self, group):
        """Turn an index into ``list_emds`` or a group handle into a group.

        Returns None (after printing a message) if an integer index is out
        of range. Raises TypeError for anything that is neither an integer
        nor an h5py.Group.
        """
        if isinstance(group, (int, np.integer)):
            try:
                return self.list_emds[group]
            except IndexError:
                print('group does not exist')
                return None
        if not isinstance(group, h5py.Group):
            raise TypeError('group needs to refer to a valid HDF5 group!')
        return group

    def get_emdgroup(self, group):
        """Get the emd data saved in the requested group.

        Note
        ----
        The memmap keyword has been removed. Please use get_memmap().

        Parameters
        ----------
        group : h5py._hl.group.Group or int
            Reference to the HDF5 group to load. If int is used then the item
            corresponding to self.list_emds is loaded.

        Returns
        -------
        : tuple or None
            None (index out of range or content not readable) or tuple
            containing:

            : np.ndarray
                The data of the emdtype group.
            : tuple
                Tuple of (dimension vector, label, units) per dimension.

        Raises
        ------
        TypeError
            If group is neither an int nor an h5py.Group, or if it is not
            marked with ``emd_group_type`` equal to 1.
        """
        group = self._resolve_group(group)
        if group is None:
            return None

        if 'emd_group_type' not in group.attrs:
            raise TypeError('group is not a emd_group_type group!')
        if not group.attrs['emd_group_type'] == 1:
            raise TypeError('group is not a emd_group_type group!')

        # retrieve data and dims
        try:
            data = group['data'][:]
            dims = self.get_emddims(group)
        except Exception:
            # if something goes wrong, return None
            print('Content of "{}" does not seem to be in emd specified shape'.format(group.name))
            return None
        return data, dims

    def get_memmap(self, group):
        """Open the file a second time and return the requested EMD data set
        without loading it into memory. See also get_emdgroup().

        The second file handle is not stored on this object, so the returned
        data set stays usable independently of this object's own handle.

        Parameters
        ----------
        group : h5py._hl.group.Group or int
            Reference to the HDF5 group to load. If int is used then the item
            corresponding to self.list_emds is loaded.

        Returns
        -------
        : tuple or None
            None if an integer index is out of range, otherwise the
            ``data`` data set of the group (from the second file handle) and
            the dims as returned by get_emddims().

        Raises
        ------
        TypeError
            If group is neither an int nor an h5py.Group.
        """
        # validate the input and read the dims *before* opening a second
        # handle, so no handle is leaked on the failure paths
        group = self._resolve_group(group)
        if group is None:
            return None
        dims = self.get_emddims(group)

        f = h5py.File(self.file_hdl.filename, 'r')
        try:
            data = f[group.name + '/data']
        except Exception:
            f.close()
            raise
        return data, dims

    def write_dim(self, label, dim, parent):
        """Auxiliary function to write a dim dataset to parent.

        Input is not checked for sanity, so handle exceptions in call.

        Parameters
        ----------
        label : str
            Label for dataset, usually dim1, dim2, dimN.
        dim : tuple
            Tuple containing (data, name, units).
        parent : h5py.Group
            HDF5 handle to parent group.

        Returns
        -------
        : h5py.Dataset
            HDF5 dataset handle referencing this dim.

        Raises
        ------
        RuntimeError
            If the dataset or its attributes could not be written.
        """
        try:
            dset = parent.create_dataset(label, data=dim[0])
            # np.bytes_ replaces np.string_, which was removed in NumPy 2.0
            dset.attrs['name'] = np.bytes_(dim[1])
            dset.attrs['units'] = np.bytes_(dim[2])
        except Exception as err:
            raise RuntimeError('Error during writing dim dataset') from err
        return dset

    def put_emdgroup(self, label, data, dims, parent=None, overwrite=False, **kwargs):
        """Put an emdtype dataset into the EMD file.

        Parameters
        ----------
        label : str
            Label for the emdtype group containing the dataset.
        data : np.ndarray
            Numpy array containing the data.
        dims : tuple
            Tuple containing the necessary dims as
            ((vec, name, units), (vec, name, units), ...)
        parent : h5py.Group or None
            Parent for the emdtype group, if None it will be written to /data.
        overwrite : bool
            Set to force overwriting entry in EMD file.
        **kwargs : various
            Keyword arguments to be passed to h5py.create_dataset(), e.g. for
            compression.

        Returns
        -------
        : h5py.Group or None
            Group referencing this emdtype dataset or None if failed (this
            includes an existing entry when overwrite is False).

        Raises
        ------
        TypeError
            If label is not a str, data is not a numpy.ndarray, or dims does
            not match the shape of data.
        """
        # check input
        if not isinstance(label, str):
            raise TypeError('label needs to be string!')
        if not isinstance(data, np.ndarray):
            raise TypeError('data needs to be a numpy.ndarray!')

        # explicit checks instead of assert, which is stripped under -O
        try:
            dims_ok = len(dims) == data.ndim and all(
                len(dim) == 3 and dim[0].shape[0] == size
                for dim, size in zip(dims, data.shape)
            )
        except (TypeError, AttributeError, IndexError, KeyError):
            dims_ok = False
        if not dims_ok:
            raise TypeError('Something wrong with the provided dims')

        target = self.data if parent is None else parent

        # write stuff to HDF5
        try:
            if label in target:
                if not overwrite:
                    print('"{}" already exists in "{}"'.format(label, target.name))
                    raise RuntimeError('"{}" already exists in "{}"'.format(label, target.name))
                print('overwriting "{}" in "{}"'.format(label, target.name))
                del target[label]

            # create group and mark it as an emd group
            grp = target.create_group(label)
            grp.attrs['emd_group_type'] = 1

            # create dataset
            grp.create_dataset('data', data=data, **kwargs)

            # create dim datasets
            for idx, dim in enumerate(dims, start=1):
                self.write_dim('dim{}'.format(idx), dim, grp)

            # update emds list
            self.list_emds = self.find_emdgroups(self.file_hdl)
            return grp
        except Exception:
            # any failure while writing is reported as None to the caller
            print('Something went wrong trying to write the dataset.')
            return None
def defaultDims(data, pixel_size=None):
    """Generate a properly set up list of dim vectors with default values.

    This allows quick writing of EMD files without the need to create the
    dim vectors manually.

    Parameters
    ----------
    data : ndarray
        The data that will be written to the EMD file. This is used to get
        the number of dims and their shape.
    pixel_size : tuple, optional
        A tuple of pixel sizes. Must have the same length as the number of
        dimensions of data. If omitted (or empty), a pixel size of 1 is used
        for every dimension.

    Returns
    -------
    dims : list
        One ``[vector, name, units]`` entry per dimension, usable as the
        ``dims`` input of fileEMD.put_emdgroup(). Names are ``dimN`` and
        units are ``unitN`` with N starting at 1.

    Raises
    ------
    TypeError
        If pixel_size is a numpy.ndarray.
    ValueError
        If the length of pixel_size does not match data.ndim.
    """
    # ndarrays are rejected because their truth value is ambiguous in the
    # emptiness check below
    if isinstance(pixel_size, np.ndarray):
        raise TypeError('pixel_size must be a tuple')

    num = data.ndim
    if not pixel_size:
        pixel_size = (1,) * num
    if len(pixel_size) != num:
        raise ValueError('pixel_size and data dimensions must match')

    # each vector holds the pixel index (0 .. n-1) scaled by the pixel size
    return [
        [np.linspace(0, size - 1, size) * step,
         'dim{}'.format(ii + 1), 'unit{}'.format(ii + 1)]
        for ii, (size, step) in enumerate(zip(data.shape, pixel_size))
    ]
def emdReader(filename, dsetNum=0):
    """A simple helper function to read in the data and metadata
    in a structured format similar to the other ncempy readers.

    Note
    ----
    Not fully implemented yet. Work in progress.

    Parameters
    ----------
    filename : str or pathlib.Path
        The path to the file as a string.
    dsetNum : int
        The index of the data set to load.

    Returns
    -------
    : dict
        Data and metadata as a dictionary similar to other ncempy readers.
        Keys are ``data``, ``filename``, ``pixelSize``, ``name``,
        ``pixelUnit`` and ``pixelName``.

    Raises
    ------
    TypeError
        If the requested data set could not be loaded.

    Example
    -------
    Simply load all data and metadata from a data set in a Berkeley EMD file

    >>> import ncempy.io as nio
    >>> emd0 = nio.emd.emdReader('filename.emd', dsetNum=0)
    """
    with fileEMD(filename, readonly=True) as emd0:
        # The data is read fully into memory because the file is closed when
        # the with block ends.
        loaded = emd0.get_emdgroup(dsetNum)
        if loaded is None:
            # get_emdgroup signals failure with None; unpacking that used to
            # fail with an opaque TypeError, so the type is kept and only the
            # message is made meaningful.
            raise TypeError('Unable to load data set {} from "{}"'.format(dsetNum, filename))
        data, dims = loaded

        out = {'data': data, 'filename': filename, 'pixelSize': []}

        # Get the group name
        out['name'] = emd0.list_emds[dsetNum].name.split('/')[-1]

        for dim in dims:
            # pixel size is the spacing of the first two dim vector entries;
            # vectors that are too short or non-numeric fall back to 0
            try:
                step = dim[0][1] - dim[0][0]
            except (IndexError, TypeError):
                step = 0
            out['pixelSize'].append(step)

        out['pixelUnit'] = [aa[2] for aa in dims]
        out['pixelName'] = [aa[1] for aa in dims]
    return out
def emdWriter(filename, data, pixel_size=None, overwrite=False):
    """Simple method to write data to a file formatted as an EMD v0.2.

    The only possible metadata to write is the pixel size for each dimension.
    Use the emd.fileEMD() class for more complex operations. The file must
    not already exist unless overwrite is set. The data is written to a
    group named ``converted``.

    Parameters
    ----------
    filename : str or pathlib.Path
        The path and file name to write the data to.
    data : ndarray
        The data as an ndarray to write to the file.
    pixel_size : tuple
        A tuple with the same length as the number of dims in data.
        len(pixel_size) == data.ndim
    overwrite : boolean
        If file exists, overwrite it.

    Raises
    ------
    ValueError
        If pixel_size is given and its length does not match data.ndim.
    FileExistsError
        If the file exists and overwrite is False.
    """
    filename = Path(filename)

    # Validate before touching the file system. An explicit check is used
    # because an assert raises AssertionError (never the intended ValueError)
    # and is stripped entirely under -O.
    if pixel_size and len(pixel_size) != data.ndim:
        raise ValueError('pixel_size length must match the number of dimensions of data.')

    # Delete the file if it exists and overwrite is true
    if filename.exists():
        if not overwrite:
            raise FileExistsError('File already exists: "{}"'.format(filename))
        filename.unlink()

    with fileEMD(filename, readonly=False) as emd0:
        # Setup the dims for the EMD file. Pixel size is set to 1 by default
        # for each dimension
        dims0 = defaultDims(data, pixel_size=pixel_size)
        # Write the data to the emd file.
        emd0.put_emdgroup('converted', data, dims0)