Source code for ncempy.io.mrc

"""
A module to read MRC files with Python and numpy.
Written according to the MRC specification at http://bio3d.colorado.edu/imod/betaDoc/mrc_format.txt
Also works with FEI MRC files which include a special header block with experimental information.

Note
----
General users:
    Use the simplified mrc.mrcReader() function to load the data and
    metadata as a Python dictionary.

Advanced users and developers:
    Access the file internals through the mrc.fileMRC() class.

"""

from pathlib import Path

import numpy as np


class fileMRC:
    """Read in the data in MRC format and other useful information like metadata. Follows the specification
    published at http://bio3d.colorado.edu/imod/betaDoc/mrc_format.txt

    Attributes
    ----------
    file_name : str
        The name of the file.
    file_path : pathlib.Path
        A pathlib.Path object for the open file.
    fid : file
        The file handle to the opened MRC file.
    mrcType : int
        The internal MRC data type.
    dataType : np.dtype
        The numpy dtype corresponding to the mrcType.
    dataSize : np.ndarray
        The number of pixels along each dimension. Corresponds to the shape attribute of a np.ndarray.
    gridSize : np.ndarray
        The size of the grid. Usually the same as dataSize.
    volumeSize : np.ndarray
        The size of the volume along each direction in Angstroms.
    voxelSize : np.ndarray
        The size of the voxel along each direction in Angstroms.
    cellAngles : np.ndarray
        The angles of the cell. Ignored in most cases including ncempy.
    axisOrientations : np.ndarray
        Maps the orientations of the data to the real space directions X, Y, Z. Ignored by ncempy.
    minMaxMean : np.ndarray
        The minimum, maximum and mean value of the data to avoid computing this every time.
    extra : np.ndarray
        Extra binary metadata if it exists.
    FEIinfo : dict
        A dictionary of important metadata used by FEI (Thermo Fisher) microscopes. This metadata
        overwrites the voxelSize attribute if it exists.
    dataOffset : int
        The integer offset in bytes to the start of the raw data.
    dataOut : dict
        Will hold the data and metadata to output to the user after a getDataset() call.
    v : bool
        More output for debugging. False by default.

    Examples
    --------
    Read in all data and metadata into memory.

    >> import ncempy.io as nio
    >> mrc0 = nio.mrc.mrcReader('file.mrc')

    Low level operations to get 1 slice of the 3D data.

    >> import ncempy.io as nio
    >> with nio.mrc.fileMRC('file.mrc') as f1:
    >>     single_slice = f1.getSlice(0)

    """

    def __init__(self, filename, verbose=False):
        """
        Parameters
        ----------
        filename : str or pathlib.Path or file object
            String, pathlib.Path or file object pointing to the filesystem location of the file.

        verbose : bool
            If True, debug information is printed.

        """
        # check filename type
        if hasattr(filename, 'read'):
            self.fid = filename
            try:
                self.file_path = Path(self.fid.name)
                self.file_name = self.file_path.name
            except AttributeError:
                self.file_name = None
                self.file_path = None
        else:
            if isinstance(filename, str):
                filename = Path(filename)
            elif isinstance(filename, Path):
                pass
            else:
                raise TypeError('Filename is supposed to be a string or pathlib.Path or readable file object')

            self.file_path = filename
            self.file_name = filename.name

            # Open the file and quit if the file does not exist
            try:
                self.fid = open(self.file_path, 'rb')
            except IOError as e:
                print("I/O error({0}): {1}".format(e.errno, e.strerror))
                raise  # re-raise so a missing file does not cause a confusing error later

        # necessary declarations, if something fails
        self.mrcType = None
        self.dataType = None
        self.dataSize = None
        self.gridSize = None
        self.volumeSize = None
        self.voxelSize = None
        self.cellAngles = None
        self.axisOrientations = None
        self.minMaxMean = None
        self.extra = None
        self.FEIinfo = None
        self.dataOffset = None

        self.dataOut = {}  # will hold the data and metadata to output to the user after a getDataset() call

        # Add a top level variable to indicate verbose output for debugging
        self.v = verbose

        self.parseHeader()

    def __del__(self):
        """Close the file.
""" if not self.fid.closed: if self.v: print('Closing input file: {}'.format(str(self.file_path))) self.fid.close() return None def __enter__(self): """Implement python's with statement """ return self def __exit__(self, exception_type, exception_value, traceback): """Implement python's with statement. Close the file using __del__ """ self.__del__() return None

    def parseHeader(self):
        """Read the header information which includes data type, data size, data shape, and metadata.

        Note
        ----
        This header uses Fortran-style ordering. Numpy uses C-style ordering. The header is read in and then
        some attributes are reversed [::-1] at the end for output to the user to match C-ordering in numpy.

        TODO
        ----
        Implement a special dtype to read the entire header at once. Reading everything at once is
        ~5x faster than multiple np.fromfile() reads. Untested, but works in theory:
        headerDtype = np.dtype([('head1','10int32'),('head2','6float32'),('axisOrientations','3int32'),
        ('minMaxMean','3int32'),('extra','32int32')])
        head = np.fromfile(self.fid,dtype=headerDtype,count=1)

        """
        # Always start at the beginning of the file.
        self.fid.seek(0)

        header_dtype = np.dtype(
            [('head1', '10int32'), ('head2', '6float32'), ('axisOrientations', '3int32'),
             ('minMaxMean', '3int32'), ('extra', '32int32')])
        header = np.fromfile(self.fid, dtype=header_dtype, count=1)

        # Read in the initial header values
        head1 = header['head1'][0]
        if self.v:
            print('header1 = {}'.format(head1))

        # Set the number of pixels for each dimension
        self.dataSize = head1[0:3]
        if self.v:
            print('dataSize (fortran ordering) = {}'.format(self.dataSize))

        # Set the data type and convert to numpy type
        self.mrcType = head1[3]
        self.dataType = self._getMRCType(self.mrcType)
        if self.v:
            print('dataType = {}'.format(self.dataType))

        # Get the grid size
        self.gridSize = head1[7:10]
        if self.v:
            print('mrc defined gridSize = {}'.format(self.gridSize))

        # Get the physical volume size (always in Angstroms) (starting at byte #11 in the file)
        head2 = header['head2'][0]
        self.volumeSize = head2[0:3]
        if self.v:
            print('mrc defined volumeSize = {}'.format(self.volumeSize))

        # Calculate the voxel size based on the volume and grid sizes.
        # Account for zero values which are often stored in the volume and grid size values.
        self.voxelSize = np.ones(3)
        for ii in range(0, 3):
            if self.volumeSize[ii] == 0:
                v = 1
            else:
                v = self.volumeSize[ii]
            if self.gridSize[ii] == 0:
                gs = 1
            else:
                gs = self.gridSize[ii]
            self.voxelSize[ii] = v / gs

        """if self.volumeSize.any() == 0 or self.gridSize.any() == 0:
            if self.v:
                print('Detected 0 volume or grid size. Setting voxel size to 1 (Ang).')
            self.voxelSize = np.ones(3)  # use 1 as a voxel size if its not set in the file
        else:
            self.voxelSize = (self.volumeSize / np.float32(self.gridSize))
            if self.v:
                print('voxelSize (Ang) = {}'.format(self.voxelSize))
        """

        # Pixel (cell) angles
        self.cellAngles = head2[3:6]
        if self.v:
            print('cellAngles = {}'.format(self.cellAngles))

        # Axis orientations. Tells which axes are X, Y, Z
        self.axisOrientations = header['axisOrientations'][0]
        if self.v:
            print('axisOrientations = {}'.format(self.axisOrientations))

        # Min, max, mean
        self.minMaxMean = header['minMaxMean'][0]

        # Extra information (for FEI MRC files, extra[1] is the size of the FEI information encoded with the file
        # in terms of 4 byte floats)
        self.extra = header['extra'][0]

        # Numpy uses C-style ordering. The header is written in Fortran-style ordering.
        # Flip the order of everything useful
        if self.v:
            print('Note: The MRC header is written in Fortran-style ordering, but Numpy uses C-style ordering. '
                  'This program will now reverse the order (using [::-1]) of useful metadata: (dataSize, gridSize, '
                  'volumeSize, voxelSize, cellAngles, axisOrientations)')
        self.dataSize = self.dataSize[::-1]
        self.gridSize = self.gridSize[::-1]
        self.volumeSize = self.volumeSize[::-1]
        self.voxelSize = self.voxelSize[::-1]
        self.cellAngles = self.cellAngles[::-1]
        self.axisOrientations = self.axisOrientations[::-1]
        # self.Shape = self.Shape[::-1]

        # Move to the end of the normal header
        self.fid.seek(1024)

        # Read in the extended header if it exists (for FEI MRC files)
        if self.extra[1] != 0:
            pos1 = self.fid.tell()
            if self.v:
                print('Extra header found. Most likely an FEI-style MRC file.')
                print('Position before reading extra header: ' + str(pos1))
                print('Extra header size = ' + str(self.extra[1]))

            ''' Read the extra FEI header described as follows:
            1  a_tilt         first Alpha tilt (deg)
            2  b_tilt         first Beta tilt (deg)
            3  x_stage        Stage x position (Unit=m, but if value>1 the unit is micrometer)
            4  y_stage        Stage y position (Unit=m, but if value>1 the unit is micrometer)
            5  z_stage        Stage z position (Unit=m, but if value>1 the unit is micrometer)
            6  x_shift        Image shift x (Unit=m, but if value>1 the unit is micrometer)
            7  y_shift        Image shift y (Unit=m, but if value>1 the unit is micrometer)
            8  defocus        starting Defocus (Unit=m, but if value>1 the unit is micrometer)
            9  exp_time       Exposure time (s)
            10 mean_int       Mean value of image
            11 tilt_axis      Tilt axis (deg)
            12 pixel_size     Pixel size of image (m)
            13 magnification  Magnification used
            14 voltage        Accelerating voltage
            15 ??             Unknown
            '''
            FEIinfoValues = np.fromfile(self.fid, dtype=np.float32, count=15)
            self.FEIinfo = {'a_tilt': FEIinfoValues[0], 'b_tilt': FEIinfoValues[1],
                            'x_stage': FEIinfoValues[2], 'y_stage': FEIinfoValues[3],
                            'z_stage': FEIinfoValues[4], 'x_shift': FEIinfoValues[5],
                            'y_shift': FEIinfoValues[6], 'defocus': FEIinfoValues[7],
                            'exposure_time': FEIinfoValues[8], 'mean': FEIinfoValues[9],
                            'tilt_axis': FEIinfoValues[10], 'pixel_size': FEIinfoValues[11],
                            'magnification': FEIinfoValues[12], 'voltage': FEIinfoValues[13],
                            'unknown': FEIinfoValues[14]}

            self.voxelSize[0] = 1.  # set this to 1 but it should be the tilt angles. These can be non-uniform though.
            if self.FEIinfo['pixel_size'] != 0:
                self.voxelSize[1] = self.FEIinfo['pixel_size'] * 1e10  # convert meters to Angstroms, standard for MRC
                self.voxelSize[2] = self.FEIinfo['pixel_size'] * 1e10

            if self.v:
                print('Extended header data')
                for aa, bb in self.FEIinfo.items():
                    print('{} = {}'.format(aa, bb))
        else:
            self.FEIinfo = {}

        self.dataOffset = 1024 + self.extra[1]  # offset of the data from the start of the file

        # Add relevant information (metadata) to the output dictionary
        self.dataOut = {'pixelSize': self.voxelSize, 'voxelSize': self.voxelSize, 'cellAngles': self.cellAngles,
                        'axisOrientations': self.axisOrientations, 'filename': self.file_name}
        if self.extra[1] != 0:
            self.dataOut['FEIinfo'] = self.FEIinfo

        return 1

    def getDataset(self):
        """Read in the full data block and reshape to an ndarray with C-style ordering.

        """
        self.fid.seek(self.dataOffset, 0)  # move to the start of the data from the start of the file
        try:
            num0 = int(np.prod(self.dataSize, dtype=np.uint64))
            data1 = np.fromfile(self.fid, dtype=self.dataType, count=num0)
            self.dataOut['data'] = data1.reshape(self.dataSize)
        except MemoryError:
            print("Not enough memory to read in the full data set. Use getMemmap")

        return self.dataOut

    def getSlice(self, num):
        """Read in a slice of an MRC file. Useful for parsing through a large file without reading the entire
        data set into memory.

        Parameters
        ----------
        num : int
            Get the requested image.

        Returns
        -------
        out : ndarray
            A 2D slice of the data along the first index.

        Raises
        ------
        IndexError
            If num > the number of slices.

        """
        # Check that num is within the data array size bounds
        if num > (self.dataSize[0] - 1):
            raise IndexError('Index {} is out of bounds for array with size {}'.format(num, self.dataSize[0]))

        self.fid.seek(self.dataOffset, 0)  # move to the start of the data by skipping the header

        imSize = self.dataSize[1] * self.dataSize[2]  # size of each image in pixels
        byteSize = int(np.dtype(self.dataType).itemsize * imSize)  # make sure this is a python int

        self.fid.seek(num * byteSize, 1)  # skip to the requested slice from the start of the data

        data1 = np.fromfile(self.fid, dtype=self.dataType, count=imSize)  # read in the requested image
        data1 = data1.reshape((self.dataSize[1], self.dataSize[2]))  # reshape the image

        return data1

    def getMemmap(self):
        """Return a numpy memmap object (read-only) for the dataset. This is very useful for very large datasets
        to avoid loading the entire data set into memory. No metadata is returned.

        Returns
        -------
        : numpy.core.memmap
            A read-only numpy memmap object with access to the data on disk.

        """
        mm = np.memmap(self.fid, dtype=self.dataType, mode='r', offset=self.dataOffset,
                       shape=tuple(self.dataSize))
        return mm

    def _applyAxisOrientations(self, arrayIn):
        """This is untested and unused.

        """
        return [arrayIn[x - 1] for x in self.axisOrientations]

    def _getMRCType(self, dataType):
        """Return the correct data type according to the official MRC type list:
            0 image : signed 8-bit bytes, range -128 to 127
            1 image : signed 16-bit integers
            2 image : 32-bit reals
            3 transform : complex 16-bit integers
            4 transform : complex 32-bit reals
            6 image : unsigned 16-bit integers, range 0 to 65535

        Parameters
        ----------
        dataType : int
            The data type value encoded in an MRC header.

        Returns
        -------
        out : numpy dtype
            The corresponding numpy data type.

        """
        if dataType == 0:
            Type = np.int8
        elif dataType == 1:
            Type = np.int16
        elif dataType == 2:
            Type = np.float32
        elif dataType == 6:
            Type = np.uint16
        else:
            # complex data types are currently unsupported
            print("Unsupported data type " + str(dataType))
            Type = None
        return Type
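

# A minimal usage sketch, not part of the original module: step through a large MRC stack one image at a
# time with getSlice(), or map the whole volume with getMemmap(), without loading everything into memory.
# The file name 'large_stack.mrc' is a hypothetical placeholder.
def _example_slice_and_memmap():
    with fileMRC('large_stack.mrc') as f1:
        num_slices = f1.dataSize[0]  # after header parsing, the first index is the slice (image) axis
        first_image = f1.getSlice(0)  # 2D ndarray of shape (dataSize[1], dataSize[2])
        mm = f1.getMemmap()  # read-only memmap of the full volume
        mean_of_last = mm[num_slices - 1, :, :].mean()  # only this slice is actually read from disk
    return first_image, mean_of_last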


def mrcReader(file_name):
    """A simple function to open an MRC file, parse the header, and read the full data set into memory.

    Parameters
    ----------
    file_name : str or pathlib.Path
        The path to the file to load.

    Returns
    -------
    out : dict
        A dictionary containing the data and interesting metadata. The data is attached to the 'data' key.

    Example
    -------
    Read in all data from disk into memory. This assumes the dataset is 3 dimensional:

    >> from ncempy.io.mrc import mrcReader
    >> import matplotlib.pyplot as plt
    >> mrc1 = mrcReader('filename.mrc')
    >> plt.imshow(mrc1['data'][0, :, :])  # show the first image in the data set

    """
    if isinstance(file_name, str):
        file_name = Path(file_name)

    with fileMRC(file_name) as f1:  # open the file and init the class
        im1 = f1.getDataset()  # read in the dataset

    # Add extra metadata not already in im1
    im1['filename'] = file_name.name
    im1['pixelUnit'] = 'A'

    return im1  # return the data and metadata as a dictionary


def mrc2raw(file_name):
    """Convert an MRC type file to raw binary. The entire data is read into memory and then written to a new
    raw file in the same location with the data type and shape (C-ordering) written into the filename.
    No other metadata is retained.

    Parameters
    ----------
    file_name : str or pathlib.Path
        The file name to convert.

    """
    if isinstance(file_name, str):
        file_name = Path(file_name)

    tomo = mrcReader(file_name)

    # Add metadata to file name
    out_name = file_name.stem + '_{}_{}.raw'.format(tomo['data'].dtype, tomo['data'].shape)
    outPath = file_name.with_name(out_name.replace(' ', ''))
    print(outPath)
    with open(outPath, 'wb') as f0:
        f0.write(tomo['data'])  # write out as C ordered data
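

# A short sketch, not part of the original module, of reading back a file produced by mrc2raw(). The dtype
# and shape shown in the file name (here 'tomo_float32_(60,512,512).raw') are hypothetical placeholders and
# must match the values printed by mrc2raw() for the real file.
def _example_read_raw():
    raw_data = np.fromfile('tomo_float32_(60,512,512).raw', dtype=np.float32)
    return raw_data.reshape((60, 512, 512))  # C-ordered, same shape as the original MRC data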


def mrc2emd(file_name):
    """Write an MRC file as an HDF5 file in EMD format with the same file name and a .emd ending. Header
    information is retained as attributes.

    Parameters
    ----------
    file_name : str
        The name of the file to convert from MRC to EMD format.

    Returns
    -------
    out : int
        1 if successful.

    TODO
    ----
    Update this to use the ncempy.io.emd class.

    """
    import h5py

    # Read in the MRC data and reshape to C-style ordering
    tomo = mrcReader(file_name)

    # Create the HDF5 file. Note: mode 'w' overwrites an existing file; use 'w-' to error if the file exists.
    with h5py.File(file_name.rsplit('.mrc', 1)[0] + '.emd', 'w') as f1:

        # Create the axis vectors. The standard MRC pixel size is in Angstroms.
        xFull = np.linspace(0, tomo['voxelSize'][0] * tomo['data'].shape[0] - 1, tomo['data'].shape[0])
        yFull = np.linspace(0, tomo['voxelSize'][1] * tomo['data'].shape[1] - 1, tomo['data'].shape[1])
        zFull = np.linspace(0, tomo['voxelSize'][2] * tomo['data'].shape[2] - 1, tomo['data'].shape[2])

        # Root data group
        dataTop = f1.create_group('data')

        # Create the tilt series group
        tiltseriesGroup = dataTop.create_group('data')
        tiltseriesGroup.attrs['emd_group_type'] = np.int8(1)

        # Save the data to the EMD file as a C-style array
        try:
            _ = tiltseriesGroup.create_dataset('data', data=tomo['data'], compression='gzip', shuffle=True)
        except MemoryError:
            raise MemoryError("Not enough memory to write out data to EMD file")

        dim1 = tiltseriesGroup.create_dataset('dim1', data=xFull)
        dim1.attrs['name'] = np.string_('x')
        dim1.attrs['units'] = np.string_('')
        dim2 = tiltseriesGroup.create_dataset('dim2', data=yFull)
        dim2.attrs['name'] = np.string_('y')
        dim2.attrs['units'] = np.string_('')
        dim3 = tiltseriesGroup.create_dataset('dim3', data=zFull)
        dim3.attrs['name'] = np.string_('z')
        dim3.attrs['units'] = np.string_('')

        # Create the other groups
        scopeGroup = f1.create_group('Microscope')
        scopeGroup.attrs['voxel sizes'] = tomo['voxelSize']
        userGroup = f1.create_group('User')
        commentGroup = f1.create_group('Comments')

    return 1  # the docstring promises 1 on success
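

# A minimal sketch, not part of the original module, of inspecting the EMD file written by mrc2emd(). It
# relies only on the group layout created above ('/data/data' holding 'data' plus the dim1/dim2/dim3 axis
# vectors); the file name 'tomo.emd' is a hypothetical placeholder.
def _example_inspect_emd():
    import h5py
    with h5py.File('tomo.emd', 'r') as f1:
        dset = f1['/data/data/data']
        print('data shape = {}, dtype = {}'.format(dset.shape, dset.dtype))
        print('dim1 name = {}'.format(f1['/data/data/dim1'].attrs['name']))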


def mrcWriter(filename, data, pixelSize, forceWrite=False):
    """Write out an MRC type file according to the specification at
    http://bio3d.colorado.edu/imod/doc/mrc_format.txt

    Parameters
    ----------
    filename : str or pathlib.Path
        The name or Path of the file to write out to.

    data : ndarray
        The array data to write to disk.

    pixelSize : tuple
        The size of the pixel along each direction (in Angstroms) as a 3 element vector (sizeZ, sizeY, sizeX).

    forceWrite : bool
        This will write the data as a C-contiguous array. It is not suggested to use this option and it will
        be removed in future versions.

    """
    with open(filename, 'wb') as fid:
        if len(data.shape) > 3:
            print("Too many dimensions")
            return

        if not data.flags['C_CONTIGUOUS']:
            print("Error: Array must be C-style ordering: [numImages, Y, X]. "
                  "Use numpy.transpose and np.ascontiguousarray to change the data ordering in memory")
            print('Exiting')
            return

        # initialize the header with 256 zeros with size 4 bytes
        header = np.zeros(256, dtype=np.int32)
        fid.write(header)
        fid.seek(0, 0)  # return to the beginning of the file

        # Initialize the int32 part of the header
        header1 = np.zeros(10, dtype=np.int32)

        # Write the number of columns, rows and sections (images)
        # header1[0:3] = np.int32(dims)  # data size in pixels
        header1[0] = np.int32(data.shape[2])  # num columns, the last index in C-style ordering
        header1[1] = np.int32(data.shape[1])  # num rows
        header1[2] = np.int32(data.shape[0])  # num sections (images)

        if data.dtype == np.float32:
            header1[3] = np.int32(2)
        elif data.dtype == np.uint16:
            header1[3] = np.int32(6)
        elif data.dtype == np.int16:
            header1[3] = np.int32(1)
        elif data.dtype == np.int8:
            header1[3] = np.int32(0)
        else:
            print("Data type {} is unsupported. "
                  "Only int8, int16, uint16, and float32 are supported".format(data.dtype))
            return

        # Starting point of sub image (not used in IMOD)
        header1[4:7] = np.zeros(3, dtype=np.int32)

        # Grid size in X, Y, Z
        # header1[7:10] = np.int32(dims)  # data size in pixels
        header1[7] = np.int32(data.shape[2])  # mx
        header1[8] = np.int32(data.shape[1])  # my
        header1[9] = np.int32(data.shape[0])  # mz

        # Write out the first part of the header information
        fid.write(header1)

        # Cell dimensions (in Angstroms)
        # pixel spacing = xlen/mx, ylen/my, zlen/mz
        fid.write(np.float32(pixelSize[2] * data.shape[2]))  # xlen
        fid.write(np.float32(pixelSize[1] * data.shape[1]))  # ylen
        fid.write(np.float32(pixelSize[0] * data.shape[0]))  # zlen

        # Cell angles (in degrees)
        fid.write(np.float32([90.0, 90.0, 90.0]))

        # Description of array directions with respect to: Columns, Rows, Images
        fid.write(np.int32([1, 2, 3]))

        # Minimum, maximum and mean density
        fid.write(np.float32(np.min(data)))
        fid.write(np.float32(np.max(data)))
        fid.write(np.float32(np.mean(data)))

        # Needed to indicate that the data is little endian for the NEW-STYLE MRC image2000 HEADER
        # (IMOD 2.6.20 and above)
        fid.seek(212, 0)
        fid.write(np.int8([68, 65, 0, 0]))  # use [17, 17, 0, 0] for big endian

        # Write out the data. The with statement closes the file when the block ends.
        fid.seek(1024)
        if forceWrite:
            fid.write(np.ascontiguousarray(data))  # change to a C ordered array for writing to disk
        else:
            fid.write(data)  # must be C-contiguous
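

# A minimal usage sketch, not part of the original module, for mrcWriter(): write a small float32 image
# stack with a 1 Angstrom pixel size along each axis. The output file name is a hypothetical placeholder.
def _example_write_mrc():
    stack = np.random.rand(10, 256, 256).astype(np.float32)  # (numImages, Y, X), C-contiguous
    mrcWriter('random_stack.mrc', stack, (1.0, 1.0, 1.0))  # pixelSize is (sizeZ, sizeY, sizeX) in Angstroms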


def writeHeader(filename, shape, dtype, pixelSize):
    """Write out an MRC type file header according to the specification at
    http://bio3d.colorado.edu/imod/doc/mrc_format.txt. This is useful for initializing an MRC file and then
    writing to it manually, or see the appendData() function below.

    Parameters
    ----------
    filename : str
        The name of the MRC file.

    shape : tuple
        The shape of the data to write.

    dtype : numpy.dtype
        The dtype to write out the data as. Only some numpy dtypes are supported by this format. It is
        suggested to use np.float32 in most cases for maximum compatibility.

    pixelSize : tuple
        The size of the pixel along each direction (in Angstroms) as a 3 element vector (sizeZ, sizeY, sizeX).
        sizeZ could be the angular step for a tilt series.

    """
    with open(filename, 'wb') as fid:
        if len(shape) > 3:
            print("Too many dimensions")
            return 0

        # Initialize the header with 256 zeros with size 4 bytes
        header = np.zeros(256, dtype=np.int32)
        fid.write(header)
        fid.seek(0, 0)  # return to the beginning of the file

        # Initialize the int32 part of the header with zeros
        header1 = np.zeros(10, dtype=np.int32)

        # Write the number of columns, rows and sections (images)
        header1[0] = np.int32(shape[2])  # num columns, the last index in C-style ordering
        header1[1] = np.int32(shape[1])  # num rows
        header1[2] = np.int32(shape[0])  # num sections (images)

        if dtype == np.float32:
            header1[3] = np.int32(2)
        elif dtype == np.uint16:
            header1[3] = np.int32(6)
        elif dtype == np.int16:
            header1[3] = np.int32(1)
        elif dtype == np.int8:
            header1[3] = np.int32(0)
        else:
            print("Data type " + str(dtype) + " is unsupported. Only int8, int16, uint16, and float32 are supported")
            return 0

        # Starting point of sub image (not used in IMOD)
        header1[4:7] = np.zeros(3, dtype=np.int32)

        # Grid size in X, Y, Z
        header1[7] = np.int32(shape[2])  # mx
        header1[8] = np.int32(shape[1])  # my
        header1[9] = np.int32(shape[0])  # mz

        # Write out the first part of the header information
        fid.write(header1)

        # Cell dimensions (in Angstroms)
        # pixel spacing = xlen/mx, ylen/my, zlen/mz
        fid.write(np.float32(pixelSize[2] * shape[2]))  # xlen
        fid.write(np.float32(pixelSize[1] * shape[1]))  # ylen
        fid.write(np.float32(pixelSize[0] * shape[0]))  # zlen

        # Cell angles (in degrees)
        fid.write(np.float32([90.0, 90.0, 90.0]))

        # Description of array directions with respect to: Columns, Rows, Images
        fid.write(np.int32([1, 2, 3]))

        # Minimum, maximum and mean density. These are unknown before the data is written, so write zeros.
        # fid.write(np.float32(np.min(data)))
        # fid.write(np.float32(np.max(data)))
        # fid.write(np.float32(np.mean(data)))
        fid.write(np.zeros(3, dtype=np.float32))

        # Needed to indicate that the data is little endian for the NEW-STYLE MRC image2000 HEADER
        # (IMOD 2.6.20 and above)
        fid.seek(212, 0)
        fid.write(np.int8([68, 65, 0, 0]))  # use [17, 17, 0, 0] for big endian


def appendData(filename, data):
    """Append a binary set of data to the end of an MRC file. This should only be used in conjunction with
    writeHeader() above.

    Parameters
    ----------
    filename : str
        Name of the MRC file with a pre-initialized header and possibly some data already written.

    data : ndarray
        Data to append to the file. Must be C-contiguous.

    """
    with open(filename, 'ab') as fid:
        # Write out the data
        fid.seek(0, 2)  # seek to the end of the file
        fid.write(data)  # the array must already be C ordered for writing to disk
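

# A sketch, not part of the original module, of the intended writeHeader() + appendData() workflow:
# initialize the header for the full final shape, then append the data one image at a time so the whole
# volume never has to sit in memory at once. The file name and shape are hypothetical placeholders.
def _example_chunked_write():
    full_shape = (100, 512, 512)  # (numImages, Y, X) of the final data set
    writeHeader('chunked.mrc', full_shape, np.float32, (1.0, 1.0, 1.0))
    for ii in range(full_shape[0]):
        one_image = np.zeros(full_shape[1:], dtype=np.float32)  # stand-in for a real image
        appendData('chunked.mrc', one_image)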


def emd2mrc(filename, dsetPath):
    """Convert an EMD data set into an MRC data set. The final data type is float32 for convenience.

    Parameters
    ----------
    filename : str
        The name of the EMD file.

    dsetPath : str
        The HDF5 path to the top group holding the data. ex. '/data/raw/'

    """
    import h5py

    with h5py.File(filename, 'r') as f1:
        # Get the pixel sizes and convert to Ang
        dimsPath = dsetPath + '/dim'
        print('Warning: Assuming EMD dim vectors are in nanometer')
        print('dim vector names/units are:')
        for ii in range(1, 4):
            print('name, units = {}, {}'.format(f1[dimsPath + str(ii)].attrs['name'],
                                                f1[dimsPath + str(ii)].attrs['units']))

        pixelSizeX = (f1[dsetPath + '/dim2'][1] - f1[dsetPath + '/dim2'][0]) * 10  # change nanometers to Ang
        pixelSizeY = (f1[dsetPath + '/dim3'][1] - f1[dsetPath + '/dim3'][0]) * 10  # change nanometers to Ang

        # use the first part of the file name as the prefix, removing the .emd on the end
        filenameOut = filename.split('.emd')[0] + '.mrc'

        print('Warning: Converting to float32 before writing to disk')
        mrcWriter(filenameOut, np.float32(f1[dsetPath + '/data']), (1, pixelSizeY, pixelSizeX))

        print('Finished writing to: {}'.format(filenameOut))
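

# A one-line usage sketch, not part of the original module. The file name and HDF5 group path are
# hypothetical placeholders; dsetPath must point at the group that contains 'data' and the dim vectors.
def _example_emd2mrc():
    emd2mrc('tomo.emd', '/data/tomography')  # writes 'tomo.mrc' in the same location as the input file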