Source code for m2aia.Dataset

from operator import index
from os import times
from time import time
from typing import *
from . import ImageIO
import hashlib
import pathlib
import numpy as np
import random



[docs]
class BaseDataSet():
    def __init__(self, images : List[ImageIO.ImzMLReader], buffer_type : str) -> None:
        self.images = images
        self.elements = []

        self.buffer_type = buffer_type
        if 'disk' in self.buffer_type: # buffer images on disk; check path exists
            self.buffer_path = pathlib.Path(self.buffer_type.split(':')[1])
            if not self.buffer_path.exists():
                raise Exception(f"The path {str(self.buffer_path)} does not exist!")
    
    def __len__(self) -> int:
        pass

    def __getitem__(self, index):
        pass


[docs]
    def getitems(self, indexes: List[int]):
        pass


   



[docs]
class SpectrumDataset(BaseDataSet):
    """Dataset for accession individual spectra and class labels (optional) of multiple images (m2aia.ImzMLReader objects). 
    
    The aim of the SpectrumDataset is to provide convenient access to spectra of single or multiple ImzMLReaders.
    Two access strategy exist:
    1) Spectral approach: a single spectrum is returned.
    2) Spatio-spectral: a central spectrum and corresponding neighbors are returned.

    To use multiple images a spectra depth of equal size for each image is required.

    A label mask can be provided and is used to return labels for each accessed element.
    
    To use the spatio-spectral approach, a shape element is required. The Dataset will then return 
    the spectrum embedded in neighboring spectra, i.e. if the shape tuple is shape=(5,5) the shape of a
    data entry is [B,C,H,W], with batchsize as B = 1, spectrum depth as C = len(spectrum), width 
    as W=5 and height as H=5 of the patch.

    If no shape element was provided, the Dataset will return a single spectrum with shape [B=1,C].

    If multiple elements of the Dataset should be queried at one, the SpectrumDataset.getitems(list_of_indices)
    returns a batch like object containing the elements. i.e. without a shape definition returned elements will have 
    the shape [B=len(list_of_indices), C] and with shape=(5,5) the shape [B=len(list_of_indices),C,H=5,W=5].
    This is used in m2aia.BatchGenerator.

    
    
    Complete processing examples with focus on deep learning can be found on https://github.com/m2aia/pym2aia-examples 
    
    Example usage::

        import m2aia as m2

        I = m2.ImzMLReader("path/to/imzMl/file.imzML")
        I.SetNormalization(m2.m2NormalizationTIC)
        I.SetIntensityTransformation(m2.m2IntensityTransformationSquareRoot)
        I.Execute()      

        dataset = m2.SpectrumDataset([I], shuffle=True)
        for X,Y in dataset():
            print("Spectrum", X.shape, "Class Labels", Y.shape)
            do_something(X,Y)

    """


[docs]
    def find_nearest_indices(self, centroids: np.array, xaxis: np.array):
        return np.array([np.argmin(np.abs(xaxis - mz)) for mz in centroids])



[docs]
    def find_subrange_indices(self, xs, center_index, tolerance, is_ppm):
        # Calculate the lower and upper bounds based on the index and tolerance
        center_value = xs[center_index]
        if is_ppm:
            tol = xs[center_index] * tolerance * 10e-6

        lower_bound = center_value - tol
        upper_bound = center_value + tol

        # Initialize search pointers
        left_index = center_index
        right_index = center_index

        # Move the left pointer to find the lower bound
        while left_index > 0 and xs[left_index - 1] >= lower_bound:
            left_index -= 1

        # Move the right pointer to find the upper bound
        while right_index < len(xs) - 1 and xs[right_index + 1] <= upper_bound:
            right_index += 1

        # print(left_index,center_index, right_index + 1, "=>", abs(left_index - (right_index+1)))
        # Extract the values within the range
        return left_index, right_index + 1



    def __init__(self,images:List[ImageIO.ImzMLReader], 
                 labeled_images:List[np.array] = None,
                 sampling_masks:List[np.array] = None, 
                 spectrum_mask_indices:List[int] = None,
                 tolerance:np.float32=None, 
                 is_tolerance_in_ppm:bool=True, 
                 label_map: Dict = None,
                 shape:Tuple = None, 
                 transform_data = None, 
                 transform_labels = None, 
                 buffer_type:str='memory', 
                 reduce_function=np.mean,
                 shuffle=False,
                 quiet_init=True)-> None:
        """_summary_

        Args:
            images (List[ImageIO.ImzMLReader]): A list of ImageIO.ImzMLReader objects
            labeled_images (List[np.array], optional): A list of labeled masks. If non the ImzMLReader.GetMaskArray is used for each image. Defaults to None.
            exclude_labels (List[np.int32]): A list of labels which are excluded.
            spectrum_mask_indices (np.array, optional): A list of indices along the x axis (indices of m/z values). If None, the whole spectra with all m/z values is loaded. Defaults to None.
            shape (Tuple, optional): The shape can be used to query a neighborhood around a given spectrum/pixel. For example if shape is set to (-1,5,5), a 5x5 neighborhood is sampled around a queried pixel position. If shape is set to (-1,) the shape size is set to either the number of indices given in the spectrum_mask_indices or is set to hew whole spectrum depth. Defaults to (-1,).
            tolerance (int, optional): if spectrum_mask_indices is used, a tolerance can be set to apply a reduce function around the indices. Defaults to 20.
            reduce_function (function, optional): Reduce function if tolerance is set. Defaults to numpy.mean.
            transforms (Function, optional): A transformation can pe applied to a given spectrum using e.g. the transforms of torchvision. Defaults to None.
            buffer_type (str, optional): During querying the images it is possible to buffer queried spectra in memory to provide a fast access to upcoming queries e.g. in the next epoch. Defaults to 'memory'. Disable by setting it to None.
        """
        super().__init__(images, buffer_type)

        # track member variables        
        self.shape = shape
        self.spectrum_mask_indices = spectrum_mask_indices
        self.tolerance = tolerance
        self.is_ppm = is_tolerance_in_ppm
        self.x_hws = 0
        self.y_hws = 0
        self.xs = self.images[0].GetXAxis()
        self.reduce_function=reduce_function
        self.ranges = None
        self.transform_data = transform_data
        self.transform_labels = transform_labels
        self.label_map = label_map
        self.labels = set()
        
        
        

        if self.tolerance and self.spectrum_mask_indices is not None:
            self.ranges = [self.find_subrange_indices(self.xs, index, self.tolerance, self.is_ppm) for index in self.spectrum_mask_indices]
        else:
            self.tolerance = None
            self.ranges = None


        # make sure all images have identically x axis
        for imageID, handle in enumerate(self.images):          
            assert(np.all(self.xs == handle.GetXAxis())) # check for equal x axis size
        
        if self.spectrum_mask_indices is None: # complete spectrum data
            self.spectrum_depth = self.images[0].GetXAxisDepth()
        else: # mask a spectrum using a list of indices
            self.spectrum_depth = self.spectrum_mask_indices.shape[0]

            
        if self.shape and len(self.shape) >= 2:
            if self.shape[-2]%2 == 0 or self.shape[-1]%2 == 0:
                raise Exception(f"We only support odd neighborhood sizes!")          
            
            # half window size 
            self.x_hws = self.shape[-1]//2
            self.y_hws = self.shape[-2]//2

        if "memory" == self.buffer_type:
            self.buffer = []            
            for k in range(len(self.images)):
                buffer_spectrum_label = np.zeros((self.images[k].GetNumberOfSpectra(),), dtype=np.bool)
                buffer_spectrum_data = np.zeros((self.images[k].GetNumberOfSpectra(), self.spectrum_depth), dtype = np.float32)
                self.buffer.append((buffer_spectrum_label, buffer_spectrum_data))
        
        # self.neighborhood_size = neighborhood_size
        self.index_images = []
        self.hit_counter = [0]*len(self.images)


        # for each image
        for imageID, handle in enumerate(self.images):
            imageElements = []

            index_image = -np.ones(handle.GetShape()[::-1], dtype=np.int32)
        
            if labeled_images is None:
                mask_image = handle.GetMaskArray()
            else:
                mask_image = labeled_images[imageID]

            if sampling_masks is None:
                sampling_mask = handle.GetMaskArray()
            else:
                sampling_mask = sampling_masks[imageID]

            for spectrumID in range(handle.GetNumberOfSpectra()):
                (x,y,z) = handle.GetSpectrumPosition(spectrumID)
                
                if sampling_mask[z,y,x] <= 0 or mask_image[z,y,x] < 0:
                    continue
                index_image[z,y,x] = spectrumID
                label = mask_image[z,y,x]
                imageElements.append((imageID, spectrumID, (x,y,z), label))
                self.labels.add(label)
                

            # for each imageID insert elements and an index image
            self.elements.extend(imageElements)
            self.index_images.append(index_image)
    

        if shuffle:
            random.shuffle(self.elements)

    def __len__(self) -> int:
        """
        Returns the number of accessible spectra.
        If multiple images are used, len is the sum of accessible elements of all images.
        
        Returns:
            int: Number of accessible elements of this Dataset.
        """
    
        return len(self.elements)

    def __getitem__(self, index):
        
        return self.getitems([index])
    


[docs]
    def getitems(self, dataset_query_indices: List[int]):
        
        image_query_indices = {imageID:[] for imageID, _ in enumerate(self.images)}
        image_query_labels = {imageID:[] for imageID, _ in enumerate(self.images)}

        for index in dataset_query_indices:
            # check origin
            imageID, spectrumID, (x,y,z), label = self.elements[index]
            image_shape = self.images[imageID].GetShape()
            # save label
            image_query_labels[imageID].append(label)
            if self.shape:
                left = np.clip(x-self.x_hws, 0, image_shape[0])
                right = np.clip(x+self.x_hws+1, 0, image_shape[0])
                bottom = np.clip(y-self.y_hws,0, image_shape[1])
                top = np.clip(y+self.y_hws+1,0, image_shape[1])
                indices = self.index_images[imageID][0,bottom:top,left:right].flatten()
                if any(indices < 0): 
                    choices = np.random.choice(indices[indices>=0], len(indices[indices < 0]))
                    indices[indices < 0] = choices
                
                expected_indices = (2*self.x_hws+1) * (2*self.y_hws+1)
                if len(indices) < expected_indices:
                    missing_indices = expected_indices - len(indices)
                    indices = np.concatenate([indices,np.random.choice(indices, missing_indices)])

                image_query_indices[imageID].append(indices)
            else:
                image_query_indices[imageID].extend([spectrumID])

        BUFFER_QUERY = 0
        BUFFER_DATA = 1
        result_data = None
        result_labels = None

        for imageID, image in enumerate(self.images):
            # get all indices for the image
            indices = np.array(image_query_indices[imageID], dtype=np.int32)
            labels = np.array(image_query_labels[imageID], dtype=np.int32)
            

            if self.buffer_type == "memory": # buffering is used
                image_buffer_data = self.buffer[imageID][BUFFER_DATA]
                image_buffer_query = self.buffer[imageID][BUFFER_QUERY]
                
                # mask indices which require to be loaded from ImzML image (buffer status False indicates "not buffered")
                query_mask = image_buffer_query[indices] == False
                miss_indices = indices[query_mask]
                # hit_indices = image_indices[~query_mask]

                # load from imzML and store in buffer
                if np.any(query_mask):
                    # print(len(miss_indices)/len(indices))

                    # get data for those indices from the image
                    spectra = image.GetSpectra(miss_indices).astype(np.float32)
                    image_buffer_query[miss_indices] = True

                    if self.spectrum_mask_indices is None: # check if a centroids list exists
                        # no one was set so we put the raw spectra into the buffer
                        image_buffer_data[miss_indices] = spectra
                    else: # a spectrum mask exists
                        if self.ranges: # use range queries along the spectra
                            for k, [l,u] in enumerate(self.ranges):
                                image_buffer_data[miss_indices,k] = self.reduce_function(spectra[:,l:u],axis=1)
                                # print(self.reduce_function(spectra[:,l:u],axis=1), l,u, image_buffer_data[miss_indices][:,k])
                        else:
                            image_buffer_data[miss_indices] = spectra[..., self.spectrum_mask_indices]
            
                # load from buffer           
                batch_data = image_buffer_data[indices]
                # print("mean", np.mean(batch_data))
            else: # no buffering is used
                spectra = self.images[imageID].GetSpectra(indices)
                if self.spectrum_mask_indices is not None:
                    if self.ranges: # use range queries along the spectra
                        for k, [l,u] in enumerate(self.ranges):
                            batch_data = self.reduce_function(spectra[:,l:u],axis=1)
                    else:
                        batch_data = spectra[..., self.spectrum_mask_indices]
                else:
                    batch_data = self.images[imageID].GetSpectra(indices)            

            if self.shape: # reshape to [B,C,H,W]
                batch_data = np.reshape(batch_data, (-1,) + self.shape + (self.spectrum_depth,))
                batch_data = np.swapaxes(batch_data, 1,3)
                
            
            # combine queries from different images
            if result_data is None:
                result_data = batch_data
            else:
                result_data = np.concatenate([result_data, batch_data])

            if result_labels is None:
                result_labels = np.array(labels, np.int32)
            else:
                result_labels = np.concatenate([result_labels, np.array(labels, np.int32)])
            
            
        
        # at this stage we receive a list of spectra for n queried central spectra we receive a list of
        # n*(2*s+1)^2 where s is the half size of the neighborhood
        # we can add a transformer that now reshapes those spectra into the correct shape.
        # e.g. from [n*(2*s+1)^2,D] we go to => [n, D, 2*s+1, 2*s+1]
        # with D is the number of channels

        # trafo = transforms.Compose([
        #     transforms.Lambda(lambda x: np.transpose(x)), #=> (D,9)
        #     transforms.Lambda(lambda x: np.reshape(x, (1, x.shape[0], int(np.sqrt(x.shape[1])), int(np.sqrt(x.shape[1]))))), #=> (1,D,3,3)
        #     transforms.Lambda(lambda x: torch.tensor(x, dtype=torch.float32))])
        if self.transform_data is not None:
            result_data = self.transform_data(result_data)
        
        if self.transform_labels is not None:
            result_labels = self.transform_labels(result_labels)
        
        return result_data, result_labels






[docs]
class IonImageDataset(BaseDataSet):
    def __init__(self, images : List[ImageIO.ImzMLReader], 
                       centroids:List[float], 
                       tolerance:float, 
                       tolerance_type:str='ppm', 
                       buffer_type='memory', 
                       transforms=None)-> None:
        super().__init__(images, buffer_type)
        self.elements = centroids
        self.tolerance = tolerance
        self.tolerance_type = tolerance_type
        self.transforms = transforms
        self.buffer = [{} for _ in images]
        



[docs]
    def get_tolerance(self, c): 
        if self.tolerance_type == "ppm":
            return c * self.tolerance * 10e-6
        else:
            return self.tolerance



[docs]
    def make_buffered_image(self, index):
        c = self.elements[index%len(self.elements)]
        image_id = index//len(self.elements)
        if c not in self.buffer[image_id] or self.buffer_type == 'none':
            # create ion image
            ii = self.images[image_id].GetArray(c, self.get_tolerance(c), squeeze=False)
            
            if self.buffer_type == 'memory': # buffer image in memory
                self.buffer[image_id][c] = ii
            else: # buffer image on disk
                hash = hashlib.sha224("{:.8f}".format(c).encode())
                hash.update("{:.8f}".format(self.get_tolerance(c)).encode())
                path = self.buffer_path.joinpath(self.images[image_id].GetImageName())
                path.mkdir(exist_ok=True)
                path = path.joinpath(hash.hexdigest())
                np.save(path ,ii)
                self.buffer[image_id][c] = path

        else: # load buffered entry
            if self.buffer_type == 'memory':
                ii = self.buffer[image_id][c]
            else:
                ii = np.load(str(self.buffer[image_id][c])+".npy")
        
        if self.transforms is not None:
            ii = self.transforms(ii)
        return ii


    def __len__(self) -> int:
        return len(self.elements) * len(self.images)

    def __getitem__(self, index):
        return self.make_buffered_image(index)


[docs]
    def getitems(self, indexes: List[int]):
        return np.stack([self.__getitem__(index) for index in indexes])