Just for fun: UMAPs#

In this notebook we demonstrate how repeatedly executed UMAPs, sorted to arrange changes between them, lead to a funny video of jumping UMAPs.

from skimage.data import human_mitosis
from napari_segment_blobs_and_things_with_membranes import voronoi_otsu_labeling
from napari_simpleitk_image_processing import label_statistics
from sklearn.preprocessing import StandardScaler
import numpy as np
import umap
import seaborn
import pyclesperanto_prototype as cle
import matplotlib.pyplot as plt

Example data#

First, we load the example image (taken from the scikit-image examples) showing human cell nuclei, segment them and measure properties of objects.

# Load the example nuclei image, segment it and measure per-nucleus features.
image = cle.asarray(human_mitosis())
# Voronoi-Otsu labeling (pyclesperanto variant): spot_sigma sets the detection
# scale, outline_sigma smooths the label outlines.
labels = cle.voronoi_otsu_labeling(image, spot_sigma=2.5, outline_sigma=0)
# Measure intensity, size, shape, perimeter and moment statistics per label;
# returns a table with one row per labeled nucleus.
nuclei_statistics = label_statistics(image, labels, 
                                     intensity=True, 
                                     size=True, 
                                     shape=True, 
                                     perimeter=True,
                                     moments=True)
# Notebook display: peek at the first 15 rows of the measurement table.
nuclei_statistics.head(15)
label maximum mean median minimum sigma sum variance elongation feret_diameter ... number_of_pixels_on_border perimeter perimeter_on_border perimeter_on_border_ratio principal_axes0 principal_axes1 principal_axes2 principal_axes3 principal_moments0 principal_moments1
0 1 83.0 69.886364 73.359375 48.0 9.890601 3075.0 97.823996 1.858735 9.486833 ... 10 25.664982 10.0 0.389636 0.998995 -0.044825 0.044825 0.998995 1.926448 6.655680
1 2 46.0 42.125000 43.328125 38.0 2.748376 337.0 7.553571 0.000000 7.000000 ... 8 15.954349 8.0 0.501431 1.000000 0.000000 0.000000 1.000000 0.000000 5.250000
2 3 53.0 44.916667 45.265625 38.0 4.461111 539.0 19.901515 3.609511 6.000000 ... 7 14.843629 7.0 0.471583 1.000000 0.000000 0.000000 1.000000 0.243056 3.166667
3 4 92.0 65.603175 65.609375 38.0 14.475273 4133.0 209.533538 1.212933 10.000000 ... 7 29.687257 7.0 0.235791 0.914511 0.404561 -0.404561 0.914511 4.227271 6.219189
4 5 59.0 46.315068 46.234375 38.0 5.065890 3381.0 25.663242 1.249507 11.401754 ... 9 34.264893 9.0 0.262660 0.363116 -0.931744 0.931744 0.363116 5.001247 7.808286
5 6 75.0 54.087838 53.984375 38.0 8.394952 8005.0 70.475225 1.280946 14.866069 ... 6 42.864805 6.0 0.139975 0.183365 -0.983045 0.983045 0.183365 9.187499 15.075056
6 7 96.0 61.033333 62.703125 38.0 15.166831 3662.0 230.032768 3.402583 15.132746 ... 0 36.716372 0.0 0.000000 -0.014158 -0.999900 0.999900 -0.014158 1.446932 16.751957
7 8 100.0 70.000000 74.328125 39.0 16.138291 4480.0 260.444444 1.022473 9.219544 ... 0 29.917295 0.0 0.000000 0.917896 -0.396820 0.396820 0.917896 5.073067 5.303642
8 9 65.0 53.117647 53.015625 38.0 6.358751 3612.0 40.433714 1.299197 9.848858 ... 0 29.687257 0.0 0.000000 0.108143 -0.994135 0.994135 0.108143 4.175749 7.048300
9 10 79.0 59.594937 60.765625 38.0 9.191020 4708.0 84.474846 1.412593 11.661904 ... 0 36.946410 0.0 0.000000 0.957268 0.289203 -0.289203 0.957268 4.709643 9.397712
10 11 80.0 59.333333 58.828125 38.0 10.281620 4450.0 105.711712 1.136874 10.000000 ... 0 30.472655 0.0 0.000000 0.494998 -0.868894 0.868894 0.494998 5.244425 6.778330
11 12 96.0 69.666667 71.421875 39.0 14.848699 4389.0 220.483871 1.177721 9.486833 ... 0 28.021176 0.0 0.000000 -0.266249 -0.963904 0.963904 -0.266249 4.246311 5.889743
12 13 102.0 78.533333 82.078125 42.0 16.168994 3534.0 261.436364 2.122467 11.000000 ... 12 27.791138 12.0 0.431792 -0.020958 -0.999780 0.999780 -0.020958 1.781948 8.027435
13 14 72.0 53.914286 53.984375 38.0 6.819867 7548.0 46.510586 1.226967 14.317821 ... 0 42.864805 0.0 0.000000 0.972168 -0.234285 0.234285 0.972168 9.112690 13.718688
14 15 99.0 72.049180 73.359375 38.0 17.157531 4395.0 294.380874 1.298605 9.433981 ... 0 28.021176 0.0 0.000000 0.562829 -0.826574 0.826574 0.562829 3.746099 6.317325

15 rows × 27 columns

Feature selection and scaling#

Before applying dimensionality reduction, we select columns and scale the data.

# Select three per-nucleus features to feed into dimensionality reduction.
feature_columns = [
    "mean",
    "variance",
    "elongation",
]
selected_table = nuclei_statistics[feature_columns]

# Extract the raw numpy values from the table.
selected_statistics = selected_table.values

# Standardize each feature to zero mean and unit variance.
scaled_statistics = StandardScaler().fit_transform(selected_statistics)

# Notebook display: confirm the array type and its shape.
type(scaled_statistics), scaled_statistics.shape
(numpy.ndarray, (320, 3))

Computing many UMAPs#

The algorithm behind UMAP is partially non-deterministic. Thus, if you run the same code multiple times and sort the results, you can get happily jumping UMAPs. Technically, the only differences between the UMAPs shown below are local and global rotations.

from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list
import os
import tempfile
from PIL import Image

# Generate 100 different UMAP embeddings.
# UMAP is stochastic, so each run yields a differently arranged/rotated embedding.
print("Generating 100 UMAP embeddings...")
umap_embeddings = []
for i in range(100):
    umap_reducer = umap.UMAP()
    umap_embedding = umap_reducer.fit_transform(scaled_statistics)
    umap_embeddings.append(umap_embedding)

# Calculate similarity between embeddings:
# flatten each embedding into one feature vector per run, then compute the
# condensed pairwise euclidean distance matrix between runs.
print("Calculating similarities between embeddings...")
flattened_embeddings = [emb.flatten() for emb in umap_embeddings]
distance_matrix = pdist(flattened_embeddings, metric='euclidean')

# Sort embeddings by similarity using hierarchical clustering;
# the dendrogram leaf order places similar embeddings next to each other,
# which makes the later animation change smoothly between frames.
print("Sorting embeddings by similarity...")
linkage_matrix = linkage(distance_matrix, method='ward')
sorted_indices = leaves_list(linkage_matrix)
sorted_embeddings = [umap_embeddings[i] for i in sorted_indices]

# Create temporary directory for saving plots.
temp_dir = tempfile.mkdtemp()
print(f"Creating plots and saving to {temp_dir}...")

# Render each sorted embedding to a fixed-size PNG so all frames align.
plot_files = []
for i, umap_embedding in enumerate(sorted_embeddings):
    fig, ax = plt.subplots(figsize=(6.4, 4.8))  # 640x480 pixels at 100 DPI
    # Color points by row index so each nucleus keeps its color across frames.
    ax.scatter(umap_embedding[:, 0], umap_embedding[:, 1], s=10,
               c=range(len(umap_embedding)), cmap='viridis')
    ax.set_title(f'UMAP {i+1}')
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')

    # Save plot; bbox_inches=None keeps the canvas size identical per frame.
    filename = os.path.join(temp_dir, f'umap_{i:03d}.png')
    plt.savefig(filename, dpi=100, bbox_inches=None)
    plot_files.append(filename)
    plt.close(fig)

# Load images back into numpy array.
print("Loading images into numpy array...")
plot_images = []
for filename in plot_files:
    # Context manager guarantees the file handle is closed before we delete
    # the file below (required on Windows).
    with Image.open(filename) as img:
        plot_images.append(np.array(img))

# Stack frames into a single (n_frames, height, width, channels) array.
images_array = np.array(plot_images)
print(f"Final array shape: {images_array.shape}")

# Clean up temporary files.
for filename in plot_files:
    os.remove(filename)
os.rmdir(temp_dir)

print("Done! The sorted UMAP plots are stored in 'images_array'")
print(f"Array contains {len(images_array)} images of shape {images_array[0].shape}")
Generating 100 UMAP embeddings...
Calculating similarities between embeddings...
Sorting embeddings by similarity...
Creating plots and saving to C:\Users\rober\AppData\Local\Temp\tmp332cayzj...
Loading images into numpy array...
Final array shape: (100, 480, 640, 4)
Done! The sorted UMAP plots are stored in 'images_array'
Array contains 100 images of shape (480, 640, 4)
import stackview
# Assemble the sorted frames into an animated GIF of "jumping" UMAPs.
stackview.animate(images_array, filename="happy_umap.gif")
C:\Users\rober\miniforge3\envs\bio11\Lib\site-packages\stackview\_animate.py:63: UserWarning: The image is quite large (> 10 MByte) and might not be properly shown in the notebook when rendered over the internet. Consider subsampling or cropping the image for visualization purposes.
  warnings.warn("The image is quite large (> 10 MByte) and might not be properly shown in the notebook when rendered over the internet. Consider subsampling or cropping the image for visualization purposes.")