Just for fun: UMAPs#

In this notebook we demonstrate how repeatedly executed UMAPs, sorted to arrange changes between them, lead to a funny video of jumping UMAPs.

from skimage.data import human_mitosis
from napari_segment_blobs_and_things_with_membranes import voronoi_otsu_labeling
from napari_simpleitk_image_processing import label_statistics
from sklearn.preprocessing import StandardScaler
import numpy as np
import umap
import seaborn
import pyclesperanto_prototype as cle
import matplotlib.pyplot as plt

Example data#

First, we load the example image (taken from the scikit-image examples) showing human cell nuclei, segment them and measure properties of objects.

# Load the example nuclei image, segment it and measure per-nucleus features.
image = cle.asarray(human_mitosis())
# Voronoi-Otsu labeling (pyclesperanto variant): spot_sigma sets the detection
# scale, outline_sigma smooths the label outlines.
labels = cle.voronoi_otsu_labeling(image, spot_sigma=2.5, outline_sigma=0)
# Measure intensity, size, shape, perimeter and moment statistics per label;
# returns a table with one row per labeled nucleus.
nuclei_statistics = label_statistics(image, labels, 
                                     intensity=True, 
                                     size=True, 
                                     shape=True, 
                                     perimeter=True,
                                     moments=True)
# Notebook display: peek at the first 15 rows of the measurement table.
nuclei_statistics.head(15)
label maximum mean median minimum sigma sum variance elongation feret_diameter ... number_of_pixels_on_border perimeter perimeter_on_border perimeter_on_border_ratio principal_axes0 principal_axes1 principal_axes2 principal_axes3 principal_moments0 principal_moments1
0 1 83.0 69.886364 73.359375 48.0 9.890601 3075.0 97.823996 1.858735 9.486833 ... 10 25.664982 10.0 0.389636 0.998995 -0.044825 0.044825 0.998995 1.926448 6.655680
1 2 46.0 42.125000 43.328125 38.0 2.748376 337.0 7.553571 0.000000 7.000000 ... 8 15.954349 8.0 0.501431 1.000000 0.000000 0.000000 1.000000 0.000000 5.250000
2 3 53.0 44.916667 45.265625 38.0 4.461111 539.0 19.901515 3.609511 6.000000 ... 7 14.843629 7.0 0.471583 1.000000 0.000000 0.000000 1.000000 0.243056 3.166667
3 4 92.0 65.603175 65.609375 38.0 14.475273 4133.0 209.533538 1.212933 10.000000 ... 7 29.687257 7.0 0.235791 0.914511 0.404561 -0.404561 0.914511 4.227271 6.219189
4 5 59.0 46.315068 46.234375 38.0 5.065890 3381.0 25.663242 1.249507 11.401754 ... 9 34.264893 9.0 0.262660 0.363116 -0.931744 0.931744 0.363116 5.001247 7.808286
5 6 75.0 54.087838 53.984375 38.0 8.394952 8005.0 70.475225 1.280946 14.866069 ... 6 42.864805 6.0 0.139975 0.183365 -0.983045 0.983045 0.183365 9.187499 15.075056
6 7 96.0 61.033333 62.703125 38.0 15.166831 3662.0 230.032768 3.402583 15.132746 ... 0 36.716372 0.0 0.000000 -0.014158 -0.999900 0.999900 -0.014158 1.446932 16.751957
7 8 100.0 70.000000 74.328125 39.0 16.138291 4480.0 260.444444 1.022473 9.219544 ... 0 29.917295 0.0 0.000000 0.917896 -0.396820 0.396820 0.917896 5.073067 5.303642
8 9 65.0 53.117647 53.015625 38.0 6.358751 3612.0 40.433714 1.299197 9.848858 ... 0 29.687257 0.0 0.000000 0.108143 -0.994135 0.994135 0.108143 4.175749 7.048300
9 10 79.0 59.594937 60.765625 38.0 9.191020 4708.0 84.474846 1.412593 11.661904 ... 0 36.946410 0.0 0.000000 0.957268 0.289203 -0.289203 0.957268 4.709643 9.397712
10 11 80.0 59.333333 58.828125 38.0 10.281620 4450.0 105.711712 1.136874 10.000000 ... 0 30.472655 0.0 0.000000 0.494998 -0.868894 0.868894 0.494998 5.244425 6.778330
11 12 96.0 69.666667 71.421875 39.0 14.848699 4389.0 220.483871 1.177721 9.486833 ... 0 28.021176 0.0 0.000000 -0.266249 -0.963904 0.963904 -0.266249 4.246311 5.889743
12 13 102.0 78.533333 82.078125 42.0 16.168994 3534.0 261.436364 2.122467 11.000000 ... 12 27.791138 12.0 0.431792 -0.020958 -0.999780 0.999780 -0.020958 1.781948 8.027435
13 14 72.0 53.914286 53.984375 38.0 6.819867 7548.0 46.510586 1.226967 14.317821 ... 0 42.864805 0.0 0.000000 0.972168 -0.234285 0.234285 0.972168 9.112690 13.718688
14 15 99.0 72.049180 73.359375 38.0 17.157531 4395.0 294.380874 1.298605 9.433981 ... 0 28.021176 0.0 0.000000 0.562829 -0.826574 0.826574 0.562829 3.746099 6.317325

15 rows × 27 columns

Feature selection and scaling#

Before applying dimensionality reduction, we select columns and scale the data.

# Select three per-nucleus features to feed into dimensionality reduction.
feature_columns = [
    "mean",
    "variance",
    "elongation",
]
selected_table = nuclei_statistics[feature_columns]

# Extract the raw numpy values from the table.
selected_statistics = selected_table.values

# Standardize each feature to zero mean and unit variance.
scaled_statistics = StandardScaler().fit_transform(selected_statistics)

# Notebook display: confirm the array type and its shape.
type(scaled_statistics), scaled_statistics.shape
(numpy.ndarray, (320, 3))

Computing many UMAPs#

The algorithm behind UMAP is partially non-deterministic. Thus, if you run the same code multiple times and sort the results, you can get happily jumping UMAPs. Technically, the only differences between the UMAPs shown below are local and global rotations.

from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, leaves_list
import os
import tempfile
from PIL import Image

# Generate 100 different UMAP embeddings.
# UMAP is stochastic, so each run yields a differently arranged/rotated embedding.
print("Generating 100 UMAP embeddings...")
umap_embeddings = []
for i in range(100):
    umap_reducer = umap.UMAP()
    umap_embedding = umap_reducer.fit_transform(scaled_statistics)
    umap_embeddings.append(umap_embedding)

# Calculate similarity between embeddings:
# flatten each embedding into one feature vector per run, then compute the
# condensed pairwise euclidean distance matrix between runs.
print("Calculating similarities between embeddings...")
flattened_embeddings = [emb.flatten() for emb in umap_embeddings]
distance_matrix = pdist(flattened_embeddings, metric='euclidean')

# Sort embeddings by similarity using hierarchical clustering;
# the dendrogram leaf order places similar embeddings next to each other,
# which makes the later animation change smoothly between frames.
print("Sorting embeddings by similarity...")
linkage_matrix = linkage(distance_matrix, method='ward')
sorted_indices = leaves_list(linkage_matrix)
sorted_embeddings = [umap_embeddings[i] for i in sorted_indices]

# Create temporary directory for saving plots.
temp_dir = tempfile.mkdtemp()
print(f"Creating plots and saving to {temp_dir}...")

# Render each sorted embedding to a fixed-size PNG so all frames align.
plot_files = []
for i, umap_embedding in enumerate(sorted_embeddings):
    fig, ax = plt.subplots(figsize=(6.4, 4.8))  # 640x480 pixels at 100 DPI
    # Color points by row index so each nucleus keeps its color across frames.
    ax.scatter(umap_embedding[:, 0], umap_embedding[:, 1], s=10,
               c=range(len(umap_embedding)), cmap='viridis')
    ax.set_title(f'UMAP {i+1}')
    ax.set_xlabel('UMAP 1')
    ax.set_ylabel('UMAP 2')

    # Save plot; bbox_inches=None keeps the canvas size identical per frame.
    filename = os.path.join(temp_dir, f'umap_{i:03d}.png')
    plt.savefig(filename, dpi=100, bbox_inches=None)
    plot_files.append(filename)
    plt.close(fig)

# Load images back into numpy array.
print("Loading images into numpy array...")
plot_images = []
for filename in plot_files:
    # Context manager guarantees the file handle is closed before we delete
    # the file below (required on Windows).
    with Image.open(filename) as img:
        plot_images.append(np.array(img))

# Stack frames into a single (n_frames, height, width, channels) array.
images_array = np.array(plot_images)
print(f"Final array shape: {images_array.shape}")

# Clean up temporary files.
for filename in plot_files:
    os.remove(filename)
os.rmdir(temp_dir)

print("Done! The sorted UMAP plots are stored in 'images_array'")
print(f"Array contains {len(images_array)} images of shape {images_array[0].shape}")
Generating 100 UMAP embeddings...
Calculating similarities between embeddings...
Sorting embeddings by similarity...
Creating plots and saving to C:\Users\rober\AppData\Local\Temp\tmp332cayzj...
Loading images into numpy array...
Final array shape: (100, 480, 640, 4)
Done! The sorted UMAP plots are stored in 'images_array'
Array contains 100 images of shape (480, 640, 4)
import stackview
# Assemble the sorted frames into an animated GIF of "jumping" UMAPs.
stackview.animate(images_array, filename="happy_umap.gif")
C:\Users\rober\miniforge3\envs\bio11\Lib\site-packages\stackview\_animate.py:63: UserWarning: The image is quite large (> 10 MByte) and might not be properly shown in the notebook when rendered over the internet. Consider subsampling or cropping the image for visualization purposes.
  warnings.warn("The image is quite large (> 10 MByte) and might not be properly shown in the notebook when rendered over the internet. Consider subsampling or cropping the image for visualization purposes.")