📥 Download all notebooks

3.1. Validation: Gene signatures

📘 Overview

This notebook performs quality control on raw count data to filter out low-quality cells, and then uses DESeq2 to compute differential gene expression signatures (compound vs. DMSO) for each compound.

Inputs
Raw gene expression count matrix
Output
An AnnData file with DESeq2 differential expression results (log₂ fold changes, adjusted p-values)
[ ]:
import numpy as np
import pandas as pd
import anndata as ad

import seaborn as sns
import matplotlib.pyplot as plt

import dilimap as dmap
[ ]:
%load_ext autoreload
%autoreload 2
[ ]:
dmap.logging.print_version()

Pull raw data

[4]:
adata = dmap.s3.read('validation_data_counts.h5ad', package_name='public/data')
Package: s3://dilimap/public/data. Top hash: dd0504e0f5
[5]:
from dilimap.utils import platemap
[6]:
## Example plate design

adata.obs['WELL_ROW'] = adata.obs['WELL_ID'].str[0]
adata.obs['WELL_COL'] = adata.obs['WELL_ID'].str[1:]

data = adata[adata.obs['PLATE_NAME'] == 'APP1_S1']

CMPD_COLOR = (
    data.obs['COMPOUND']
    .map(
        {
            'DMSO': 'seashell',
            'DMSO_replaced': 'seashell',
            'Chlorpromazine': 'lightsalmon',
        }
    )
    .fillna('lightblue')
)
CMPD_COLOR_OPACITY = '; opacity: ' + data.obs['DOSE_LEVEL'].astype(str).map(
    {'Control': '1', 'Low': '.4', 'Middle': '.6', 'Mid-High': '.8', 'High': '1'}
)


def highlight_cells(x):
    return (
        'background-color: '
        + x.map(dict(zip(data.obs['CMPD_DOSE'], CMPD_COLOR))).fillna('white')
        + x.map(dict(zip(data.obs['CMPD_DOSE'], CMPD_COLOR_OPACITY)))
    )


df_cmpd_map = platemap(data, value_key='CMPD_DOSE', batch='PLATE_NAME')

col_borders = dict(subset=['4', '8'], **{'border-right': '1.5pt solid black'})
row_borders = dict(
    subset=(df_cmpd_map.index.str.endswith(('A', 'D', 'G')), df_cmpd_map.columns),
    **{'border-top': '1.5pt solid black'},
)

display(
    df_cmpd_map.style.apply(highlight_cells)
    .set_properties(**col_borders)
    .set_properties(**row_borders)
)
  1 2 3 4 5 6 7 8 9 10 11 12
APP1_S1_A Orelabrutinib_High Orelabrutinib_Mid-High Orelabrutinib_Middle Orelabrutinib_Low Ibrutinib_High Ibrutinib_Mid-High Ibrutinib_Middle Ibrutinib_Low Remibrutinib_High Remibrutinib_Mid-High Remibrutinib_Middle Remibrutinib_Low
APP1_S1_B Orelabrutinib_High Orelabrutinib_Mid-High Orelabrutinib_Middle Orelabrutinib_Low Ibrutinib_High Ibrutinib_Mid-High Ibrutinib_Middle Ibrutinib_Low Remibrutinib_High Remibrutinib_Mid-High Remibrutinib_Middle Remibrutinib_Low
APP1_S1_C Orelabrutinib_High Orelabrutinib_Mid-High Orelabrutinib_Middle Orelabrutinib_Low Ibrutinib_High Ibrutinib_Mid-High Ibrutinib_Middle Ibrutinib_Low Remibrutinib_High Remibrutinib_Mid-High Remibrutinib_Middle Remibrutinib_Low
APP1_S1_D Evobrutinib_High Evobrutinib_Mid-High Evobrutinib_Middle Evobrutinib_Low BMS-986142_High BMS-986142_Mid-High BMS-986142_Middle BMS-986142_Low Rilzabrutinib_High Rilzabrutinib_Mid-High Rilzabrutinib_Middle Rilzabrutinib_Low
APP1_S1_E Evobrutinib_High Evobrutinib_Mid-High Evobrutinib_Middle Evobrutinib_Low BMS-986142_High BMS-986142_Mid-High BMS-986142_Middle BMS-986142_Low Rilzabrutinib_High Rilzabrutinib_Mid-High Rilzabrutinib_Middle Rilzabrutinib_Low
APP1_S1_F Evobrutinib_High Evobrutinib_Mid-High Evobrutinib_Middle Evobrutinib_Low BMS-986142_High BMS-986142_Mid-High BMS-986142_Middle BMS-986142_Low Rilzabrutinib_High Rilzabrutinib_Mid-High Rilzabrutinib_Middle Rilzabrutinib_Low
APP1_S1_G DMSO_Control DMSO_Control DMSO_Control DMSO_Control DMSO_Control DMSO_Control DMSO_Control DMSO_Control Chlorpromazine_High Chlorpromazine_High Chlorpromazine_High Chlorpromazine_High

QC (total counts, mtRNA, rRNA)

[7]:
adata.obs['LDH_QC'].value_counts()
[7]:
LDH_QC
              763
tech error     12
cell death      5
Name: count, dtype: int64
[8]:
adata.obs['ldh_qc_pass'] = adata.obs['LDH_QC'] == ''
[9]:
## QC of total counts and mtRNA

dmap.pp.qc_metrics(adata)

thresh_counts = (
    np.median(adata.obs['log_totalRNA']) - 2.5 * adata.obs['log_totalRNA'].std()
)
adata.obs['rna_qc_pass'] = (adata.obs['log_totalRNA'] > thresh_counts) & (
    adata.obs['pct_mtRNA'] < 9
)
Added the following to `adata.obs`: ['log_totalRNA', 'pct_mtRNA', 'pct_rRNA']
[10]:
fig, axs = plt.subplots(1, 3, figsize=(12, 3), gridspec_kw={'wspace': 0.4})

sns.histplot(adata.obs['log_totalRNA'], bins=50, ax=axs[0])
sns.histplot(adata.obs['pct_mtRNA'], bins=200, ax=axs[1])
sns.histplot(adata.obs['pct_rRNA'], bins=50, ax=axs[2])

axs[1].set_xlim(0, 20)

axs[0].axvline(thresh_counts, c='k')
axs[1].axvline(9, c='k')
axs[2].axvline(9, c='k')

plt.show()
../_images/reproducibility_3.1_Validation_Gene_Signatures_12_0.png
[11]:
## Example plate with QC metrics

data = adata[adata.obs['PLATE_NAME'] == 'APP1_S1']


def color_boolean(val):
    return f'color: {"lightred" if val else ""}; font-weight: {"bold" if val else ""}'


for val in ['log_totalRNA', 'pct_mtRNA']:
    print(f'\033[1m{val}')

    df = platemap(data, val, batch='PLATE_NAME')
    df_bool = np.invert(platemap(data, 'rna_qc_pass', batch='PLATE_NAME').astype(bool))

    if val != 'notes_flags':
        mu, std = np.median(data.obs[val]), np.std(data.obs[val])

        display(
            df.style.format(precision=2)
            .background_gradient(vmin=mu - 3 * std, vmax=mu + 3 * std, cmap='RdBu_r')
            .apply(lambda c: df_bool[c.name].apply(color_boolean))
            .applymap(lambda x: 'background-color: white' if pd.isnull(x) else '')
            .set_properties(**col_borders)
            .set_properties(**row_borders)
        )

    else:
        display(
            df.style.format(precision=2)
            .apply(lambda c: df_bool[c.name].apply(color_boolean))
            .applymap(lambda x: 'background-color: salmon' if x != '' else '')
            .set_properties(**col_borders)
            .set_properties(**row_borders)
        )
log_totalRNA
/var/folders/lz/prv79nmj5msg8h6nzqn0w7cw0000gn/T/ipykernel_23937/2583658983.py:22: FutureWarning: Styler.applymap has been deprecated. Use Styler.map instead.
  .applymap(lambda x: 'background-color: white' if pd.isnull(x) else '')
  1 2 3 4 5 6 7 8 9 10 11 12
APP1_S1_A 6.35 6.40 6.35 6.33 6.27 6.27 6.31 6.31 6.29 6.33 6.21 6.27
APP1_S1_B 6.32 6.43 6.68 6.64 6.59 6.57 6.64 6.61 6.59 6.61 6.60 6.57
APP1_S1_C 6.57 6.57 6.51 6.51 6.47 6.44 6.50 6.52 6.46 6.49 6.40 6.45
APP1_S1_D 6.63 6.73 6.71 6.62 6.54 6.57 6.63 6.60 6.59 6.60 6.55 6.57
APP1_S1_E 6.27 3.68 6.31 6.50 6.54 6.58 6.61 6.53 6.51 6.48 6.52 6.55
APP1_S1_F 6.57 6.61 6.58 6.47 6.39 6.42 6.48 6.46 6.42 6.36 6.34 6.37
APP1_S1_G 6.51 6.55 6.51 6.51 6.45 6.45 6.49 6.48 6.46 6.47 6.45 6.45
pct_mtRNA
/var/folders/lz/prv79nmj5msg8h6nzqn0w7cw0000gn/T/ipykernel_23937/2583658983.py:22: FutureWarning: Styler.applymap has been deprecated. Use Styler.map instead.
  .applymap(lambda x: 'background-color: white' if pd.isnull(x) else '')
  1 2 3 4 5 6 7 8 9 10 11 12
APP1_S1_A 3.03 3.66 3.40 3.43 3.15 3.56 3.45 3.03 2.94 3.51 3.23 3.18
APP1_S1_B 3.88 3.55 3.26 3.27 2.95 3.35 3.36 3.07 3.64 3.20 3.18 3.14
APP1_S1_C 3.94 3.69 3.53 3.55 3.08 3.51 3.72 3.26 3.83 3.72 3.68 3.56
APP1_S1_D 2.98 4.92 4.08 4.06 3.10 3.79 3.94 3.73 2.98 3.69 3.56 3.50
APP1_S1_E 5.96 3.86 6.88 3.95 3.54 3.41 3.63 3.61 3.83 3.74 3.25 3.35
APP1_S1_F 4.35 4.58 4.15 4.21 3.74 3.38 3.75 3.42 3.95 3.63 3.32 3.22
APP1_S1_G 3.67 3.60 3.32 3.38 3.54 3.54 3.91 3.34 2.73 2.59 2.70 2.51

QC (cross-replicate correlation)

[12]:
dmap.pp.qc_cross_rep_correlation(adata, group_key='CMPD_DOSE', plate_key='PLATE_NAME')
Added the following to `adata.obs`: ['cross_rep_correlation', 'rep_corr_qc_pass']
[13]:
fig, ax = plt.subplots(1, 1, figsize=(4, 3), gridspec_kw={'wspace': 0.4})

sns.histplot(
    adata.obs['cross_rep_correlation'][
        adata.obs['ldh_qc_pass'] & adata.obs['rna_qc_pass']
    ],
    bins=100,
    ax=ax,
)
ax.axvline(0.99, c='k')

plt.show()
../_images/reproducibility_3.1_Validation_Gene_Signatures_16_0.png
[14]:
## Example plate with cross-replicate correlations

data = adata[adata.obs['PLATE_NAME'] == 'APP1_S1']

val = 'cross_rep_correlation'

df = platemap(data, val, batch='PLATE_NAME')
df_bool = np.invert(
    platemap(data, 'rep_corr_qc_pass', batch='PLATE_NAME').fillna(False).astype(bool)
)
mu, std = np.nanmean(data.obs[val]), np.nanstd(data.obs[val])

col_borders = dict(subset=['4', '8'], **{'border-right': '1.5pt solid black'})
row_borders = dict(
    subset=(df.index.str.endswith(('A', 'D', 'G')), df.columns),
    **{'border-top': '1.5pt solid black'},
)

print(val, f'CI=[{mu - 2 * std:0.3f},{1}]')
display(
    df.style.format(precision=3)
    .background_gradient(vmin=mu - 5 * std, vmax=1, cmap='Reds_r')
    .apply(lambda c: df_bool[c.name].apply(color_boolean))
    .applymap(lambda x: 'background-color: white' if pd.isnull(x) else '')
    .set_properties(**col_borders)
    .set_properties(**row_borders)
)
cross_rep_correlation CI=[0.964,1]
/var/folders/lz/prv79nmj5msg8h6nzqn0w7cw0000gn/T/ipykernel_23937/2197300645.py:21: FutureWarning: Styler.applymap has been deprecated. Use Styler.map instead.
  .applymap(lambda x: 'background-color: white' if pd.isnull(x) else '')
  1 2 3 4 5 6 7 8 9 10 11 12
APP1_S1_A 0.929 0.998 0.997 0.998 0.956 0.997 0.997 0.997 0.976 0.997 0.998 0.998
APP1_S1_B 0.999 0.999 0.999 0.998 0.999 0.998 0.996 1.000 0.998 0.998 0.999 0.999
APP1_S1_C 0.999 0.999 0.999 0.998 0.999 0.998 0.997 1.000 0.998 0.998 0.999 0.999
APP1_S1_D 0.915 0.998 0.998 0.998 0.930 0.998 0.997 0.999 0.973 0.999 0.999 0.999
APP1_S1_E 0.994 0.964 0.988 0.998 0.999 0.999 0.997 0.999 0.999 0.999 0.999 0.999
APP1_S1_F 0.994 0.998 0.998 0.998 0.999 0.999 0.997 0.999 0.999 0.999 0.998 0.999
APP1_S1_G 0.998 0.998 0.999 0.999 0.999 0.999 0.999 0.999 0.999 0.999 0.998 0.999
[15]:
from natsort import index_natsorted

sns.set(font_scale=0.8)

df_cmpd_flags = data.obs[~data.obs['rep_corr_qc_pass']]['COMPOUND'].value_counts()
df_cmpd_flags = df_cmpd_flags[~df_cmpd_flags.index.str.contains('DMSO')]
cmpd_flags = df_cmpd_flags.index[:8]

fig, axs = plt.subplots(
    1, len(cmpd_flags), figsize=(5 * len(cmpd_flags), 3), gridspec_kw={'wspace': 0.5}
)
for i, cmpd_name in enumerate(cmpd_flags):
    df_sub = data[data.obs['COMPOUND'] == cmpd_name].to_df()
    df_sub.index = data[data.obs['COMPOUND'] == cmpd_name].obs['WELL_ID']

    df_sub_corr = df_sub.T.corr()
    idx_sort = index_natsorted(df_sub_corr.index.str[1:] + df_sub_corr.index.str[0])
    df_sub_corr = df_sub_corr.iloc[idx_sort, idx_sort]

    g = sns.heatmap(df_sub_corr, vmax=1, vmin=0.9, yticklabels=True, ax=axs[i])

    flagged = (
        data[(data.obs['COMPOUND'] == cmpd_name) & ~data.obs['rep_corr_qc_pass']]
        .obs['WELL_ID']
        .values
    )
    for label in g.get_yticklabels():
        if label.get_text() in flagged:
            label.set_color('red')

    axs[i].set_title(cmpd_name)
../_images/reproducibility_3.1_Validation_Gene_Signatures_18_0.png
[16]:
cmpd_flags = ['DMSO', 'Chlorpromazine']

fig, axs = plt.subplots(
    1, len(cmpd_flags), figsize=(5 * len(cmpd_flags), 3), gridspec_kw={'wspace': 0.5}
)
for i, cmpd_name in enumerate(cmpd_flags):
    df_sub = data[data.obs['COMPOUND'] == cmpd_name].to_df()
    df_sub.index = data[data.obs['COMPOUND'] == cmpd_name].obs['PLATE_NAME'].astype(str)

    df_sub_corr = df_sub.T.corr()
    idx_sort = index_natsorted(df_sub_corr.index.str[1:] + df_sub_corr.index.str[0])
    df_sub_corr = df_sub_corr.iloc[idx_sort, idx_sort]

    g = sns.heatmap(df_sub_corr, vmax=1, vmin=0.9, yticklabels=4, ax=axs[i])

    flagged = (
        data[(data.obs['COMPOUND'] == cmpd_name) & ~data.obs['rep_corr_qc_pass']]
        .obs['WELL_ID']
        .values
    )
    for label in g.get_yticklabels():
        if label.get_text() in flagged:
            label.set_color('red')

    axs[i].set_title(cmpd_name)
../_images/reproducibility_3.1_Validation_Gene_Signatures_19_0.png

QC (hepatocyte fidelity sanity check)

[17]:
# Log-normalize total counts (similar to sc.pp.normalize_total and sc.pp.log1p)

X = adata.X if isinstance(adata.X, np.ndarray) else adata.X.toarray()
cell_sums = X.sum(axis=1, keepdims=True)
X_norm = (X / cell_sums) * np.median(cell_sums)
X_norm = np.log10(X_norm + 1)

adata_norm = ad.AnnData(X=X_norm, obs=adata.obs, var=adata.var)
/opt/anaconda3/envs/py310/lib/python3.10/site-packages/anndata/_core/anndata.py:1756: UserWarning: Observation names are not unique. To make them unique, call `.obs_names_make_unique`.
  utils.warn_names_duplicates("obs")
[18]:
# Celltype markers curated from https://panglaodb.se/markers.html and https://pmc.ncbi.nlm.nih.gov/articles/PMC6687507

marker_celltypes = {
    # Hepatocytes, cluster 14 in liver atlas
    'Hepatocytes': [
        'ALB',
        'CYP3A4',
        'APOA1',
        'APOA2',
        'APOC3',
        'TF',
        'APOH',
        'ALDOB',
        'TTR',
        'GSTA2',
    ],
    # Kupffer, from panglaodb
    'Kupffer': ['C1QB', 'C1QA', 'CD163', 'MAFB', 'CSF1R', 'CLEC4F', 'FCNA', 'ADGRE1'],
    # Liver Sinusoidal Endothelial Cells (LSEC), cluster 9 in liver atlas
    'LSEC': ['EGFL7', 'FLT1', 'CLEC4M', 'CLEC4G', 'FCN3', 'STAB1', 'DNASE1L3'],
    # Hepatic Stellate Cells (HSC), from panglaodb
    'HSC': ['IGFBP6', 'COL1A2', 'SPARC', 'TPM2', 'COL3A1', 'MYL9', 'DCN'],
    # Cholangiocyte, cluster 4+7 in liver atlas
    'Cholangiocyte': ['AGR2', 'ELF3', 'CLDN4', 'KRT7', 'AQP1'],  # "KRT19",
    # Dedifferentiation, low in mature, high in dedifferentiated
    'Dedifferentiated': [
        'KRT19',  # Biliary/progenitor marker
        'EPCAM',  # Also indicates epithelial progenitors
        'LAMB1',  # ECM remodeling and stress-induced dedifferentiation
        'PROM1',  # Progenitor/stemness marker
        'LGR5',  # Liver progenitor marker
        'AFP',  # Strongly fetal/hepatoblast marker
    ],
}

all_markers = np.hstack(list(marker_celltypes.values()))
all_markers = [g for g in all_markers if g in adata_norm.var_names]

idx = (
    adata_norm.obs['ldh_qc_pass']
    & adata_norm.obs['rna_qc_pass']
    & adata_norm.obs['rep_corr_qc_pass']
)
marker_expr = adata_norm[
    idx & (adata_norm.obs['COMPOUND'] == 'DMSO'), all_markers
].to_df()
[19]:
import matplotlib.pyplot as plt
import numpy as np

# Color palette
celltype_palette = {
    'Hepatocytes': '#1f77b4',
    'Kupffer': '#d62728',
    'LSEC': '#2ca02c',
    'HSC': '#9467bd',
    'Cholangiocyte': '#ff7f0e',
    'Dedifferentiated': '#7f7f7f',
}

# Flatten mapping: gene → celltype
marker_to_celltype = {g: ct for ct, genes in marker_celltypes.items() for g in genes}
genes = list(marker_expr.columns)
celltypes = [marker_to_celltype.get(g, 'Unknown') for g in genes]

# Plot
fig, ax = plt.subplots(figsize=(8, 3.5))
flierprops = dict(
    marker='o', markersize=1.5, linestyle='none', markerfacecolor='black', alpha=0.4
)
box = marker_expr.boxplot(
    rot=90,
    patch_artist=True,
    flierprops=flierprops,
    boxprops=dict(color='black', linewidth=1.2),
    ax=ax,
)

# Main x-axis: gene names
xticks = np.arange(1, len(genes) + 1)
ax.set_xticks(xticks)
ax.set_xticklabels(genes, rotation=90, fontsize=8)
for label in ax.get_xticklabels():
    gene = label.get_text()
    celltype = marker_to_celltype.get(gene, 'black')
    label.set_color(celltype_palette.get(celltype, 'black'))

# Y-axis: log10(counts) → raw counts
ax.set_yticks([0, 1, 2, 3, 4, 5])
ax.set_yticklabels(['1', '10', '100', '1k', '10k', '100k'])

# Horizontal guide
ax.axhline(3, ls='--', lw=1, color='gray')

# Compute midpoints for grouped cell types
unique_blocks = []
start_idx = 0
for i in range(1, len(genes)):
    if celltypes[i] != celltypes[i - 1]:
        mid = (start_idx + i - 1) / 2 + 1
        unique_blocks.append((mid, celltypes[i - 1]))
        start_idx = i
mid = (start_idx + len(genes) - 1) / 2 + 1
unique_blocks.append((mid, celltypes[-1]))

# Add cell type labels below the plot (just below x-axis)
y_pos = -0.5  # adjust as needed
for x, label_text in unique_blocks:
    ax.text(
        x,
        y_pos,
        label_text,
        ha='center',
        va='top',
        fontsize=9,
        fontweight='bold',
        color=celltype_palette.get(label_text, 'black'),
        transform=ax.get_xaxis_transform(),
    )

# Final touches
ax.set_title('Expression of Liver Cell-Type Markers', fontsize=12)
ax.set_ylabel('Counts (normalized)', fontsize=10)
plt.tight_layout()
plt.show()
../_images/reproducibility_3.1_Validation_Gene_Signatures_23_0.png

DESeq2

[ ]:
%%capture

from dask.distributed import LocalCluster, Client

cluster = LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='auto')
client = Client(cluster)
[ ]:
adatas = {}

for plate in np.unique(adata.obs['PLATE_NAME']):
    adata_batch = adata[adata.obs['PLATE_NAME'] == plate].copy()
    adata_batch = adata_batch[
        adata_norm.obs['ldh_qc_pass']
        & adata_norm.obs['rna_qc_pass']
        & adata_norm.obs['rep_corr_qc_pass']
    ].copy()

    adata_batch.obs_names_make_unique()

    adatas[plate] = dmap.pp.deseq2(
        adata_batch,
        pert_name_col='COMPOUND',
        other_pert_cols=['DOSE_LEVEL'],
        dask_client=client,
    )

    adatas[plate].obs['PLATE_NAME'] = plate
[ ]:
adata_deseq = ad.concat(adatas.values(), join='outer')
[ ]:
np.sum(~np.isnan(adata_deseq.X[~np.all(np.isnan(adata_deseq.X), 1)].sum(0)))
[ ]:
adata_deseq = adata_deseq[~np.all(np.isnan(adata_deseq.X), 1)].copy()

Push file to S3

[ ]:
# dmap.s3.write(adata_deseq, 'validation_data_deseq2.h5ad', package_name='public/data')