📥 Download all notebooks

1.1. DataPrep: DILI labels

📘 Overview

This notebook integrates clinical DILI annotations (from resources such as DILIrank and LiverTox) with compound-level metadata (e.g., SMILES, molecular weight). It assigns consensus DILI labels for all compounds and generates a unified dataset.

Inputs
- DILIrank annotations based on FDA-approved labels
- DILIst binary DILI classifications
- LiverTox annotations of DILI mechanisms and likelihood scores
- ChEMBL and Selleckchem for compound metadata such as SMILES

Output
A consolidated CSV file containing:
- Compound metadata
- DILI annotations from each source
- Final consensus DILI label
[ ]:
%%capture

!pip install PyPDF2
[1]:
import os
import requests

import numpy as np
import pandas as pd

import tarfile
import PyPDF2

import dilimap as dmap
[2]:
# Enable IPython's autoreload extension in mode 2 (re-import modules before running each cell)
%load_ext autoreload
%autoreload 2
[3]:
# Record the dilimap / Python versions and the run timestamp for provenance
dmap.logging.print_version()
Running dilimap 1.0.2 (python 3.10.16) on 2025-06-29 15:26.

1. Download relevant files to S3

[4]:
# Local working directory for downloaded / converted files. The trailing slash is required:
# paths below are built by plain string concatenation, e.g. f'{filedir}DILIrank.csv'.
filedir = '../data/'
[5]:
# ⚠️ Set to True when updating DILIrank / DILIst / LiverTox files from FDA sources
if False:
    os.makedirs(filedir, exist_ok=True)

    # DILIrank (https://www.sciencedirect.com/science/article/abs/pii/S1359644616300411)
    !curl -L 'https://www.fda.gov/media/113052/download' -o '../data/DILIrank.xlsx'
    pd.read_excel(
        f'{filedir}DILIrank.xlsx', engine='openpyxl', header=1, index_col=0
    ).to_csv(f'{filedir}DILIrank.csv')

    # DILIst (https://www.sciencedirect.com/science/article/pii/S1359644619303824)
    !curl -L 'https://www.fda.gov/media/160597/download' -o '../data/DILIst.xlsx'
    pd.read_excel(
        f'{filedir}DILIst.xlsx', engine='openpyxl', header=0, index_col=0
    ).to_csv(f'{filedir}DILIst.csv')

    # LiverTox (https://www.ncbi.nlm.nih.gov/books/NBK547852/)
    !curl -L 'https://www.ncbi.nlm.nih.gov/books/NBK571102/bin/masterlist01-25.xlsx' -o '../data/LiverTox.xlsx'
    pd.read_excel(
        f'{filedir}LiverTox.xlsx', engine='openpyxl', header=1, index_col=0
    ).dropna(how='all')[:1644].to_csv(f'{filedir}LiverTox.csv')

    # Selleckchem (https://www.selleckchem.com/screening/chemical-library.html)
    !curl 'https://file.selleckchem.com/downloads/library/20250519-L1700-Bioactive-Compound-Library-I-96-well.xlsx' -o '../data/Selleckchem.xlsx'
    df_selleckchem = pd.read_excel(
        f'{filedir}Selleckchem.xlsx',
        sheet_name='L1700-Bioactive-9934 cpds',
        index_col=0,
    ).to_csv(f'{filedir}Selleckchem.csv')

    # Push to S3
    for filename in ['DILIst', 'DILIrank', 'LiverTox', 'Selleckchem']:
        dmap.s3.write(
            f'{filedir}{filename}.csv',
            filename=f'{filename}.csv',
            package_name='public/data',
        )

2. DILI labels (DILIrank, DILIst & LiverTox)

[8]:
def lower(index):
    """Return `index` with every string lower-cased.

    Works on any pandas object exposing the `.str` accessor (Index or Series).
    """
    string_ops = index.str
    return string_ops.lower()


def drop_duplicated_index(df, keep='first'):
    """Drop rows whose index label repeats when compared case-insensitively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a string index.
    keep : {'first', 'last', False}, default 'first'
        Which of the colliding rows to retain; forwarded to `Index.duplicated`.
        (Previously this argument was accepted but silently ignored.)

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that are not flagged as duplicates.
    """
    is_duplicate = df.index.str.lower().duplicated(keep=keep)
    return df[~is_duplicate]


def capitalize_if_lower(index):
    """Capitalize labels that are entirely lower-case; leave all others untouched.

    Returns a plain list in the same order as `index`.
    """
    labels = []
    for label in index:
        labels.append(label.capitalize() if label.islower() else label)
    return labels
[9]:
# Load the source tables through dmap.s3.read (one call per CSV)
df_DILIrank = dmap.s3.read('DILIrank.csv')
df_DILIst = dmap.s3.read('DILIst.csv')
df_livertox = dmap.s3.read('LiverTox.csv')
df_cmpd_list = dmap.s3.read('compound_list_for_industry_benchmark.csv')
Package: s3://dilimap/public/data. Top hash: e13a57ac61
Package: s3://dilimap/public/data. Top hash: e13a57ac61
Package: s3://dilimap/public/data. Top hash: e13a57ac61
Package: s3://dilimap/public/data. Top hash: e13a57ac61
[10]:
df_DILIrank = df_DILIrank.reset_index()

# Index every table by its compound name: capitalized, trailing spaces stripped
for frame, name_col in (
    (df_DILIrank, 'Compound Name'),
    (df_DILIst, 'CompoundName'),
    (df_livertox, 'Ingredient'),
):
    frame.index = frame[name_col].str.capitalize().str.rstrip(' ')

# De-duplicate on the raw (un-normalized) name column, keeping the first occurrence
df_DILIrank = df_DILIrank.drop_duplicates(subset='Compound Name')
df_DILIst = df_DILIst.drop_duplicates(subset='CompoundName')
df_livertox = df_livertox.drop_duplicates(subset='Ingredient')
[11]:
# Start `source` as an object-dtype column of NaN. A float64 NaN column cannot hold the
# string labels assigned below (pandas emits a FutureWarning and will raise in future).
df_DILIst['source'] = np.full(len(df_DILIst), np.nan, dtype=object)
source_pos = df_DILIst.columns.get_loc('source')

# Rows are tagged by position; each later slice overwrites the tail of the previous one.
# NOTE(review): the offsets presumably mirror the block layout of the DILIst spreadsheet
# after de-duplication - re-verify them whenever DILIst.csv is refreshed.
df_DILIst.iloc[909:, source_pos] = 'LiverTox'
df_DILIst.iloc[952:, source_pos] = 'Suzuki'
df_DILIst.iloc[1014:, source_pos] = 'Greene'
df_DILIst.iloc[1107:, source_pos] = 'Zhu'

# Blank the DILIst classification for the first 909 rows (those left without a source tag)
df_DILIst.iloc[:909, df_DILIst.columns.get_loc('DILIst Classification ')] = np.nan
/var/folders/lz/prv79nmj5msg8h6nzqn0w7cw0000gn/T/ipykernel_19071/328512810.py:2: FutureWarning: Setting an item of incompatible dtype is deprecated and will raise an error in a future version of pandas. Value 'LiverTox' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  df_DILIst.iloc[909:, df_DILIst.columns.get_loc('source')] = 'LiverTox'
[12]:
# Capitalize all-lower-case index labels in every table so names align across sources
for frame in (df_DILIrank, df_DILIst, df_livertox, df_cmpd_list):
    frame.index = capitalize_if_lower(frame.index)
[13]:
# Outer-join DILIrank and DILIst on their compound-name indices
df_DILI = df_DILIrank.merge(
    df_DILIst, how='outer', left_index=True, right_index=True
)
[14]:
# Copy the source columns into snake_case names used by the rest of the notebook
df_DILI['severity_class'] = df_DILI['Severity Class']
df_DILI['label_section'] = df_DILI['Label Section']
# Keep the text after the last 'v' of vDILIConcern (e.g. 'vMost-DILI-Concern' -> 'Most-DILI-Concern')
df_DILI['DILIrank'] = df_DILI['vDILIConcern'].str.split('v').str[-1]
# Map the binary DILIst classification to text labels. Values other than 1/0 (including
# missing ones) stay real NaN; the former nested np.where mixed strings with np.nan,
# which NumPy coerces to the literal string 'nan'.
df_DILI['DILIst'] = df_DILI['DILIst Classification '].map({1: 'DILI', 0: 'No-DILI'})
df_DILI['roa'] = df_DILI['Routs of Administration ']
# Rows without a source tag are attributed to DILIrank
df_DILI['source'] = df_DILI['source'].fillna('DILIrank')

# Compounds added by hand on top of the benchmark list and the LiverTox table
extra_cmpds = [
    'Asunaprevir',
    'FIRU',
    'Lomitapide',
    'TAK-875',
    'AKN-028',
    'Evobrutinib',
    'BMS-986142',
    'Ibrutinib',
    'Orelabrutinib',
    'Remibrutinib',
    'Rilzabrutinib',
    'Ruxolitinib',
    'Tofacitinib',
    'Upadacitinib',
]

# Combine all required compound indices
required_cmpds = df_cmpd_list.index.union(df_livertox.index).union(extra_cmpds)

# Add any missing compounds to df_DILI with NaN rows
all_cmpds = df_DILI.index.union(required_cmpds)
df_DILI = df_DILI.reindex(all_cmpds)

df_DILI['compound_name'] = df_DILI.index

# Columns carried forward, in output order
output_cols = [
    'LTKBID',
    'compound_name',
    'DILIrank',
    'label_section',
    'severity_class',
    'DILIst',
    'roa',
    'source',
]
df_DILI = df_DILI[output_cols].sort_values(['source', 'compound_name'])
[15]:
# Pull the LiverTox annotations onto df_DILI by compound name (target column -> source column)
livertox_cols = {
    'livertox_score': 'Likelihood Score',
    'livertox_primary_classification': 'Primary Classification',
    'livertox_secondary_classification': 'Secondary Classification',
}
for target_col, source_col in livertox_cols.items():
    df_DILI[target_col] = df_DILI.index.map(df_livertox[source_col])
[16]:
# Keep only the first row among index labels that collide case-insensitively
df_DILI = drop_duplicated_index(df_DILI)

3. Drug information (CHEMBL)

[17]:
# Look up ChEMBL metadata for every compound name in df_DILI via the dilimap client
# (the cells below rely on 'molecule_name', 'smiles' and 'molecule_chembl_id' being returned)
df_chembl = dmap.clients.chembl(df_DILI.index)
[18]:
# Coverage check: compounds in df_DILI, rows returned by ChEMBL, rows with a non-null SMILES
len(df_DILI), len(df_chembl['smiles']), len(df_chembl['smiles'].dropna())
[18]:
(2475, 2145, 1806)
[19]:
# Re-key the ChEMBL table by molecule name (chained, no in-place mutation)
df_chembl = df_chembl.reset_index().set_index('molecule_name')

# Lower-case the names and keep the first row per case-insensitive name
df_chembl.index = lower(df_chembl.index)
df_chembl = drop_duplicated_index(df_chembl)
[20]:
# Copy every ChEMBL column onto df_DILI, matching on the lower-cased compound name
lowercase_names = lower(df_DILI.index)
for chembl_col in df_chembl.columns:
    df_DILI[chembl_col] = lowercase_names.map(df_chembl[chembl_col])
[21]:
# Fetch drug-warning records for the ChEMBL IDs mapped above
# (the following cells treat the result as a table indexed by ChEMBL ID)
drug_warnings = dmap.clients.drug_warnings(chembl_id=df_DILI['molecule_chembl_id'])
[22]:
## Since the client slightly lags behind on one drug

# Insert the warning record for CHEMBL623 by hand when the client did not return it
if 'CHEMBL623' not in drug_warnings.index:
    drug_warnings.loc['CHEMBL623'] = {
        'efo_id': 'EFO:0004228',
        'efo_id_for_warning_class': 'EFO:0011052',
        'efo_term': 'drug-induced liver injury',
        'parent_molecule_chembl_id': 'CHEMBL623',
        'warning_class': 'hepatotoxicity',
        'warning_country': 'Canada: 2003',
        'warning_description': 'Adverse hepatic events',
        'warning_type': 'Withdrawn',
    }
[23]:
# Lower-case the warning index so it can be matched against lower-cased ChEMBL IDs
drug_warnings.index = lower(drug_warnings.index)

# Copy each warning field onto df_DILI; fields absent from the client response are skipped
for key in [
    'warning_class',
    'warning_country',
    'warning_description',
    'warning_type',
    'warning_year',
]:
    if key in drug_warnings.columns:
        df_DILI[key] = lower(df_DILI['molecule_chembl_id']).map(drug_warnings[key])
[24]:
# For drugs whose warning type is 'Withdrawn', record the warning class as the reason;
# every other row gets NaN
df_DILI['withdrawn_reason'] = np.where(
    df_DILI['warning_type'] == 'Withdrawn', df_DILI['warning_class'], np.nan
)

4. Drug target and solubility information (Selleckchem)

[25]:
# Load the Selleckchem compound-library table through dmap.s3.read
df_selleckchem = dmap.s3.read('Selleckchem.csv')
Package: s3://dilimap/public/data. Top hash: e13a57ac61
[26]:
# Index by lower-cased product name
df_selleckchem.index = lower(df_selleckchem['Name'])
# Drop everything from the first ' (' onwards (parenthetical suffix), then trailing spaces
df_selleckchem.index = df_selleckchem.index.str.split(r' \(').str[0].str.rstrip(' ')
# Keep the first row per case-insensitive name
df_selleckchem = drop_duplicated_index(df_selleckchem)
[27]:
# Selleckchem fields to copy onto df_DILI (output column -> Selleckchem column),
# matched on the lower-cased compound name
selleckchem_cols = {
    'target': 'Target',
    'pathway': 'Pathway',
    'information': 'Information',
    'CAS_number': 'CAS Number',
    'DMSO_mM_solubility': 'DMSO (mM)Max Solubility',
}
name_key = lower(df_DILI.index)
for out_col, src_col in selleckchem_cols.items():
    df_DILI[out_col] = name_key.map(df_selleckchem[src_col])

5. Consensus DILI label

[28]:
# Replace NaN with '' in the columns the consensus rules test with string methods
# (.str.startswith / .eq('')), so missing values compare as "empty" instead of NaN
for k in ['DILIrank', 'livertox_score', 'label_section', 'withdrawn_reason']:
    if k in df_DILI.columns:
        df_DILI[k] = df_DILI[k].replace(np.nan, '')
[29]:
# --- Define consensus DILI labels from DILIrank and LiverTox ---

# Define terms
# Each is_* below is a boolean Series aligned with df_DILI
is_Most = df_DILI['DILIrank'].str.startswith('Most')
is_No = df_DILI['DILIrank'].str.startswith('No')
# Withdrawn either per the DILIrank label section or per a drug warning whose
# withdrawal reason mentions 'liver' or 'hepa'
is_Withdrawn = df_DILI['label_section'].eq('Withdrawn')
is_Withdrawn |= (
    df_DILI['withdrawn_reason'].str.lower().str.contains('liver|hepa', na=False)
)
# LiverTox scores are matched by prefix, so variants such as 'A [HD]' or 'E*' count as A / E
is_A = df_DILI['livertox_score'].str.startswith('A')
is_AB = df_DILI['livertox_score'].str.startswith(('A', 'B'))
is_E = df_DILI['livertox_score'].str.startswith('E')

# Withdrawn due to hepatotoxicity
Withdrawn = is_Most & is_Withdrawn

# Known DILI: High-confidence score A
Most_and_A = (
    is_Most | df_DILI['DILIrank'].eq('')
) & is_A  # also consider if not explicitly listed in DILIrank

# Likely DILI: Most-DILIrank or A/B score
Most_or_AB = is_Most | is_AB

# Few case reports or ambiguous evidence
C_or_D = df_DILI['livertox_score'].str.startswith(('C', 'D'))

# No DILI
# Any one of four routes qualifies:
#   1. DILIrank No/Ambiguous AND (LiverTox E OR severity class <= 1)
#   2. DILIrank No AND no LiverTox score
#   3. not in DILIrank AND LiverTox E
#   4. in neither DILIrank nor LiverTox AND DILIst says 'No-DILI'
No_and_E = (
    (
        df_DILI['DILIrank'].str.startswith(('No', 'Ambiguous'))
        & (is_E | (df_DILI['severity_class'] <= 1))
    )
    | (is_No & df_DILI['livertox_score'].eq(''))
    | (df_DILI['DILIrank'].eq('') & is_E)
    | (
        df_DILI['DILIrank'].eq('')
        & df_DILI['livertox_score'].eq('')
        & df_DILI['DILIst'].eq('No-DILI')
    )
)


# --- Manual additions ---
# Hand-curated overrides: the listed compounds are forced into the given category

Withdrawn |= df_DILI['compound_name'].isin(
    ['AKN-028', 'TAK-875', 'Evobrutinib', 'BMS-986142', 'Orelabrutinib']
)

Most_or_AB |= df_DILI['compound_name'].isin(
    [
        'Phenylbutazone',
        'Flucloxacillin',
        'Salsalate',
        'TAK-875',
        'AKN-028',
        'Evobrutinib',
        'BMS-986142',
    ]
)

C_or_D |= df_DILI['compound_name'].isin(
    [
        'Mesalazine',
        'Aceclofenac',
        'Glibenclamide',
        'Mycophenolate mofetil',
        'Valdecoxib',
        'Tenoxicam',
        'Ibrutinib',
        'Vismodegib',
    ]
)

No_and_E |= df_DILI['compound_name'].isin(['FIRU', 'Upadacitinib'])

# --- Assign multi-level DILI labels ---

# np.select takes the first condition that is True for each row, so the order of the
# list is the precedence: withdrawn > known > likely > No DILI > few cases > unlikely.
# Rows matching none of the conditions get ''.
df_DILI['DILI_label'] = np.select(
    [Withdrawn, Most_and_A, Most_or_AB, No_and_E, C_or_D, is_E],
    [
        'DILI (withdrawn)',
        'DILI (known)',
        'DILI (likely)',
        'No DILI',
        'DILI (few cases)',
        'No DILI (unlikely)',
    ],
    default='',
)

# Binary DILI vs No-DILI label
# Only the three strongest DILI tiers and the 'No DILI' tier are labelled; all other rows get ''
df_DILI['DILI_label_binary'] = np.where(
    Withdrawn | Most_and_A | Most_or_AB, 'DILI', np.where(No_and_E, 'No DILI', '')
)

# Additional section label from what's in parentheses (for grouping if needed)
# e.g. 'DILI (few cases)' -> 'Few cases'; labels without parentheses are kept as-is
df_DILI['DILI_label_section'] = (
    df_DILI['DILI_label'].str.extract(r'\(([^)]+)\)', expand=False).str.capitalize()
)
df_DILI['DILI_label_section'] = df_DILI['DILI_label_section'].fillna(
    df_DILI['DILI_label']
)
[30]:
# Number of compounds per consensus label ('' = no label assigned)
df_DILI['DILI_label'].value_counts()
[30]:
DILI_label
No DILI               1009
                       697
DILI (few cases)       328
DILI (likely)          234
DILI (known)           113
DILI (withdrawn)        62
No DILI (unlikely)      32
Name: count, dtype: int64

6. Mechanisms of DILI

Optional — this section is not used for model training; it is included only to support faster interpretation of the results.

[31]:
# Download livertox book (skipped when the extracted folder already exists)
livertox_dir = f'{filedir}livertox_NBK547852/'
if not os.path.exists(livertox_dir):
    # Context managers make sure the HTTP connection and the tar stream are closed
    # even if the download or extraction fails half-way.
    with requests.get(
        'https://ftp.ncbi.nlm.nih.gov/pub/litarch/29/31/livertox_NBK547852.tar.gz',
        stream=True,
    ) as response:
        # Fail loudly on an HTTP error instead of feeding an error page to tarfile
        response.raise_for_status()
        # NOTE(review): extractall() trusts the member paths inside the archive; consider
        # passing filter='data' on Python versions that support extraction filters.
        with tarfile.open(fileobj=response.raw, mode='r|gz') as archive:
            archive.extractall(path=filedir)

    # remove all non-pdf files
    # NOTE(review): as written, a non-PDF file is kept when its name minus the last four
    # characters is a compound in df_DILI.index - confirm that this exception is intended.
    for root, _dirs, filenames in os.walk(livertox_dir):
        for filename in filenames:
            if (
                not filename.lower().endswith('.pdf')
                and filename[:-4] not in df_DILI.index
            ):
                os.remove(os.path.join(root, filename))
[ ]:
import os
import pandas as pd
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

# Section headings that introduce the mechanism-of-injury text in the extracted PDF text.
# extract_info uses the first entry (in this order) that occurs in a document.
kw_list = [
    'Mechanism of Injury\n',
    'Mechanism of Liver Injury\n',
    'Mechanism of injury\n',
    'Mechanism of Hepatotoxicity\n',
    'Mechanism of Injur y\n',  # typo in Zonisamide
]


def extract_info(cmpd):
    """Extract mechanism / overview text for one compound from its LiverTox PDF.

    Parameters
    ----------
    cmpd : str
        Compound name; the PDF is looked up at f'{filedir}livertox_NBK547852/{cmpd}.pdf'.

    Returns
    -------
    tuple
        (cmpd, mechanism, overview, updated, label).
        On success `label` is NaN and the text fields are filled; `overview` and
        `updated` fall back to NaN individually when their marker is absent.
        When the PDF is missing, no mechanism heading is found, or parsing raises,
        the three text fields are NaN and `label` is the compound's DILI_label.
    """

    def _no_mechanism():
        # Shared fallback result; the label lookup is only performed when needed
        return cmpd, np.nan, np.nan, np.nan, df_DILI.loc[cmpd, 'DILI_label']

    filepath = f'{filedir}livertox_NBK547852/{cmpd}.pdf'
    if not os.path.exists(filepath):
        return _no_mechanism()

    try:
        reader = PyPDF2.PdfReader(filepath)
        # Only the first 10 pages are read; pages without extractable text contribute ''
        text = ''.join([page.extract_text() or '' for page in reader.pages[:10]])

        # Identify the mechanism section keyword
        kw = next((k for k in kw_list if k in text), None)
        if kw is None:
            return _no_mechanism()

        # Extract mechanism: text after the last heading occurrence, up to the next section
        mech = text.split(kw)[-1].split('Outcome and Management')[0].replace('\n', '')

        # Extract overview text (between 'OVERVIEW' and the mechanism heading)
        info = np.nan
        if 'OVERVIEW\n' in text:
            info = text.split('OVERVIEW\n')[1].split(kw)[0]
            info = (
                info.replace('Introduction\n', 'INTRODUCTION ')
                .replace('Background\n', ' BACKGROUND ')
                .replace('Hepatotoxicity\n', ' HEPATOTOXICITY ')
                .replace('\n', '')
            )

        # Extract update timestamp. A missing '[Updated ' marker must not discard the
        # mechanism text found above (it used to raise IndexError into the except below).
        updated = np.nan
        if '[Updated ' in text:
            updated = text.split('[Updated ')[1].split(']')[0].replace('\n', '')

        return cmpd, mech, info, updated, np.nan

    except Exception as e:
        # Best effort: report the failure and carry on with the remaining compounds
        print(f'Error processing {cmpd}: {e}')
        return _no_mechanism()


# Run in parallel
# Results arrive in completion order; each tuple carries its compound name, so order is irrelevant
results = []
with ThreadPoolExecutor(max_workers=8) as executor:
    futures = [executor.submit(extract_info, cmpd) for cmpd in df_DILI.index]
    for future in as_completed(futures):
        results.append(future.result())

# Update df_DILI with results
# Only compounds with extracted mechanism text are written; `warn_label` is unpacked but unused.
# NOTE(review): the three livertox_* columns are created by the first assignment below, so if
# no compound yields a mechanism the cleanup step fails with a KeyError.
for cmpd, mech, info, updated, warn_label in results:
    if not pd.isna(mech):
        df_DILI.loc[cmpd, 'livertox_mechanism'] = mech
        df_DILI.loc[cmpd, 'livertox_information'] = info
        df_DILI.loc[cmpd, 'livertox_updated'] = updated

# Final cleanup
# Collapse double spaces (single pass: runs of three or more spaces are only shortened)
df_DILI['livertox_mechanism'] = df_DILI['livertox_mechanism'].str.replace(
    '  ', ' ', regex=False
)
df_DILI['livertox_information'] = df_DILI['livertox_information'].str.replace(
    '  ', ' ', regex=False
)
[41]:
# --- Stage 1: classify from the mechanism text ---
# Candidate rows: mechanism text mentions 'idiosyncratic' (but not 'not idiosyncratic')
# and the LiverTox score starts with C or D.
idx_moa = df_DILI['livertox_mechanism'].astype(str).str.contains('idiosyncratic')
idx_moa &= ~df_DILI['livertox_mechanism'].astype(str).str.contains('not idiosyncratic')
idx_moa &= df_DILI['livertox_score'].str.startswith('C') | df_DILI[
    'livertox_score'
].str.startswith('D')

# For every qualifier keyword, flag rows where it occurs in the 30 characters that
# precede the first 'idiosyncratic' in the mechanism text.
idx = {}
for kw in [
    'clearly',
    'likely',
    'probably',
    'suggest',
    'may be',
    'whether this injury is',
    'has features of',
    'various',
    'cases',
    'instances of',
]:
    idx[kw] = (
        df_DILI['livertox_mechanism']
        .str.split('idiosyncratic')
        .str[0]
        .str[-30:]
        .str.lower()
        .str.contains(kw)
    )

# Nested np.where acts as a first-match-wins cascade, strongest qualifier first;
# candidates without any qualifier default to 'idiosyncratic'.
df_DILI['livertox_iDILI'] = np.where(
    idx_moa & idx['clearly'],
    'idiosyncratic',
    np.where(
        idx_moa & (idx['likely'] | idx['probably']),
        'likely idiosyncratic',
        np.where(
            idx_moa
            & (
                idx['may be']
                | idx['suggest']
                | idx['whether this injury is']
                | idx['has features of']
            ),
            'possibly idiosyncratic',
            np.where(
                idx_moa & (idx['various'] | idx['cases'] | idx['instances of']),
                'idiosyncratic cases',
                np.where(idx_moa, 'idiosyncratic', ''),
            ),
        ),
    ),
)

# --- Stage 2: fall back to the overview text for rows stage 1 did not classify ---
idx_dili = df_DILI['livertox_information'].astype(str).str.contains('idiosyncratic')
idx_dili &= ~df_DILI['livertox_information'].astype(str).str.contains(
    'not idiosyncratic'
)
idx_dili &= df_DILI['livertox_score'].str.startswith('C') | df_DILI[
    'livertox_score'
].str.startswith('D')

# Same keyword scan on the overview text, with a 45-character window and a longer list
idx = {}
for kw in [
    'clearly',
    'most common causes',
    'well known cause of',
    'well established cause',
    'likely',
    'probably',
    'suggest',
    'may be',
    'whether this injury is',
    'has features of',
    'various',
    'cases',
    'instances of',
    'the few',
]:
    idx[kw] = (
        df_DILI['livertox_information']
        .str.split('idiosyncratic')
        .str[0]
        .str[-45:]
        .str.lower()
        .str.contains(kw)
    )

# Stage-1 results take precedence (idx_moa rows keep their label). The 'likely' tier now
# emits 'likely idiosyncratic', the same vocabulary as stage 1 (it was 'likely iDILI').
# Overview-only candidates without any qualifier default to 'idiosyncratic cases'.
df_DILI['livertox_iDILI'] = np.where(
    idx_moa,
    df_DILI['livertox_iDILI'],
    np.where(
        idx_dili
        & (
            idx['clearly']
            | idx['most common causes']
            | idx['well known cause of']
            | idx['well established cause']
        ),
        'idiosyncratic',
        np.where(
            idx_dili & (idx['likely'] | idx['probably']),
            'likely idiosyncratic',
            np.where(
                idx_dili
                & (
                    idx['may be']
                    | idx['suggest']
                    | idx['whether this injury is']
                    | idx['has features of']
                ),
                'possibly idiosyncratic',
                np.where(
                    idx_dili
                    & (
                        idx['various']
                        | idx['cases']
                        | idx['instances of']
                        | idx['the few']
                    ),
                    'idiosyncratic cases',
                    np.where(idx_dili, 'idiosyncratic cases', ''),
                ),
            ),
        ),
    ),
)
[42]:
# Work on a copy so the per-mechanism flag columns do not end up in df_DILI
df_tmp = df_DILI.copy()

# Mechanism categories, in the order they appear in the summary string
moas = [
    'Hepatitis',
    'Necrosis',
    'Mitochondrial dysfunction',
    'CYP',
    'Oxidative stress',
    'Cholestasis/Biliary',
    'Sinusoidal',
    'Steatosis',
    'Fibrosis',
    'Cirrhosis',
    'Lactic acidosis',
    'Encephalopathy',
    'Immune-mediated',
    'Hypersensitivity',
    'Hypersensitivity Syndrome',
]

# Additional search terms per category, matched on top of the lower-cased category name
moa_synonyms = {
    'Hepatitis': ['inflammation'],
    'Necrosis': ['cell death'],
    'Mitochondrial dysfunction': ['mitochond'],
    'CYP': ['p450', 'reactive metabolites'],
    'Oxidative stress': ['oxidative', 'reactive oxygen species'],
    'Cholestasis/Biliary': [
        'cholesta',
        'impaired bile flow',
        'biliary',
        'bile duct',
    ],
    'Steatosis': ['fatty liver', 'steato'],
    'Fibrosis': ['fibrotic'],
    'Lactic acidosis': ['metabolic acidosis'],
    'Hypersensitivity': ['allergic'],
    'Immune-mediated': [
        'immunological',
        'immunoallergic',
        'immunoallergenic',
        'immunogenic',
        'immune response',
    ],
}

# Only compounds carrying one of the DILI-positive section labels can be flagged
is_DILI = df_tmp['DILI_label_section'].isin(
    ['Withdrawn', 'Known', 'Likely', 'Few cases']
)
# Overview + mechanism text, lower-cased (missing text becomes the string 'nan')
livertox_text = (
    df_tmp['livertox_information'].astype(str)
    + df_tmp['livertox_mechanism'].astype(str)
).str.lower()

for moa in moas:
    # A compound is flagged when any search term occurs in its text and it is DILI-positive
    hit = livertox_text.str.contains(moa.lower())
    for term in moa_synonyms.get(moa, []):
        hit |= livertox_text.str.contains(term)
    hit &= is_DILI

    if moa == 'Hypersensitivity':
        # 'Hypersensitivity Syndrome' is its own category; do not double-count it here
        hit &= ~livertox_text.str.contains('hypersensitivity syndrome')

    df_tmp[f'livertox_{moa}'] = hit
    print(moa, df_tmp[f'livertox_{moa}'].sum())

# Comma-separated list of flagged categories per compound, built in one positional pass
# over the flag matrix instead of one label-based .loc lookup per cell
flag_cols = [f'livertox_{moa}' for moa in moas]
df_tmp['livertox_mechanism_summary'] = [
    ', '.join(moa for moa, flagged in zip(moas, row) if flagged)
    for row in df_tmp[flag_cols].to_numpy()
]
Hepatitis 268
Necrosis 94
Mitochondrial dysfunction 16
CYP 209
Oxidative stress 8
Cholestasis/Biliary 262
Sinusoidal 31
Steatosis 57
Fibrosis 38
Cirrhosis 54
Lactic acidosis 24
Encephalopathy 27
Immune-mediated 279
Hypersensitivity 341
Hypersensitivity Syndrome 13
[43]:
# Append the idiosyncratic-DILI label to the keyword-based mechanism summary.
# The ', ' separator is inserted only when BOTH parts are non-empty; previously it was
# added whenever the iDILI label was set, producing values such as ', idiosyncratic'.
moa_summary = df_tmp['livertox_mechanism_summary']
idili_label = df_DILI['livertox_iDILI']
separator = np.where((moa_summary != '') & (idili_label != ''), ', ', '')
df_DILI['livertox_mechanism_summary'] = moa_summary + separator + idili_label
[44]:
# Hand-curated mechanism summaries; each entry overwrites the automatically derived
# summary for that compound and is marked with '(manual annotation)'.
manual_annotation = {
    'Bromfenac': 'Necrosis (manual annotation)',
    'Cefuroxime': 'Cholestasis/Biliary (manual annotation)',
    'Citalopram': 'Cholestasis/Biliary, CYP (manual annotation)',
    'Cloxacillin': 'Hepatitis, Cholestasis/Biliary, Hypersensitivity (manual annotation)',
    'Cyclofenil': 'Hepatitis, Cholestasis/Biliary (manual annotation)',
    'Etoposide': 'Hepatitis, Sinusoidal (manual annotation)',
    'Flucloxacillin': 'Hypersensitivity, idiosyncratic (manual annotation)',
    'Glafenine': 'Cirrhosis (manual annotation)',
    'Glimepiride': 'Cholestasis/Biliary (manual annotation)',
    'Glipizide': 'Hepatitis (manual annotation)',
    'Ibufenac': 'Necrosis (manual annotation)',
    'Mefenamic acid': 'Hypersensitivity, idiosyncratic (manual annotation)',
    'Mesalazine': 'Cholestasis/Biliary (manual annotation)',
    'Methazolamide': 'Cirrhosis, Encephalopathy (manual annotation)',
    'Milnacipran': 'Immune-mediated (manual annotation)',
    'Moxisylyte': 'Immune-mediated, Hypersensitivity (manual annotation)',
    'Mycophenolate mofetil': 'Immune-mediated, idiosyncratic (manual annotation)',
    'Phenylbutazone': 'Immune-mediated, idiosyncratic (manual annotation)',
    'Pyrimethamine': 'Cholestasis/Biliary, hypersensitivity (manual annotation)',
    'Sitaxsentan': 'idiosyncratic (manual annotation)',
    'Tenoxicam': 'Cholestasis/Biliary, Immune-mediated (manual annotation)',
    'Ticrynafen': 'Hepatitis, Cirrhosis, Hypersensitivity (manual annotation)',
    'Trimethoprim': 'Cholestasis/Biliary, idiosyncratic (manual annotation)',
    'Trovafloxacin': 'idiosyncratic (manual annotation)',
    'Valproic acid': 'Oxidative stress, Mitochondrial dysfunction, Steatosis (manual annotation)',
    'Ximelagatran': 'Immune-mediated, Hypersensitivity, idiosyncratic (manual annotation)',
}

# NOTE(review): .loc assignment with a label that is not in the index appends a new,
# otherwise-empty row - confirm every key above matches an existing compound name exactly.
for cmpd, moa in manual_annotation.items():
    df_DILI.loc[cmpd, 'livertox_mechanism_summary'] = moa

7. Summary stats

[67]:
# Remove spaces inside the LiverTox score so variants such as 'A [HD]' and 'A[HD]' share one spelling
df_DILI['livertox_score'] = df_DILI['livertox_score'].str.replace(' ', '', regex=False)
[69]:
# Compound counts per DILIrank category (rows) and LiverTox score (columns),
# ordered by the number of score-'A' compounds
df_ct = pd.crosstab(df_DILI['DILIrank'], df_DILI['livertox_score'])
df_ct.sort_values('A', ascending=False)
[69]:
livertox_score A A[HD] B B[HD] C C[HD] D D[HD] E E* X
DILIrank
661 60 10 41 2 52 5 81 1 374 147 2
Most-DILI-Concern 69 46 2 26 0 33 0 12 0 1 3 0
Less-DILI-Concern 23 21 6 55 2 69 3 67 0 20 12 0
Ambiguous DILI-concern 68 6 2 2 0 5 1 38 1 75 56 0
No-DILI-Concern 148 0 2 1 0 4 0 6 0 130 21 0
[46]:
# Same grid, but each cell holds the set of consensus label sections assigned to the
# compounds in that DILIrank x LiverTox-score combination (NaN = no compounds)
df_ct = pd.crosstab(
    df_DILI['DILIrank'],
    df_DILI['livertox_score'],
    df_DILI['DILI_label_section'],
    aggfunc=set,
)
df_ct.sort_values('A', ascending=False)
[46]:
livertox_score A A A [HD] B B[HD] C C[HD] D D[HD] E E* X
DILIrank
{, No DILI, Likely, Few cases, Withdrawn} {Known} {Known} {Known} {Likely} {Likely} {Few cases} {Few cases} {No DILI, Few cases} {Few cases} {No DILI} {No DILI} {}
Ambiguous DILI-concern {, No DILI} {Likely} {Likely} {Likely} {Likely} NaN {Few cases} {Few cases} {Few cases} {Few cases} {No DILI} {No DILI} NaN
Less-DILI-Concern {, Few cases} {Likely} NaN {Likely} {Likely} {Likely} {Few cases} {Few cases} {Few cases} NaN {Unlikely} {Unlikely} NaN
Most-DILI-Concern {Withdrawn, Likely} {Withdrawn, Known} NaN {Known} {Withdrawn, Likely} NaN {Withdrawn, Likely} NaN {Likely} NaN {Likely} {Likely} NaN
No-DILI-Concern {No DILI} NaN NaN {Likely} {Likely} NaN {No DILI} NaN {No DILI} NaN {No DILI} {No DILI} NaN

8. Cleanup

[48]:
# Show rows whose name duplicates an earlier row when compared case-insensitively
df_DILI[df_DILI.index.str.lower().duplicated()]
[48]:
LTKBID compound_name DILIrank label_section severity_class DILIst roa source livertox_score livertox_primary_classification ... CAS_number DMSO_mM_solubility DILI_label DILI_label_binary DILI_label_section livertox_mechanism livertox_information livertox_updated livertox_iDILI livertox_mechanism_summary
Estrogens, Conjugated NaN Estrogens, Conjugated NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN
esomeprazole Sodium NaN esomeprazole Sodium NaN NaN NaN NaN NaN ... 161796-78-7 198.69 NaN NaN NaN
gadolinium ethoxybenzyl DTPA NaN gadolinium ethoxybenzyl DTPA NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN

3 rows × 62 columns

[55]:
# Re-apply the name normalisation, then drop the case-insensitive duplicates (first row wins)
df_DILI.index = capitalize_if_lower(df_DILI.index)
df_DILI = drop_duplicated_index(df_DILI)
[56]:
# Number of compounds left after de-duplication
len(df_DILI)
[56]:
2472
[57]:
# Per-column memory footprint (bytes, including string contents), smallest first
df_DILI.memory_usage(deep=True).sort_values()
[57]:
polymer_flag              19776
usan_year                 19776
prodrug                   19776
orphan                    19776
inorganic_flag            19776
                         ...
Index                    169581
smiles                   233640
information              325230
livertox_mechanism      2800630
livertox_information    4364485
Length: 63, dtype: int64

9. Push file to S3

[58]:
# Export copy without the two large free-text LiverTox columns (skipped if already absent)
df_DILI_clean = df_DILI.drop(
    columns=['livertox_mechanism', 'livertox_information'], errors='ignore'
)
[59]:
# The upload is left commented out, so running this cell does not write to S3;
# uncomment the call to publish the cleaned table. (The output shown below was
# recorded from a run in which the call was enabled.)
# dmap.s3.write(df_DILI_clean, 'compound_DILI_labels.csv')
Package: s3://dilimap/public/data. Top hash: ff7d660e50
Copying objects: 100%|██████████████████████| 1.22M/1.22M [00:01<00:00, 709kB/s]
Package public/data@d3ef751 pushed to s3://dilimap
Run `quilt3 catalog s3://dilimap/` to browse.
Successfully pushed the new package