#loading libraries
import numpy as np
import scipy as sp
import pandas as pd
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
import bokeh.plotting
import bokeh.layouts
# For interfacing with the file system
import glob
import subprocess
import os
import time
import importlib
import bilge_pype as bpy
# pipes bokeh output to the notebook
bokeh.io.output_notebook()
# enables some logging output
bpy.init_log(level='INFO')
This notebook is intended to give an overview of the clustering module in ashure.py
. The following will examine both the naive and augmented implementation of OPTICS on clustering nanopore sequence data and show how applying OPTICS to sequence data can help us infer the true haplotype sequences present in our sequencing experiment.
The pseudo reference database generated by ashure.py
will be used as the input dataset for the clustering module. To generate the pseudo reference database, run ashure.py
on this subset of basecalled reads we have provided.
cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell
CompletedProcess(args='python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r', returncode=0)
Next, run the clst
module of ashure.py
on the pseudo reference database generated. This will compute the cluster centers and write the output to pseudo_clst.csv.gz
. We will visualize these cluster centers relative to the whole dataset in the next section.
cmd = 'python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r' # runs clustering for 10 iterations
subprocess.run(cmd, shell=True)
CompletedProcess(args='python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r', returncode=0)
t-SNE can help visualize the relationship between sequences by performing dimensionality reduction on the pairwise distances between sequences. In the code snippet below, pseudo reference sequences, cluster center sequences of the pseudo reference, and the mock50 reference sequences are loaded and concatenated. Pairwise alignments are computed via run_minimap2()
and saved to a csv file called pseudo_pw.csv.gz
def compute_pw(df):
# note that -D in config remove entries for diagonal alignments.
config = '-k15 -w10 -p 0.9 -D --dual=no'
df_pw = bpy.run_minimap2(df, df, config=config, cleanup=True)
# get the best alignment
return bpy.get_best(df_pw, ['query_id','database_id'], metric='AS', stat='idxmax')
# load data and concatenate dataframes
df_prf = pd.read_csv('pseudo_ref.csv.gz')[['id','sequence']]
df_cl = pd.read_csv('pseudo_clst.csv.gz')[['id','sequence']]
df_ref = bpy.read_fasta('ref_db.fa')[['id','sequence']]
# use only mock50
df_ref = df_ref[[('HAP' in i) for i in df_ref['id']]]
df = pd.concat([df_prf, df_cl, df_ref])
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw.csv.gz', index=False, compression='infer') # save the data
pid[90550] 2020-10-09 22:59:43.868 INFO: Making directory ./minimap2/
The code snippet below loads the alignment information generated by minimap2
, transforms it into a distance matrix, and runs t-SNE on the pairwise distance data. t-SNE output is written to a pseudo_tsne.csv.gz
for plotting.
def get_dist_matrix(df):
# extract the pairwise distance matrix and use match_score as the metric
m = bpy.get_feature_vector(df[['query_id','database_id','similarity']], symmetric=True)
# make this matrix symmetric
m = bpy.get_symmetric_matrix(m, sym_larger=False)
# invert similarity to distance
for i in range(0,len(m)):
m.iloc[:,i] = 1-m.iloc[:,i]
m.iat[i,i] = 0 # set diagonal values to zero
return m
def get_cluster(dist, metric='precomputed', alt=False, min_samples=None):
# runs tsne on distance matrix
tsne = bpy.run_TSNE(dist, metric=metric)
# run optics
optics = bpy.cluster_OPTICS(dist, metric=metric, alt_label=alt, min_samples=min_samples)
optics = optics.rename(columns={'cluster_id':'optics_id','ordering':'optics_order'})
# merge the data
tsne = tsne.merge(optics, on='id', how='left')
return tsne
# load pairwise data
df = pd.read_csv('pseudo_pw.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne, hdscan, and optics on the data
tsne = get_cluster(m, metric='precomputed', min_samples=11)
tsne.to_csv('pseudo_tsne.csv.gz', compression='infer', index=False)
pid[90550] 2020-10-09 22:59:55.891 INFO: running sklearn tsne with n_comp = 2 pid[90550] 2020-10-09 23:01:20.680 INFO: Running OPTICS pid[90550] 2020-10-09 23:01:21.033 INFO: max_eps = 0.5 pid[90550] 2020-10-09 23:01:21.034 INFO: clust_OPTICS: iter=0 using min_samples=11 pid[90550] 2020-10-09 23:01:28.544 INFO: clust_OPTICS: clusters=25 outliers=2158 delta=5.5 pid[90550] 2020-10-09 23:01:28.546 INFO: n_clusters=25 n_unclustered=2158 N=5639
The following code snippet loads the t-SNE data, matches the sequences to their haplotype via minimap2
, and adds color maps to each cluster label.
def match_haplotypes(pfile='',rfile='',cfile='',tfile=''):
# load sequence data
prf = pd.read_csv(pfile)
prf['source'] = 'samples'
ref = bpy.read_fasta(rfile)
# use only mock50
ref = ref[[('HAP' in i) for i in ref['id']]]
ref['source'] = 'reference'
clst = pd.read_csv(cfile)
clst['source'] = 'clst'
cols = ['id','sequence','source']
df = pd.concat([prf[cols],ref[cols],clst[cols]])
# align sequences to haplotypes and cluster centers
A = bpy.run_minimap2(df, ref, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
A = bpy.get_best(A,['id'],metric='AS',stat='idxmax')
B = bpy.run_minimap2(df, clst, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
B = bpy.get_best(B,['id'],metric='AS',stat='idxmax')
A = A.rename(columns={'database_id':'ref_id','similarity':'ref_match'})
B = B.rename(columns={'database_id':'clst_id','similarity':'clst_match'})
# merge data
df = df.merge(A[['id','ref_id','ref_match']], on='id', how='left')
df = df.merge(B[['id','clst_id','clst_match']], on='id', how='left')
# merge into tsne
tsne = pd.read_csv(tfile)
tsne = tsne.merge(df, on='id', how='left')
# add colors to ref_id
col = 'ref_id'
rid = np.unique(tsne[col].dropna())
cmap = bokeh.palettes.Category20b_20
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'ref_id_color'])
tsne = tsne.merge(colors,on='ref_id',how='left')
# add colors to clst_id
col = 'clst_id'
rid = np.unique(tsne[col].dropna())
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'clst_id_color'])
tsne = tsne.merge(colors,on='clst_id',how='left')
return tsne
pfile = 'pseudo_ref.csv.gz'
rfile = 'ref_db.fa'
cfile = 'pseudo_clst.csv.gz'
tfile = 'pseudo_tsne.csv.gz'
tsne = match_haplotypes(pfile, rfile, cfile, tfile)
pid[90550] 2020-10-10 19:19:14.672 INFO: Making directory ./minimap2/ pid[90550] 2020-10-10 19:19:19.127 INFO: Making directory ./minimap2/
The following code snippet plots the t-SNE data with bokeh and adds interactive tooltips for each datapoint.
def make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=2, s2=5, s3=7, color='clst_id_color', R=[0,1]):
TOOLTIPS = [('id','@id'),
('ref_id', '@ref_id'),
('ref_match','@ref_match'),
('clst_id','@clst_id'),
('clst_match','@clst_match'),
('optics_id','@optics_id'),
('reachability','@reachability')]
p = bokeh.plotting.figure(output_backend='webgl', tooltips=TOOLTIPS)
d = tsne[tsne['source']=='samples']
if color in ['ref_match','clst_match','ref_error','cl_error','reachability']:
# establish colormap for numerical data
mapper = bokeh.transform.linear_cmap(field_name=color, palette=bokeh.palettes.Turbo256, low=R[0], high=R[1])
color_bar = bokeh.models.ColorBar(color_mapper=mapper['transform'], width=8, location=(0,0))
p.circle(x_axis, y_axis, color=mapper, size=s1, source=d)
p.add_layout(color_bar, 'right')
else:
# colorize by category
p.circle(x_axis, y_axis, color=color, size=s1, source=d)
# plot cluster centers
d = tsne[tsne['source']=='clst']
p.square(x_axis, y_axis, line_color='black', fill_color='red', size=s2, source=d)
d = tsne[tsne['source']=='reference']
p.diamond(x_axis, y_axis, line_color='black', fill_color='blue', size=s3, source=d)
return p
p = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=4, s2=5, s3=7, color='clst_id_color')
p.xaxis.axis_label = 'tSNE1'
p.yaxis.axis_label = 'tSNE2'
p.title.text = 'Colorized by identity of cluster center'
grid = bokeh.layouts.gridplot([[p]], plot_width=600, plot_height=500)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_1.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_1.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_1.html'
Each circle above represents a sequence from the pseudo reference database. Red squares are the cluster centers computed by our augmented implementation of OPTICS. Blue diamonds are the haplotype sequences.
Each sequence from the pseudo reference database is currently colorized by the cluster_id
, which is the label associated with each cluster center computed by our implementation of OPTICS.
On this t-SNE plot, data points lying close to each other have low pairwise distance, which mean their sequences match closely to each other. Data points far apart are more unrelated. The overlap between haplotype sequence and cluster centers on the t-SNE plot indicate that cluster centers computed by the clst
module match closely to the true haplotype sequence.
Despite these reads being highly errored, density based clustering approaches, such as OPTICS, are able to classify them into haplotype groups. Another benefit of OPTICS is computation distances from a theoretical cluster center, which is termed reachability. Lower reachability means the sequence is closer to the centroid of a cluster of sequence. Reachability is highly correlated with error in the sequence, and we are able to use it to infer the true haplotype sequence from corrupted reads.
Why this works is more apparent if we examine the ordering vs reachability plot generated by OPTICS.
p1 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='reachability')
p1.xaxis.axis_label = 'OPTICS ordering'
p1.yaxis.axis_label = 'reachability'
p1.title.text = 'A) Reachability vs ordering'
tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='ref_error')
p2.xaxis.axis_label = 'OPTICS ordering'
p2.yaxis.axis_label = 'read error'
p2.title.text = 'B) Error profile vs ordering'
# share x-axis between p1 and p2
p2.x_range = p1.x_range
grid = bokeh.layouts.gridplot([[p1],[p2]], plot_width=800, plot_height=200)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_2a.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_2a.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_2a.html'
The OPTICS algorithm attempts to identify centroid regions in a set of data based on density. After centroid regions are identified, data points are ordered relative to their distance from these initial centroids. The reachability metric is the distance from the center of a cluster.
The resulting reachability vs ordering plot reveal a series of dips, which correspond to spatially dense clusters of data. In the sequence data context, these low reachability regions correspond to sequences which match closely to each other and have low error. Moving away from the cluster center, the sequences become more disimilar and have higher error.
Our augmented implementation of OPTICS (the clst
module) tries to infer the identity of the haplotype sequence by performing multi-alignment on sequences in these low reachability regions and computing an error corrected consensus. This procedure is analogous to computing the centroid of a set of data in k-means clustering.
The above plots were computed with haplotype sequences and cluster center sequences included in the data. This could potentially bias clustering results. To show that OPTICS can identify low error reads unbiasly, the same analysis is repeated while excluding cluster center and haplotype sequences from the computation.
# load data
df = pd.read_csv('pseudo_ref.csv.gz')
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw_unbiased.csv.gz', index=False, compression='infer') # save the data
# load pairwise data
df = pd.read_csv('pseudo_pw_unbiased.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne and optics on the data
tsne = get_cluster(m, metric='precomputed', min_samples=None)
tsne.to_csv('pseudo_tsne_unbiased.csv.gz', compression='infer', index=False)
pid[90550] 2020-10-10 19:21:04.177 INFO: running sklearn tsne with n_comp = 2 pid[90550] 2020-10-10 19:22:08.381 INFO: Running OPTICS pid[90550] 2020-10-10 19:22:08.735 INFO: max_eps = 0.5 pid[90550] 2020-10-10 19:22:08.736 INFO: clust_OPTICS: iter=0 using min_samples=2763 pid[90550] 2020-10-10 19:22:10.033 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=1381.5 pid[90550] 2020-10-10 19:22:10.034 INFO: clust_OPTICS: iter=1 using min_samples=1382
/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers. warnings.warn("All reachability values are inf. Set a larger"
pid[90550] 2020-10-10 19:22:10.926 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=1381 pid[90550] 2020-10-10 19:22:10.928 INFO: clust_OPTICS: iter=2 using min_samples=692
/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers. warnings.warn("All reachability values are inf. Set a larger"
pid[90550] 2020-10-10 19:22:11.707 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=690 pid[90550] 2020-10-10 19:22:11.709 INFO: clust_OPTICS: iter=3 using min_samples=347
/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers. warnings.warn("All reachability values are inf. Set a larger"
pid[90550] 2020-10-10 19:22:12.398 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=345 pid[90550] 2020-10-10 19:22:12.400 INFO: clust_OPTICS: iter=4 using min_samples=175
/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers. warnings.warn("All reachability values are inf. Set a larger"
pid[90550] 2020-10-10 19:22:13.084 INFO: clust_OPTICS: clusters=2 outliers=5516 delta=172 pid[90550] 2020-10-10 19:22:13.109 INFO: clust_OPTICS: iter=5 using min_samples=89 pid[90550] 2020-10-10 19:22:14.142 INFO: clust_OPTICS: clusters=9 outliers=5019 delta=86 pid[90550] 2020-10-10 19:22:14.143 INFO: clust_OPTICS: iter=6 using min_samples=46 pid[90550] 2020-10-10 19:22:16.410 INFO: clust_OPTICS: clusters=13 outliers=3903 delta=43 pid[90550] 2020-10-10 19:22:16.417 INFO: clust_OPTICS: iter=7 using min_samples=25 pid[90550] 2020-10-10 19:22:21.523 INFO: clust_OPTICS: clusters=19 outliers=2668 delta=21 pid[90550] 2020-10-10 19:22:21.524 INFO: clust_OPTICS: iter=8 using min_samples=15 pid[90550] 2020-10-10 19:22:31.875 INFO: clust_OPTICS: clusters=23 outliers=1585 delta=10 pid[90550] 2020-10-10 19:22:31.877 INFO: clust_OPTICS: iter=9 using min_samples=10 pid[90550] 2020-10-10 19:22:51.210 INFO: clust_OPTICS: clusters=23 outliers=734 delta=5 pid[90550] 2020-10-10 19:22:51.211 INFO: clust_OPTICS: iter=10 using min_samples=8 pid[90550] 2020-10-10 19:23:22.104 INFO: clust_OPTICS: clusters=9 outliers=212 delta=2 pid[90550] 2020-10-10 19:23:22.105 INFO: clust_OPTICS: iter=11 using min_samples=11 pid[90550] 2020-10-10 19:23:38.472 INFO: clust_OPTICS: clusters=24 outliers=852 delta=1 pid[90550] 2020-10-10 19:23:38.473 INFO: clust_OPTICS: iter=12 using min_samples=12 pid[90550] 2020-10-10 19:23:53.060 INFO: clust_OPTICS: clusters=23 outliers=1002 delta=-1 pid[90550] 2020-10-10 19:23:53.061 INFO: clust_OPTICS: iter=13 using min_samples=10 pid[90550] 2020-10-10 19:24:12.309 INFO: clust_OPTICS: clusters=23 outliers=734 delta=-1 pid[90550] 2020-10-10 19:24:12.310 INFO: clust_OPTICS: iter=14 using min_samples=9 pid[90550] 2020-10-10 19:24:35.395 INFO: clust_OPTICS: clusters=20 outliers=530 delta=1 pid[90550] 2020-10-10 19:24:35.397 INFO: n_clusters=24 n_unclustered=852 N=5526
# compute error profile of each read
pfile = 'pseudo_ref.csv.gz'
rfile = 'ref_db.fa'
cfile = 'pseudo_clst.csv.gz'
tfile = 'pseudo_tsne_unbiased.csv.gz'
tsne = match_haplotypes(pfile, rfile, cfile, tfile)
pid[90550] 2020-10-10 19:24:35.656 INFO: Making directory ./minimap2/ pid[90550] 2020-10-10 19:24:40.146 INFO: Making directory ./minimap2/
p1 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=3, s2=5, s3=7, color='ref_id_color')
p1.xaxis.axis_label = 'tSNE1'
p1.yaxis.axis_label = 'tSNE2'
p1.title.text = 'A) haplotype identity'
tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1',s1=3,s2=5,s3=7,color='ref_error',R=[0,0.2])
p2.xaxis.axis_label = 'tSNE1'
p2.yaxis.axis_label = 'tSNE2'
p2.title.text = 'B) read error'
p3 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1',s1=3,s2=5,s3=7,color='reachability',R=[0,0.4])
p3.xaxis.axis_label = 'tSNE1'
p3.yaxis.axis_label = 'tSNE2'
p3.title.text = 'C) reachability'
# share x and y axis between p1 and p2
p2.x_range = p1.x_range
p2.y_range = p1.y_range
p3.x_range = p1.x_range
p3.y_range = p1.y_range
grid = bokeh.layouts.gridplot([[p1,p2,p3]], plot_width=400, plot_height=350)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_3.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_3.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_3.html'
The left tSNE plot has been colorized by the haplotype sequence. The middle plot is colorized by read error. The right plot is colorized by the reachability computed by OPTICS.
Haplotype from the same species lie close together in this spatial map. Cluster centroids tend to have low error and low reachability relative to peripheries of the clusters. This recapitulates what is already seen in the OPTICS ordering plot.
p1 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='reachability', color='ref_id_color')
p1.xaxis.axis_label = 'OPTICS ordering'
p1.yaxis.axis_label = 'reachability'
p1.title.text = 'A) Reachability vs ordering'
tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='ref_error', color='ref_id_color')
p2.xaxis.axis_label = 'OPTICS ordering'
p2.yaxis.axis_label = 'read error'
p2.title.text = 'B) Error profile vs ordering'
# share x-axis between p1 and p2
p2.x_range = p1.x_range
grid = bokeh.layouts.gridplot([[p1],[p2]], plot_width=800, plot_height=200)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_4a.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_4a.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_4a.html'
Here the reachability plot has been colorized by the haplotype sequence. Sequences with low reachability have low error. Sequences from the same haplotype fall in the same cluster as defined by OPTICS.
The naive implementation of OPTICS in sci-kit learn presented above was unsuitable for clustering large (>10000 sequences) datasets. Computing pairwise distances for 10000 sequences is CPU and memory intensive. To make OPTICS tractable for computation on a laptop, random sampling was used to estimate the global distribution of pairwise distances, and OPTICS was modified to work on subsets of the whole data. This approach avoided computation of large pairwise distance matrices, saving significant memory and CPU time. However, this algorithm must now be run iteratively a kin to how k-means clustering works. If enough iterations are run, all cluster centers could be found. However, convergence is not always guaranteed. As shown in the tSNE plots, some cluster centers are still suboptimal, and we are still seeking ways to filter out bad results.