#loading libraries
import numpy as np
import scipy as sp
import pandas as pd

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
import bokeh.plotting
import bokeh.layouts

# For interfacing with the file system
import glob
import subprocess
import os
import time

import importlib
import bilge_pype as bpy

# pipes bokeh output to the notebook
bokeh.io.output_notebook()
# enables some logging output
bpy.init_log(level='INFO')


cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell

CompletedProcess(args='python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r', returncode=0)


cmd = 'python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r'  # runs clustering for 10 iterations
subprocess.run(cmd, shell=True)

CompletedProcess(args='python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r', returncode=0)


def compute_pw(df):
    # note that -D in config remove entries for diagonal alignments.
    config = '-k15 -w10 -p 0.9 -D --dual=no'
    df_pw = bpy.run_minimap2(df, df, config=config, cleanup=True)
    # get the best alignment
    return bpy.get_best(df_pw, ['query_id','database_id'], metric='AS', stat='idxmax')


# load data and concatenate dataframes
df_prf = pd.read_csv('pseudo_ref.csv.gz')[['id','sequence']]
df_cl = pd.read_csv('pseudo_clst.csv.gz')[['id','sequence']]
df_ref = bpy.read_fasta('ref_db.fa')[['id','sequence']]
# use only mock50
df_ref = df_ref[[('HAP' in i) for i in df_ref['id']]]
df = pd.concat([df_prf, df_cl, df_ref])
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw.csv.gz', index=False, compression='infer') # save the data

pid[90550] 2020-10-09 22:59:43.868 INFO: Making directory ./minimap2/


def get_dist_matrix(df):
    # extract the pairwise distance matrix and use match_score as the metric
    m = bpy.get_feature_vector(df[['query_id','database_id','similarity']], symmetric=True)
    # make this matrix symmetric
    m = bpy.get_symmetric_matrix(m, sym_larger=False)
    # invert similarity to distance
    for i in range(0,len(m)):
        m.iloc[:,i] = 1-m.iloc[:,i]
        m.iat[i,i] = 0 # set diagonal values to zero
    return m

def get_cluster(dist, metric='precomputed', alt=False, min_samples=None):
    # runs tsne on distance matrix
    tsne = bpy.run_TSNE(dist, metric=metric)
    # run optics
    optics = bpy.cluster_OPTICS(dist, metric=metric, alt_label=alt, min_samples=min_samples)
    optics = optics.rename(columns={'cluster_id':'optics_id','ordering':'optics_order'})
    # merge the data
    tsne = tsne.merge(optics, on='id', how='left')
    return tsne


# load pairwise data
df = pd.read_csv('pseudo_pw.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne, hdscan, and optics on the data
tsne = get_cluster(m, metric='precomputed', min_samples=11)
tsne.to_csv('pseudo_tsne.csv.gz', compression='infer', index=False)

pid[90550] 2020-10-09 22:59:55.891 INFO: running sklearn tsne with n_comp = 2
pid[90550] 2020-10-09 23:01:20.680 INFO: Running OPTICS
pid[90550] 2020-10-09 23:01:21.033 INFO: max_eps = 0.5
pid[90550] 2020-10-09 23:01:21.034 INFO: clust_OPTICS: iter=0 using min_samples=11
pid[90550] 2020-10-09 23:01:28.544 INFO: clust_OPTICS: clusters=25 outliers=2158 delta=5.5
pid[90550] 2020-10-09 23:01:28.546 INFO: n_clusters=25 n_unclustered=2158 N=5639


def match_haplotypes(pfile='',rfile='',cfile='',tfile=''):
    # load sequence data
    prf = pd.read_csv(pfile)
    prf['source'] = 'samples'
    ref = bpy.read_fasta(rfile)
    # use only mock50
    ref = ref[[('HAP' in i) for i in ref['id']]]
    ref['source'] = 'reference'
    clst = pd.read_csv(cfile)
    clst['source'] = 'clst'
    cols = ['id','sequence','source']
    df = pd.concat([prf[cols],ref[cols],clst[cols]])
    # align sequences to haplotypes and cluster centers
    A = bpy.run_minimap2(df, ref, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
    A = bpy.get_best(A,['id'],metric='AS',stat='idxmax')
    B = bpy.run_minimap2(df, clst, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
    B = bpy.get_best(B,['id'],metric='AS',stat='idxmax')
    A = A.rename(columns={'database_id':'ref_id','similarity':'ref_match'})
    B = B.rename(columns={'database_id':'clst_id','similarity':'clst_match'})
    # merge data
    df = df.merge(A[['id','ref_id','ref_match']], on='id', how='left')
    df = df.merge(B[['id','clst_id','clst_match']], on='id', how='left')
    # merge into tsne
    tsne = pd.read_csv(tfile)
    tsne = tsne.merge(df, on='id', how='left')
    # add colors to ref_id
    col = 'ref_id'
    rid = np.unique(tsne[col].dropna())
    cmap = bokeh.palettes.Category20b_20
    colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
    colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'ref_id_color'])
    tsne = tsne.merge(colors,on='ref_id',how='left')
    # add colors to clst_id
    col = 'clst_id'
    rid = np.unique(tsne[col].dropna())
    colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
    colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'clst_id_color'])
    tsne = tsne.merge(colors,on='clst_id',how='left')
    return tsne


pfile = 'pseudo_ref.csv.gz'
rfile = 'ref_db.fa'
cfile = 'pseudo_clst.csv.gz'
tfile = 'pseudo_tsne.csv.gz'
tsne = match_haplotypes(pfile, rfile, cfile, tfile)

pid[90550] 2020-10-10 19:19:14.672 INFO: Making directory ./minimap2/
pid[90550] 2020-10-10 19:19:19.127 INFO: Making directory ./minimap2/


def make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=2, s2=5, s3=7, color='clst_id_color', R=[0,1]):
    TOOLTIPS = [('id','@id'),
            ('ref_id', '@ref_id'),
            ('ref_match','@ref_match'),
            ('clst_id','@clst_id'),
            ('clst_match','@clst_match'),
            ('optics_id','@optics_id'),
            ('reachability','@reachability')]
    p = bokeh.plotting.figure(output_backend='webgl', tooltips=TOOLTIPS)
    
    d = tsne[tsne['source']=='samples']
    if color in ['ref_match','clst_match','ref_error','cl_error','reachability']:
        # establish colormap for numerical data
        mapper = bokeh.transform.linear_cmap(field_name=color, palette=bokeh.palettes.Turbo256, low=R[0], high=R[1])
        color_bar = bokeh.models.ColorBar(color_mapper=mapper['transform'], width=8,  location=(0,0))
        p.circle(x_axis, y_axis, color=mapper, size=s1, source=d)
        p.add_layout(color_bar, 'right')
    else:
        # colorize by category
        p.circle(x_axis, y_axis, color=color, size=s1, source=d)
    
    # plot cluster centers
    d = tsne[tsne['source']=='clst']
    p.square(x_axis, y_axis, line_color='black', fill_color='red', size=s2, source=d)
    d = tsne[tsne['source']=='reference']
    p.diamond(x_axis, y_axis, line_color='black', fill_color='blue', size=s3, source=d)
    return p


p = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=4, s2=5, s3=7, color='clst_id_color')
p.xaxis.axis_label = 'tSNE1'
p.yaxis.axis_label = 'tSNE2'
p.title.text = 'Colorized by identity of cluster center'

grid = bokeh.layouts.gridplot([[p]], plot_width=600, plot_height=500)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_1.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_1.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_1.html'


p1 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='reachability')
p1.xaxis.axis_label = 'OPTICS ordering'
p1.yaxis.axis_label = 'reachability'
p1.title.text = 'A) Reachability vs ordering'

tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='ref_error')
p2.xaxis.axis_label = 'OPTICS ordering'
p2.yaxis.axis_label = 'read error'
p2.title.text = 'B) Error profile vs ordering'

# share x-axis between p1 and p2
p2.x_range = p1.x_range

grid = bokeh.layouts.gridplot([[p1],[p2]], plot_width=800, plot_height=200)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_2a.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_2a.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_2a.html'


# load data
df = pd.read_csv('pseudo_ref.csv.gz')
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw_unbiased.csv.gz', index=False, compression='infer') # save the data


# load pairwise data
df = pd.read_csv('pseudo_pw_unbiased.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne and optics on the data
tsne = get_cluster(m, metric='precomputed', min_samples=None)
tsne.to_csv('pseudo_tsne_unbiased.csv.gz', compression='infer', index=False)

pid[90550] 2020-10-10 19:21:04.177 INFO: running sklearn tsne with n_comp = 2
pid[90550] 2020-10-10 19:22:08.381 INFO: Running OPTICS
pid[90550] 2020-10-10 19:22:08.735 INFO: max_eps = 0.5
pid[90550] 2020-10-10 19:22:08.736 INFO: clust_OPTICS: iter=0 using min_samples=2763
pid[90550] 2020-10-10 19:22:10.033 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=1381.5
pid[90550] 2020-10-10 19:22:10.034 INFO: clust_OPTICS: iter=1 using min_samples=1382

/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers.
  warnings.warn("All reachability values are inf. Set a larger"

pid[90550] 2020-10-10 19:22:10.926 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=1381
pid[90550] 2020-10-10 19:22:10.928 INFO: clust_OPTICS: iter=2 using min_samples=692

/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers.
  warnings.warn("All reachability values are inf. Set a larger"

pid[90550] 2020-10-10 19:22:11.707 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=690
pid[90550] 2020-10-10 19:22:11.709 INFO: clust_OPTICS: iter=3 using min_samples=347

/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers.
  warnings.warn("All reachability values are inf. Set a larger"

pid[90550] 2020-10-10 19:22:12.398 INFO: clust_OPTICS: clusters=0 outliers=5526 delta=345
pid[90550] 2020-10-10 19:22:12.400 INFO: clust_OPTICS: iter=4 using min_samples=175

/usr/local/lib/python3.8/site-packages/sklearn/cluster/_optics.py:501: UserWarning: All reachability values are inf. Set a larger max_eps or all data will be considered outliers.
  warnings.warn("All reachability values are inf. Set a larger"

pid[90550] 2020-10-10 19:22:13.084 INFO: clust_OPTICS: clusters=2 outliers=5516 delta=172
pid[90550] 2020-10-10 19:22:13.109 INFO: clust_OPTICS: iter=5 using min_samples=89
pid[90550] 2020-10-10 19:22:14.142 INFO: clust_OPTICS: clusters=9 outliers=5019 delta=86
pid[90550] 2020-10-10 19:22:14.143 INFO: clust_OPTICS: iter=6 using min_samples=46
pid[90550] 2020-10-10 19:22:16.410 INFO: clust_OPTICS: clusters=13 outliers=3903 delta=43
pid[90550] 2020-10-10 19:22:16.417 INFO: clust_OPTICS: iter=7 using min_samples=25
pid[90550] 2020-10-10 19:22:21.523 INFO: clust_OPTICS: clusters=19 outliers=2668 delta=21
pid[90550] 2020-10-10 19:22:21.524 INFO: clust_OPTICS: iter=8 using min_samples=15
pid[90550] 2020-10-10 19:22:31.875 INFO: clust_OPTICS: clusters=23 outliers=1585 delta=10
pid[90550] 2020-10-10 19:22:31.877 INFO: clust_OPTICS: iter=9 using min_samples=10
pid[90550] 2020-10-10 19:22:51.210 INFO: clust_OPTICS: clusters=23 outliers=734 delta=5
pid[90550] 2020-10-10 19:22:51.211 INFO: clust_OPTICS: iter=10 using min_samples=8
pid[90550] 2020-10-10 19:23:22.104 INFO: clust_OPTICS: clusters=9 outliers=212 delta=2
pid[90550] 2020-10-10 19:23:22.105 INFO: clust_OPTICS: iter=11 using min_samples=11
pid[90550] 2020-10-10 19:23:38.472 INFO: clust_OPTICS: clusters=24 outliers=852 delta=1
pid[90550] 2020-10-10 19:23:38.473 INFO: clust_OPTICS: iter=12 using min_samples=12
pid[90550] 2020-10-10 19:23:53.060 INFO: clust_OPTICS: clusters=23 outliers=1002 delta=-1
pid[90550] 2020-10-10 19:23:53.061 INFO: clust_OPTICS: iter=13 using min_samples=10
pid[90550] 2020-10-10 19:24:12.309 INFO: clust_OPTICS: clusters=23 outliers=734 delta=-1
pid[90550] 2020-10-10 19:24:12.310 INFO: clust_OPTICS: iter=14 using min_samples=9
pid[90550] 2020-10-10 19:24:35.395 INFO: clust_OPTICS: clusters=20 outliers=530 delta=1
pid[90550] 2020-10-10 19:24:35.397 INFO: n_clusters=24 n_unclustered=852 N=5526


# compute error profile of each read
pfile = 'pseudo_ref.csv.gz'
rfile = 'ref_db.fa'
cfile = 'pseudo_clst.csv.gz'
tfile = 'pseudo_tsne_unbiased.csv.gz'
tsne = match_haplotypes(pfile, rfile, cfile, tfile)

pid[90550] 2020-10-10 19:24:35.656 INFO: Making directory ./minimap2/
pid[90550] 2020-10-10 19:24:40.146 INFO: Making directory ./minimap2/


p1 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1', s1=3, s2=5, s3=7, color='ref_id_color')
p1.xaxis.axis_label = 'tSNE1'
p1.yaxis.axis_label = 'tSNE2'
p1.title.text = 'A) haplotype identity'

tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1',s1=3,s2=5,s3=7,color='ref_error',R=[0,0.2])
p2.xaxis.axis_label = 'tSNE1'
p2.yaxis.axis_label = 'tSNE2'
p2.title.text = 'B) read error'

p3 = make_bokeh_scatter(tsne, x_axis='f_0', y_axis='f_1',s1=3,s2=5,s3=7,color='reachability',R=[0,0.4])
p3.xaxis.axis_label = 'tSNE1'
p3.yaxis.axis_label = 'tSNE2'
p3.title.text = 'C) reachability'

# share x and y axis between p1 and p2
p2.x_range = p1.x_range
p2.y_range = p1.y_range
p3.x_range = p1.x_range
p3.y_range = p1.y_range

grid = bokeh.layouts.gridplot([[p1,p2,p3]], plot_width=400, plot_height=350)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_3.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_3.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_3.html'


p1 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='reachability', color='ref_id_color')
p1.xaxis.axis_label = 'OPTICS ordering'
p1.yaxis.axis_label = 'reachability'
p1.title.text = 'A) Reachability vs ordering'

tsne['ref_error'] = 1-tsne['ref_match']
p2 = make_bokeh_scatter(tsne, x_axis='optics_order', y_axis='ref_error', color='ref_id_color')
p2.xaxis.axis_label = 'OPTICS ordering'
p2.yaxis.axis_label = 'read error'
p2.title.text = 'B) Error profile vs ordering'

# share x-axis between p1 and p2
p2.x_range = p1.x_range

grid = bokeh.layouts.gridplot([[p1],[p2]], plot_width=800, plot_height=200)
bokeh.plotting.output_file('../docs/pages/ashure/clustering_tsne_4a.html')
bokeh.plotting.save(grid)
#bokeh.plotting.show(grid)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/clustering_tsne_4a.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/clustering_tsne_4a.html'

Clustering sequence data with OPTICS¶

Introduction¶

Loading the dataset¶

Visualizing sequence data with t-SNE¶

Correlation between reachability and error profile of reads¶

tSNE without haplotype and cluster center bias¶

tSNE plot of sequences colorized by haplotype identity¶

Reachability plot colorized by haplotype identity¶

OPTICS on large datasets and room for improvement¶