#loading libraries
import numpy as np
import scipy as sp
import pandas as pd

# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
import bokeh.plotting

# For interfacing with the file system
import glob
import subprocess
import os
import time

import importlib
import bilge_pype as bpy

# pipes bokeh output to the notebook
bokeh.io.output_notebook()
# enables some logging output
bpy.init_log(level='INFO')


fastq_folder = 'fastq'
files = glob.glob(fastq_folder+'/*.fq')         # find list of fastq files to read
df_reads = bpy.load_ONT_fastq(files, low_mem=True, header=True) # reads fastq files and parses the header

pid[83575] 2020-10-09 12:31:17.771 INFO: Processing 0/50, current file = fastq/CO1_3.fq
pid[83575] 2020-10-09 12:31:17.772 INFO: elapse = 0.10s
pid[83575] 2020-10-09 12:31:19.772 INFO: Processing 20/50, current file = fastq/CO1_8.fq
pid[83575] 2020-10-09 12:31:19.773 INFO: elapse = 2.10s
pid[83575] 2020-10-09 12:31:21.850 INFO: Processing 40/50, current file = fastq/CO1_5.fq
pid[83575] 2020-10-09 12:31:21.851 INFO: elapse = 4.18s
pid[83575] 2020-10-09 12:31:22.802 INFO: adding ONT header info


df_reads.iloc[:5]


bpy.add_seq(df_reads.iloc[:5])


def plot_length_dist(ax, df, color = 'red', alpha = 0.5, label = '', by_counts = True):
    if by_counts:
        plt.hist(df['length'], bins = np.arange(0,50e3,100),
                 alpha = alpha, color = color, label = label)
        plt.ylabel('counts')
        plt.yscale('symlog')
    else:
        plt.hist(df['length'], bins = np.arange(0,50e3,100),
                 weights = df['length']/1e6, alpha = alpha, color = color, label = label)
        plt.ylabel('MB (nt)')
    plt.xlabel('read length (nt)')
    plt.xlim([0,50e3])

# print the distribution of read lengths
plt.figure(1, figsize=(10,3))

plt.subplot(1,2,1)
plot_length_dist(plt.gca(), df_reads, color='red')
plt.subplot(1,2,2)
plot_length_dist(plt.gca(), df_reads, color='red', by_counts=False)
plt.tight_layout()
plt.show()


cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell
cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-3000 -p primers.csv -o pseudo_ref_bad.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell

CompletedProcess(args='python3 ashure.py prfg -fq fastq/*.fq -fs 500-3000 -p primers.csv -o pseudo_ref_bad.csv.gz --low_mem -r', returncode=0)


# load the pseudo reference sequences
df_prf1 = pd.read_csv('pseudo_ref.csv.gz')
df_prf2 = pd.read_csv('pseudo_ref_bad.csv.gz')
# add length information
df_prf1['length'] = [len(i) for i in df_prf1['sequence']]
df_prf2['length'] = [len(i) for i in df_prf2['sequence']]
# plot the length distribution
plt.figure(1, figsize=(10,3))
plt.hist(df_prf2['length'], bins=np.arange(0,3000,10), color='red', alpha=0.5, label='500-3000bp')
plt.hist(df_prf1['length'], bins=np.arange(0,3000,10), color='blue', alpha=0.5, label='500-1200bp')
plt.legend()
plt.xlabel('sequence length (bp)')
plt.ylabel('frequency (count)')
plt.yscale('log')
plt.show()


# load true reference and pseudo reference databases
df_ref = bpy.read_fasta('ref_db.fa')
df_prf = pd.read_csv('pseudo_ref.csv.gz')
# align pseudo reference against the mock50
df_acc = bpy.run_minimap2(df_prf, df_ref, config='-k8 -w1', cleanup=True)
df_acc = bpy.get_best(df_acc,['query_id'], metric='AS', stat='idxmax')

pid[83575] 2020-10-09 12:34:00.800 INFO: Making directory ./minimap2/


def plot_acc(df_acc, haps):
    data = df_acc[df_acc['database_id'].isin(haps)]
    data = data.sort_values(by = ['database_id'])
    data['error'] = 1-data['similarity']

    # subsample to data to make swarmplot work
    df_sub = bpy.stats_subsample(data, col='database_id', N=100)
    # split into subplots swarm plot don't look crowded
    slist = np.array_split(haps,2)
    for i in range(0, len(slist)):
        plt.subplot(1,2,i+1)
        d = data[data['database_id'].isin(slist[i])]
        cmap = plt.cm.get_cmap('Dark2').colors
        bpy.mpl_box_whisker(plt.gca(), d[['database_id','error']],text_y=0.9, counts=True, cmap=cmap)
        bpy.mpl_violin(plt.gca(), d[['database_id','error']], face_color='white', edge_color='black')
        d = df_sub[df_sub['database_id'].isin(slist[i])]
        sns.swarmplot(x='error', y='database_id', data=d, alpha=1, color='Black', s=2)
        plt.ylabel('')
        if i == 0: plt.ylabel('Haplotypes')
        plt.xlim([0,1])
    plt.tight_layout()

# get the names of the mock50 haplotypes
haps = df_ref[['HAP' in i for i in df_ref['id']]]['id'].values
# make a violin/swarm plot
plt.figure(1, figsize=(10,5))
plot_acc(df_acc, haps)
plt.show()

/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 10.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 13.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 16.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 19.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 19.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 11.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 41.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 37.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 44.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 38.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 15.6% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot.
  warnings.warn(msg, UserWarning)


cmd = 'python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r'  # runs clustering for 10 iterations
subprocess.run(cmd, shell=True)

CompletedProcess(args='python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r', returncode=0)


# load the data
df_prfc = pd.read_csv('pseudo_clst.csv.gz')
df_prf = pd.read_csv('pseudo_ref.csv.gz')
df_prf['length'] = [len(i) for i in df_prf['sequence']]
df_prfc['length'] = [len(i) for i in df_prfc['sequence']]
# align pseudo reference against the mock50
df_acc2 = bpy.run_minimap2(df_prfc, df_ref, config='-k8 -w1', cleanup=True)
df_acc2 = bpy.get_best(df_acc2,['query_id'], metric='AS', stat='idxmax')
plt.figure(1, figsize=(10,5))
plot_acc(df_acc2, haps)

plt.figure(2, figsize=(12,3))
plt.hist(df_prf['length'], bins=np.arange(0,3000,10), color='red', label='unclustered')
plt.hist(df_prfc['length'], bins=np.arange(0,3000,10), color='blue', label='cluster centers')
plt.legend()
plt.xlabel('sequence length (bp)')
plt.ylabel('frequency (count)')
plt.yscale('symlog')
plt.show()

pid[83575] 2020-10-09 12:50:04.440 INFO: Making directory ./minimap2/


print('clst module produced ',len(df_prfc),' cluster centers')
print('original pseudo reference size = ',len(df_prf))
n1 = len(np.unique(df_acc[df_acc['database_id'].isin(haps)]['database_id']))
n2 = len(np.unique(df_acc2[df_acc2['database_id'].isin(haps)]['database_id']))
print('haplotypes before: ', n1)
print('haplotypes after: ', n2)

clst module produced  58  cluster centers
original pseudo reference size =  5600
haplotypes before:  42
haplotypes after:  33


def compute_pw(df):
    # note that -D in config remove entries for diagonal alignments.
    config = '-k15 -w10 -p 0.9 -D --dual=no'
    df_pw = bpy.run_minimap2(df, df, config=config, cleanup=True)
    # get the best alignment
    return bpy.get_best(df_pw, ['query_id','database_id'], metric='AS', stat='idxmax')


# load data and concatenate dataframes
col = ['id','sequence']
df_prf = pd.read_csv('pseudo_ref.csv.gz')[col]
df_cl = pd.read_csv('pseudo_clst.csv.gz')[col]
df_ref = bpy.read_fasta('ref_db.fa')[col]
# use only mock50
df_ref = df_ref[[('HAP' in i) for i in df_ref['id']]]
df = pd.concat([df_prf, df_cl, df_ref])
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw.csv.gz', index=False, compression='infer') # save the data

pid[90394] 2020-10-09 22:37:49.082 INFO: Making directory ./minimap2/


def get_cluster(dist, metric='precomputed'):
    # runs tsne on distance matrix
    tsne = bpy.run_TSNE(dist, metric=metric)
    # run hdbscan on tsne output
    hdb = bpy.cluster_HDBSCAN(tsne, metric='euclidean', min_samples=20, min_cluster_size=10)
    hdb = hdb.rename(columns={'cluster_id':'hdbscan_id','ordering':'hdbscan_order'})
    return tsne.merge(hdb, on='id', how='left')

def get_dist_matrix(df):
    # extract the pairwise distance matrix and use match_score as the metric
    # match score is the tot_matching_bp/seq_len
    m = bpy.get_feature_vector(df[['query_id','database_id','similarity']], symmetric=True)
    # make this matrix symmetric
    m = bpy.get_symmetric_matrix(m, sym_larger=False)
    # invert similarity to distance
    for i in range(0,len(m)):
        m.iloc[:,i] = 1-m.iloc[:,i]
        m.iat[i,i] = 0 # set diagonal values to zero
    return m


# load pairwise data
df = pd.read_csv('pseudo_pw.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne, hdscan, and optics on the data
tsne = get_cluster(m, metric='precomputed')
tsne.to_csv('pseudo_tsne.csv.gz', compression='infer', index=False)

pid[90394] 2020-10-09 22:38:04.053 INFO: running sklearn tsne with n_comp = 2
pid[90394] 2020-10-09 22:39:09.663 INFO: Running HDBSCAN
pid[90394] 2020-10-09 22:39:09.874 INFO: getting and ordering the results
pid[90394] 2020-10-09 22:39:09.894 INFO: number of clusters = 3
pid[90394] 2020-10-09 22:39:09.894 INFO: unclustered = 7


# load sequence data
prf = pd.read_csv('pseudo_ref.csv.gz')
prf['source'] = 'samples'
ref = bpy.read_fasta('ref_db.fa')
# use only mock50
ref = ref[[('HAP' in i) for i in ref['id']]]
ref['source'] = 'reference'
ref['ref_id'] = ref['id']
clst = pd.read_csv('pseudo_clst.csv.gz')
clst['source'] = 'clst'
clst['clst_id'] = clst['id']
# align reference database to pseudo reference and pseudo reference clusters to pseudo reference
A = bpy.run_minimap2(prf, ref, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
A = bpy.get_best(A,['id'],metric='match_score',stat='idxmax')
B = bpy.run_minimap2(prf, clst, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
B = bpy.get_best(B,['id'],metric='match_score',stat='idxmax')
A = A.rename(columns={'database_id':'ref_id','match_score':'ref_match'})
B = B.rename(columns={'database_id':'clst_id','match_score':'clst_match'})
# merge alignment data
prf = prf.merge(A[['id','ref_id','ref_match']], on='id', how='left')
prf = prf.merge(B[['id','clst_id','clst_match']], on='id', how='left')
df = pd.concat([prf,ref,clst])
# merge into tsne
tsne = pd.read_csv('pseudo_tsne.csv.gz')
tsne = tsne.merge(df, on='id', how='left')
# add colors to ref_id
col = 'ref_id'
rid = np.unique(tsne[col].dropna())
cmap = bokeh.palettes.Category20b_20
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'ref_id_color'])
tsne = tsne.merge(colors,on='ref_id',how='left')
# add colors to clst_id
col = 'clst_id'
rid = np.unique(tsne[col].dropna())
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'clst_id_color'])
tsne = tsne.merge(colors,on='clst_id',how='left')
tsne = tsne.drop(columns=['sequence','quality'])

pid[90463] 2020-10-09 22:44:30.125 INFO: Making directory ./minimap2/
pid[90463] 2020-10-09 22:44:34.855 INFO: Making directory ./minimap2/


TOOLTIPS = [('id','@id'),
            ('ref_id', '@ref_id'),
            ('ref_match','@ref_match'),
            ('clst_id','@clst_id'),
            ('clst_match','@clst_match'),
            ('hdbscan_id','@hdbscan_id'),
            ('optics_id','@optics_id')]
p = bokeh.plotting.figure(output_backend='webgl',plot_width=600, plot_height=500, tooltips=TOOLTIPS)

d = tsne[tsne['source']=='samples']
# plot samples colored by clst_id
p.circle('f_0', 'f_1', color='clst_id_color', size=5, source=d)

# plot samples colored by ref_id
#p.circle('f_0', 'f_1', color='ref_id_color', size=5, source=d)

# plot cluster centers
d = tsne[tsne['source']=='clst']
p.square('f_0', 'f_1', line_color='black', fill_color='red', size=6, source=d)
d = tsne[tsne['source']=='reference']
p.diamond('f_0', 'f_1', line_color='black', fill_color='blue', size=10, source=d)

bokeh.plotting.output_file('../docs/pages/ashure/pseudo_tsne_1.html')
bokeh.plotting.save(p)
#bokeh.plotting.show(p)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/pseudo_tsne_1.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/pseudo_tsne_1.html'


TOOLTIPS = [('id','@id'),
            ('ref_id', '@ref_id'),
            ('ref_match','@ref_match'),
            ('clst_id','@clst_id'),
            ('clst_match','@clst_match'),
            ('hdbscan_id','@hdbscan_id'),
            ('optics_id','@optics_id')]
p = bokeh.plotting.figure(output_backend='webgl', plot_width=600, plot_height=500, tooltips=TOOLTIPS)

# plot samples colored by alignment score
d = tsne[tsne['source']=='samples']
mapper = bokeh.transform.linear_cmap(field_name='ref_match', palette=bokeh.palettes.Turbo256, low=0.8, high=1)
color_bar = bokeh.models.ColorBar(color_mapper=mapper['transform'], width=8,  location=(0,0))
p.circle('f_0', 'f_1', color=mapper, size=5, source=d)

# plot cluster centers
d = tsne[tsne['source']=='clst']
p.square('f_0', 'f_1', line_color='black', fill_color='red', size=6, source=d)
d = tsne[tsne['source']=='reference']
p.diamond('f_0', 'f_1', line_color='black', fill_color='blue', size=10, source=d)

p.add_layout(color_bar, 'right')

bokeh.plotting.output_file('../docs/pages/ashure/pseudo_tsne_2.html')
bokeh.plotting.save(p)
#bokeh.plotting.show(p)

INFO:bokeh.io.state:Session output file '../docs/pages/ashure/pseudo_tsne_2.html' already exists, will be overwritten.

'/Users/zchen/Desktop/ashure/docs/pages/ashure/pseudo_tsne_2.html'


clst[clst['id']=='cluster10'].values

array([['cluster10',
        'TGGCTCGGGTGTCTCAATGATGGAGATTACTGTAAACTTCAGGGTGACCAAAGAATCAGAATAAGTGTTGGTAAAGAATTGGGTCTCCTCCTCCAGCAGGGTCAAAAATGAGGTATTTAAGTTTCGGTCTGTTAAAAGTATAGTGATAGCTCCAGCTAAAACAGGGAGTGACAAAGCAATAGTACAGCTGTAATAACAACTGATCATACAAATAAAGGTATTCGGTCAAAAGTAATTCCATTTGATCGTATATTAATAATAGTTGTAATAAAATTTACAGCTCCTAAAATAGAGGAAATTCCGGCTAAATGTAAAAAAAAATAGCTAAATCAACGGAAGCTCCGGCATGAGCAATTCCAGAGGATAAAGGAGGGTAGACTGTTCATCCTGTTCCAGCTCCTGCTTCTACTATACTACTTGCTAATAAAAAGAGTTAAAGAAGGAGGTAACATTCAAAAACTTATATTATTTATTCGAGGAAAGGCCATATCAGGAGCTCCTAATATTAAAGGCACTAATCAATTTCCAAATCCCCAATTATAATAGGTATTACTATAAAAAAAATTATGACAAAAGCATGAGCTGTAACAATAACATTATAAATTTGATCATCACCAATAAGGATCCGGGATGACCTAATTCAGCTCGAATTAGTATTCTAAGAGAAGTTCCTACTATTCCTGCTCAAGCTCCAAAATGAAGTATAAAGTTCCGATGTCTTTATGATTTGTTGACCGTTCACCCGCGTAACTACCAAGTAATACCACG',
        False, 'clst', 'cluster10']], dtype=object)


c = [('1a27a9' in rid) for rid in prf['id']]
prf[c].values

array([['1a27a932-c851-4f13-9853-c55a56e50ba3_rs207_1064',
        'TGGCTCGGGGTGTCTCCAATGATGGAGATTACCAAACTTCAGGTGACCAAAAACCAAAACAGTTATATAATACGGACACCTCCGATTGAGGGGTCAAAAATGAAGTATTTAAATTTCTATCAGTTAACAACCAGTAATAGCAACGCCAGCCAATACAGTAAAACTTGCATTAATAATAAAAAAAGCTGTACTAAACACTGACCATACAAATAAAGGTAATTTATGGAAAGTTAATCCCGGTGCTCGGCATTATAGACAGTAATAAAAAAATTCAATTGAACCTAATAATGCATTTTAGAAAATAGCCAAATGTGTAAACCAAAAATAGCGCTAAATCAACAGAAGGTCCTGAATGCGCTTGAACACTAGATAATGGTGGATATACAGTCCAACCTGTACCTGCAATCTCAGATTCTACACAGTTGATGAAACTAATAAAAAGTGATGGAGGTAATAACCAAAACTAATATTATTCATTCTAGGAAAAGCCATCTGGAGCACCTTAACATTAAAGGAATAAACCAATTACCAAAACCCCAATTAAAATAGGCATAACCATAAAAAAATATCATAATAAAAAAAGCATGAGCAGTTACTACAACATTATATAATTGATGATTTCCCCATAAAAAATTTGATTTCCAGGTTGTGCTAATTCCATACGGATTAATACAGATACGTAGTACCCACTACACCTCGAAAAAAGCAATCTAATGTTAGGTGCTAATTCCATACGGATTAATACAGATAATGTAGTACCTACTACACCCGAAAAAGCACCAAAATCATATATAGAGTACCGATATCTTTATGGATTTGTTGACCGTTCACCCGCGTAACTACCAAGTAATACCACG',
        "-13---(,01*12'%'*,4+3433521+.)&$$&0/'(*'%'(%)64-+587)0-,32)&%#$'&)('.*(0%3*)&$%'%)(%&&&$%%*-/(*)778*4363/-/68255,/3,&%'&''+(.'.##'$&)'&1*%&$#&*+,,&)+)+*+'%$%%$(300*+'%$,%'(.%(4(/6782/,--*)%)$$-7/**-(-44/1)*)12)')*&%'')(++(&%')'62&(+),(,*./0.&%$%#$%++0,22/0&-&%',&+783+&())%&+(*-131(*1()*.'&%((323-/,0.*$%$$&+0+''())47/4,,885*-(&&(($'..&*)-%&*/635675067111***+001425.,*.**,.*2.-1-42001-132-/412)+,+),'&&$)+%&&)-.+-()*)&+%))$+-21038122631,32,+&'(2224/2++5+.7,-'-731),-/)*+5-632584+**2*252*()$'&$$%0(+*/$'(.20.,/,)/1(,74078.1-,+*-.-+,60+.76-/,.,,)(&'&.-/..34+21/*4885(-+/,,+*,/+,85-(%),,6.10/,.)(%/$',*31*424330.1.0/43/**-,'*/.--*.68850&1+*'*+++0335621..*,*4+2321/00312)-(-(,57,,*/(-+4/-,/0((%&&#&$'$%**266,*$$&/(*,5.0/1+),+(((&(./+2+4+**/)2+4+/+046-+,1-+*(*(%*)+/0/15('',,/*466,*++.-3880-42235555220-01(53301()-('(+-.27300.427342/+*+41/./-0)(',&%'.&&(*'*-+*(&",
        'samples', 'HAP39', 0.6971326164874552, 'cluster2',
        0.8320090805902384]], dtype=object)

	id	seek1	rlen	seek2	filename	header	read	ch	start_time	length
0	c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9	150	2077	2230	fastq/CO1_3.fq	c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 runid=477...	7106	313	2019-02-06 05:20:32+00:00	2077
1	159d7ae4-7fa9-41f1-bda8-e7575dca0f27	4458	2724	7185	fastq/CO1_3.fq	159d7ae4-7fa9-41f1-bda8-e7575dca0f27 runid=477...	8990	342	2019-02-06 05:22:47+00:00	2724
2	44577c37-ad4b-41e7-9c42-303211cd7a20	10059	2145	12207	fastq/CO1_3.fq	44577c37-ad4b-41e7-9c42-303211cd7a20 runid=477...	8741	64	2019-02-06 05:22:57+00:00	2145
3	11d707b1-3c82-452c-9559-feaf359d2548	14501	2552	17056	fastq/CO1_3.fq	11d707b1-3c82-452c-9559-feaf359d2548 runid=477...	8958	7	2019-02-06 05:28:13+00:00	2552
4	8d647bc9-264d-477e-93d8-2139fbe460fa	19760	1894	21657	fastq/CO1_3.fq	8d647bc9-264d-477e-93d8-2139fbe460fa runid=477...	10585	135	2019-02-06 05:25:47+00:00	1894

	id	header	read	ch	start_time	length	sequence	quality
0	c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9	c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 runid=477...	7106	313	2019-02-06 05:20:32+00:00	2077	ATGTTGTGCTTCGTTCGATTTACGTATTGCTGGCGTCTGTAGATCA...	#%#''%((),,(,0)%-%&//2)-0'('())0+*&##$,0.(...
1	159d7ae4-7fa9-41f1-bda8-e7575dca0f27	159d7ae4-7fa9-41f1-bda8-e7575dca0f27 runid=477...	8990	342	2019-02-06 05:22:47+00:00	2724	ATCAGTATTGCTTCTACCGTTCAGTTTACGTATTGCTGGCGTCTGC...	$)%(#%#,'&'--(%#$%'/22(.)6'''57.0.+,.,4../1$#&...
2	44577c37-ad4b-41e7-9c42-303211cd7a20	44577c37-ad4b-41e7-9c42-303211cd7a20 runid=477...	8741	64	2019-02-06 05:22:57+00:00	2145	ATCATTGCTTCGTTCTGGTTGCGTTTGCTGGCGTCTGTAGATCTTT...	'('(-()/)'')(%$%%'(%'&,.&'%((((,,*)(&&(('$%(...
3	11d707b1-3c82-452c-9559-feaf359d2548	11d707b1-3c82-452c-9559-feaf359d2548 runid=477...	8958	7	2019-02-06 05:28:13+00:00	2552	ACCAGTACTTCGTTCAGTTACGTATTGCGGCGTGCTTATCAACGTA...	$%$$'((-10+15-'-6,+-455)))-2/-.-*/)--,'%$$(#...
4	8d647bc9-264d-477e-93d8-2139fbe460fa	8d647bc9-264d-477e-93d8-2139fbe460fa runid=477...	10585	135	2019-02-06 05:25:47+00:00	1894	ATCCAGTAACTGCTTCGTTCAGTTACGTACTTGGCGTCATAGCGTC...	$&'$%$%%%'&1,40203-/,6++)+'&,%,&().-$%%%&%(&...

Generating a pseudo reference database¶

Loading data into python¶

Searching for reference sequences¶

Clustering and compressing the pseudo reference¶

Detection of rare haplotypes¶

Visualizing the pseudo reference clusters¶

Getting pairwise distance information¶

Plotting the TSNE representation of the distance relationships¶