#loading libraries
import numpy as np
import scipy as sp
import pandas as pd
# plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns
import bokeh
import bokeh.plotting
# For interfacing with the file system
import glob
import subprocess
import os
import time
import importlib
import bilge_pype as bpy
# pipes bokeh output to the notebook
bokeh.io.output_notebook()
# enables some logging output
bpy.init_log(level='INFO')
After running the basecaller on fast5 files, you will have a folder containing basecalled fastq reads. bilge_pype.py
contains a few useful functions for reading and writing fastq and fasta files. As some of these files could be quite large, we use the low_mem=True
option to save on RAM when running this notebook, but still allow us to fetch the sequence and quality strings.
fastq_folder = 'fastq'
files = glob.glob(fastq_folder+'/*.fq') # find list of fastq files to read
df_reads = bpy.load_ONT_fastq(files, low_mem=True, header=True) # reads fastq files and parses the header
pid[83575] 2020-10-09 12:31:17.771 INFO: Processing 0/50, current file = fastq/CO1_3.fq pid[83575] 2020-10-09 12:31:17.772 INFO: elapse = 0.10s pid[83575] 2020-10-09 12:31:19.772 INFO: Processing 20/50, current file = fastq/CO1_8.fq pid[83575] 2020-10-09 12:31:19.773 INFO: elapse = 2.10s pid[83575] 2020-10-09 12:31:21.850 INFO: Processing 40/50, current file = fastq/CO1_5.fq pid[83575] 2020-10-09 12:31:21.851 INFO: elapse = 4.18s pid[83575] 2020-10-09 12:31:22.802 INFO: adding ONT header info
df_reads.iloc[:5]
id | seek1 | rlen | seek2 | filename | header | read | ch | start_time | length | |
---|---|---|---|---|---|---|---|---|---|---|
0 | c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 | 150 | 2077 | 2230 | fastq/CO1_3.fq | c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 runid=477... | 7106 | 313 | 2019-02-06 05:20:32+00:00 | 2077 |
1 | 159d7ae4-7fa9-41f1-bda8-e7575dca0f27 | 4458 | 2724 | 7185 | fastq/CO1_3.fq | 159d7ae4-7fa9-41f1-bda8-e7575dca0f27 runid=477... | 8990 | 342 | 2019-02-06 05:22:47+00:00 | 2724 |
2 | 44577c37-ad4b-41e7-9c42-303211cd7a20 | 10059 | 2145 | 12207 | fastq/CO1_3.fq | 44577c37-ad4b-41e7-9c42-303211cd7a20 runid=477... | 8741 | 64 | 2019-02-06 05:22:57+00:00 | 2145 |
3 | 11d707b1-3c82-452c-9559-feaf359d2548 | 14501 | 2552 | 17056 | fastq/CO1_3.fq | 11d707b1-3c82-452c-9559-feaf359d2548 runid=477... | 8958 | 7 | 2019-02-06 05:28:13+00:00 | 2552 |
4 | 8d647bc9-264d-477e-93d8-2139fbe460fa | 19760 | 1894 | 21657 | fastq/CO1_3.fq | 8d647bc9-264d-477e-93d8-2139fbe460fa runid=477... | 10585 | 135 | 2019-02-06 05:25:47+00:00 | 1894 |
id
is a unique identifier string for each fastq read.
ch
is the channel where the read originated from.
start_time
is the acquisition time of read.
length
is the length of the sequence.
Apply add_seq()
to df_reads
to get the sequence and quality information.
bpy.add_seq(df_reads.iloc[:5])
id | header | read | ch | start_time | length | sequence | quality | |
---|---|---|---|---|---|---|---|---|
0 | c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 | c8ad9dfe-51cb-4396-8b2d-2acdcd8e22f9 runid=477... | 7106 | 313 | 2019-02-06 05:20:32+00:00 | 2077 | ATGTTGTGCTTCGTTCGATTTACGTATTGCTGGCGTCTGTAGATCA... | #%#''%(()*,,(*,0)%-*%&//2)-0'('*())0+*&##$,0.(... |
1 | 159d7ae4-7fa9-41f1-bda8-e7575dca0f27 | 159d7ae4-7fa9-41f1-bda8-e7575dca0f27 runid=477... | 8990 | 342 | 2019-02-06 05:22:47+00:00 | 2724 | ATCAGTATTGCTTCTACCGTTCAGTTTACGTATTGCTGGCGTCTGC... | $)%(#%#,'&'--(%#$%'/22(.)6'''57.0.+,.,4../1$#&... |
2 | 44577c37-ad4b-41e7-9c42-303211cd7a20 | 44577c37-ad4b-41e7-9c42-303211cd7a20 runid=477... | 8741 | 64 | 2019-02-06 05:22:57+00:00 | 2145 | ATCATTGCTTCGTTCTGGTTGCGTTTGCTGGCGTCTGTAGATCTTT... | '('(-*()/)'')(%$%%'(%'&,.*&'%((((,,*)(&&(('$%(... |
3 | 11d707b1-3c82-452c-9559-feaf359d2548 | 11d707b1-3c82-452c-9559-feaf359d2548 runid=477... | 8958 | 7 | 2019-02-06 05:28:13+00:00 | 2552 | ACCAGTACTTCGTTCAGTTACGTATTGCGGCGTGCTTATCAACGTA... | $%$$'((*-10+15-'-6,+-45*5)))-2/-.-*/)--,'%$$(#... |
4 | 8d647bc9-264d-477e-93d8-2139fbe460fa | 8d647bc9-264d-477e-93d8-2139fbe460fa runid=477... | 10585 | 135 | 2019-02-06 05:25:47+00:00 | 1894 | ATCCAGTAACTGCTTCGTTCAGTTACGTACTTGGCGTCATAGCGTC... | $&'$%$%%%'*&*1,40203-/,6++)+'&,%,&().-$%%%&%(&... |
ashure.py
uses reference sequence to search for concatemers in PCR products generated by rolling circle amplification. These repeats are multi-aligned with spoa
to generate a more error free consensus read. In the absence of a reference database, we would need a best guess at what we are sequencing. This is our pseudo reference database, which can be generated by performing a paired end search for primer sequences in the basecalled reads.
Searching every read for forward and reverse primer pairs is time intensive and unnecessary. PCR amplifies the original templates, making the reference over represented. We can filter out large repetitive fragments and just examine a subset that are likely to contain our reference sequences. For our study, we focus only on reads 500-1200bp long because these reads are likely to contain only one concatemer of CO1.
For users who are using short primers <20bp, the paired end primer search may fail because of read corruption. This is a limitation of minimap2, which was not design to align highly errored short sequences. To address this shortcoming, we recommend the following potential fixes to our users:
1) Augment primer sequences by combining the forward and reverse primer sequences. Use this as the new fwd and reverse sequence for the paired end search. pfwd = ATGC prev = GGGG new-pfwd = CCCC ATGC new-prev = GCAT GGGG
2) If the identity of the gene being sequenced is known, augment the primer sequences by add 40-60bp from both of ends of the gene being sequenced. The drawback of this approach is that the paired primer search becomes biased by the addition of the user sequence. The pseudo reference database may not contain all the gene families you are interested in examining ,and you may need to reexamine the reads which did not yield any hits in the first run of ashure.py
in order to make sure your sequencing results are complete.
def plot_length_dist(ax, df, color = 'red', alpha = 0.5, label = '', by_counts = True):
if by_counts:
plt.hist(df['length'], bins = np.arange(0,50e3,100),
alpha = alpha, color = color, label = label)
plt.ylabel('counts')
plt.yscale('symlog')
else:
plt.hist(df['length'], bins = np.arange(0,50e3,100),
weights = df['length']/1e6, alpha = alpha, color = color, label = label)
plt.ylabel('MB (nt)')
plt.xlabel('read length (nt)')
plt.xlim([0,50e3])
# print the distribution of read lengths
plt.figure(1, figsize=(10,3))
plt.subplot(1,2,1)
plot_length_dist(plt.gca(), df_reads, color='red')
plt.subplot(1,2,2)
plot_length_dist(plt.gca(), df_reads, color='red', by_counts=False)
plt.tight_layout()
plt.show()
The cytochrome oxidase subunit 1 (CO1) gene is about 700bp. Size filtering is used to filter out concatemers. We want to focus on non-repeated reads because there is a chance some reads are so corrupted that the primer sequence could not be found in the correct location. You could end up with concatemers instead of a single full length CO1.
To illustrate this, lets run ashure.py
with length filtering from 500-1200bp and 500-3000.
cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-1200 -p primers.csv -o pseudo_ref.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell
cmd = 'python3 ashure.py prfg -fq fastq/*.fq -fs 500-3000 -p primers.csv -o pseudo_ref_bad.csv.gz --low_mem -r'
subprocess.run(cmd, shell=True) # submit the command to shell
CompletedProcess(args='python3 ashure.py prfg -fq fastq/*.fq -fs 500-3000 -p primers.csv -o pseudo_ref_bad.csv.gz --low_mem -r', returncode=0)
# load the pseudo reference sequences
df_prf1 = pd.read_csv('pseudo_ref.csv.gz')
df_prf2 = pd.read_csv('pseudo_ref_bad.csv.gz')
# add length information
df_prf1['length'] = [len(i) for i in df_prf1['sequence']]
df_prf2['length'] = [len(i) for i in df_prf2['sequence']]
# plot the length distribution
plt.figure(1, figsize=(10,3))
plt.hist(df_prf2['length'], bins=np.arange(0,3000,10), color='red', alpha=0.5, label='500-3000bp')
plt.hist(df_prf1['length'], bins=np.arange(0,3000,10), color='blue', alpha=0.5, label='500-1200bp')
plt.legend()
plt.xlabel('sequence length (bp)')
plt.ylabel('frequency (count)')
plt.yscale('log')
plt.show()
With larger bp size windows, concatemers will show up as peaks at around 1500bp and 2400bp. These concatemers are sometimes undesirable because they interfer with proper dereplication of an RCA read. In special cases, concatemers could be desirable because the gene of interest has many repetitive domains such as ankyrin proteins. We leave these decisions up to the user as you are the expert on the gene you are sequencing.
Filtering and concatenation of reads by size is fairly straight forward. An example is provided below.
x = df_prf2[(df_prf2['length'] > 700) & (df_prf2['length'] < 1000)] # get sequences between 700-1000bp
y = df_prf2[(df_prf2['length'] > 1200) & (df_prf2['length'] < 1600)] # get sequences between 1200-1600bp
df = pd.concat([x,y]) # concatenate the dateframes
df.to_csv('example.csv.gz', compression='infer', index=False) # write the data to csv
Compacting and dereplicating the pseudo reference can significantly speed up run time without hurting accuracy and sensitivity. This is possible because the pseudo reference contains redundant haplotype sequences. Remember that exact matches are not required. The pseudo reference is meant to be a fuzzy representation of the overall dataset such that genes of interest can quickly be found.
The code snippet below shows how to align the pseudo reference against the true reference database for mock50 samples and reveal the redundancy of information in the pseudo reference.
# load true reference and pseudo reference databases
df_ref = bpy.read_fasta('ref_db.fa')
df_prf = pd.read_csv('pseudo_ref.csv.gz')
# align pseudo reference against the mock50
df_acc = bpy.run_minimap2(df_prf, df_ref, config='-k8 -w1', cleanup=True)
df_acc = bpy.get_best(df_acc,['query_id'], metric='AS', stat='idxmax')
pid[83575] 2020-10-09 12:34:00.800 INFO: Making directory ./minimap2/
run_minimap2()
is a wrapper function to minimap2 in bilge_pype.py
. To get more help, type help(bpy.run_minimap2)
in python shell.
get_best()
is a helpful function to filter to the best alignment
Alignment of fasta or csv files against other fasta or csv file can also be invoked using get_accuracy.py
. This script depends on bilge_pype.py
. Below is an example usage:
./get_accuracy.py -h # get help
./get_accuracy.py -i pseudo_ref.csv.gz -o acc.csv.gz # input pseudo_ref.csv.gz and output to acc.csv.gz
def plot_acc(df_acc, haps):
data = df_acc[df_acc['database_id'].isin(haps)]
data = data.sort_values(by = ['database_id'])
data['error'] = 1-data['similarity']
# subsample to data to make swarmplot work
df_sub = bpy.stats_subsample(data, col='database_id', N=100)
# split into subplots swarm plot don't look crowded
slist = np.array_split(haps,2)
for i in range(0, len(slist)):
plt.subplot(1,2,i+1)
d = data[data['database_id'].isin(slist[i])]
cmap = plt.cm.get_cmap('Dark2').colors
bpy.mpl_box_whisker(plt.gca(), d[['database_id','error']],text_y=0.9, counts=True, cmap=cmap)
bpy.mpl_violin(plt.gca(), d[['database_id','error']], face_color='white', edge_color='black')
d = df_sub[df_sub['database_id'].isin(slist[i])]
sns.swarmplot(x='error', y='database_id', data=d, alpha=1, color='Black', s=2)
plt.ylabel('')
if i == 0: plt.ylabel('Haplotypes')
plt.xlim([0,1])
plt.tight_layout()
# get the names of the mock50 haplotypes
haps = df_ref[['HAP' in i for i in df_ref['id']]]['id'].values
# make a violin/swarm plot
plt.figure(1, figsize=(10,5))
plot_acc(df_acc, haps)
plt.show()
/usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 10.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 13.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 16.3% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.2% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 19.4% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 19.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 11.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 7.9% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 41.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 37.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 44.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 42.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 38.0% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning) /usr/local/lib/python3.8/site-packages/seaborn/categorical.py:1296: UserWarning: 15.6% of the points cannot be placed; you may want to decrease the size of the markers or use stripplot. warnings.warn(msg, UserWarning)
The violin/swarm plot above shows that some haplotypes are represented by 1 sequence, while some are represented by several hundred sequences. Compute time is waste on aligning every fastq read to every closely related pseudo reference sequence.
The ideal pseudo reference database for this dataset is around 50 sequences or one sequence for each mock50 sample. The pseudo reference can be compacted by invoking the clst
module in ashure.py
. The clst
module uses OPTICS density clustering to order, label, merge, and return the representative center sequences for each haplotype. OPTICS is an unsupervised clustering algorithm that uses density to flag potential sequence clusters and cluster centers.
cmd = 'python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r' # runs clustering for 10 iterations
subprocess.run(cmd, shell=True)
CompletedProcess(args='python3 ashure.py clst -i pseudo_ref.csv.gz -o pseudo_clst.csv.gz -iter 10 -r', returncode=0)
# load the data
df_prfc = pd.read_csv('pseudo_clst.csv.gz')
df_prf = pd.read_csv('pseudo_ref.csv.gz')
df_prf['length'] = [len(i) for i in df_prf['sequence']]
df_prfc['length'] = [len(i) for i in df_prfc['sequence']]
# align pseudo reference against the mock50
df_acc2 = bpy.run_minimap2(df_prfc, df_ref, config='-k8 -w1', cleanup=True)
df_acc2 = bpy.get_best(df_acc2,['query_id'], metric='AS', stat='idxmax')
plt.figure(1, figsize=(10,5))
plot_acc(df_acc2, haps)
plt.figure(2, figsize=(12,3))
plt.hist(df_prf['length'], bins=np.arange(0,3000,10), color='red', label='unclustered')
plt.hist(df_prfc['length'], bins=np.arange(0,3000,10), color='blue', label='cluster centers')
plt.legend()
plt.xlabel('sequence length (bp)')
plt.ylabel('frequency (count)')
plt.yscale('symlog')
plt.show()
pid[83575] 2020-10-09 12:50:04.440 INFO: Making directory ./minimap2/
print('clst module produced ',len(df_prfc),' cluster centers')
print('original pseudo reference size = ',len(df_prf))
n1 = len(np.unique(df_acc[df_acc['database_id'].isin(haps)]['database_id']))
n2 = len(np.unique(df_acc2[df_acc2['database_id'].isin(haps)]['database_id']))
print('haplotypes before: ', n1)
print('haplotypes after: ', n2)
clst module produced 58 cluster centers original pseudo reference size = 5600 haplotypes before: 42 haplotypes after: 33
Clustering removed many of the redundant sequences. Although some rare haplotypes may have been lost, we retain the majority of the interesting sequences which could still be in our sample.
For some sequences, the accuracy of the pseudo reference entry improves because merging via multi-alignment averages out the sequencing errors. For other sequences, the accuracy seems to have decreased because wrong sequences were merged together.
More on this will be elaborated in the next section and in clustering.ipynb
To run the remainder of ashure.py
with this compressed pseudo reference database, an example is shown below:
./ashure.py -h # get help
./ashure.py run -fq fastq/* -p primers.csv -db pseudo_clst.csv.gz -o1 cons.csv.gz --low_mem # run the whole pipeline
Sometimes rare haplotypes are missed in the first run of ashure.py
. This could be because rare haplotypes were not included in the pseudo reference database and were sufficiently distinct from found haplotypes that minimap2 did not find good alignments. Rare haplotypes could be recovered by re-examing the reads which did not yield any hits, blasting some of these reads, and/or adding some of these reads into the pseudo reference database.
The pseudo reference database does not need to be perfect. The sequences only need to approximately match your gene of interest. If the rare gene is wholly missed in the first run, ashure.py
can be rerun on reads which did not strongly map to anything.
The similarity relationships between sequences can be visualized via a dendrogram, pairwise matrix, or tsne plot. In this section, we show how functions in bilge_pype.py
can help you generate pairwise distance data and make tsne plots to visualize your sequence data.
In the code snippet below, pseudo reference sequences, cluster center sequences of the pseudo reference, and the mock50 reference sequences are loaded and concatenated. Pairwise alignments are computed via run_minimap2()
and saved to a csv file called pseudo_pw.csv.gz
.
def compute_pw(df):
# note that -D in config remove entries for diagonal alignments.
config = '-k15 -w10 -p 0.9 -D --dual=no'
df_pw = bpy.run_minimap2(df, df, config=config, cleanup=True)
# get the best alignment
return bpy.get_best(df_pw, ['query_id','database_id'], metric='AS', stat='idxmax')
# load data and concatenate dataframes
col = ['id','sequence']
df_prf = pd.read_csv('pseudo_ref.csv.gz')[col]
df_cl = pd.read_csv('pseudo_clst.csv.gz')[col]
df_ref = bpy.read_fasta('ref_db.fa')[col]
# use only mock50
df_ref = df_ref[[('HAP' in i) for i in df_ref['id']]]
df = pd.concat([df_prf, df_cl, df_ref])
df_pw = compute_pw(df[['id','sequence']])
df_pw.to_csv('pseudo_pw.csv.gz', index=False, compression='infer') # save the data
pid[90394] 2020-10-09 22:37:49.082 INFO: Making directory ./minimap2/
Dendrograms are not always suitable for viewing sequence relationships. Complicated relationships such as evenly spaced points on a sphere are not captured by dendrograms. Neighborhood embeddings such as TSNE are better at preserving high dimensional relationships. TSNE projects high dimensional relations onto low dimensional space and tries to minimize the divergence in neighborhood structure between the low dimension and high dimension representations of the data.
The code snippet below shows how to apply TSNE to the pseudo reference and obtain a 2D representation of the relationships between sequences. HDBSCAN is used to flag clusters in 2D space. Data is returned as a pandas dataframe and saved to pseudo_tnse.csv.gz
.
def get_cluster(dist, metric='precomputed'):
# runs tsne on distance matrix
tsne = bpy.run_TSNE(dist, metric=metric)
# run hdbscan on tsne output
hdb = bpy.cluster_HDBSCAN(tsne, metric='euclidean', min_samples=20, min_cluster_size=10)
hdb = hdb.rename(columns={'cluster_id':'hdbscan_id','ordering':'hdbscan_order'})
return tsne.merge(hdb, on='id', how='left')
def get_dist_matrix(df):
# extract the pairwise distance matrix and use match_score as the metric
# match score is the tot_matching_bp/seq_len
m = bpy.get_feature_vector(df[['query_id','database_id','similarity']], symmetric=True)
# make this matrix symmetric
m = bpy.get_symmetric_matrix(m, sym_larger=False)
# invert similarity to distance
for i in range(0,len(m)):
m.iloc[:,i] = 1-m.iloc[:,i]
m.iat[i,i] = 0 # set diagonal values to zero
return m
# load pairwise data
df = pd.read_csv('pseudo_pw.csv.gz')
# get the distance matrix
m = get_dist_matrix(df)
# perform tsne, hdscan, and optics on the data
tsne = get_cluster(m, metric='precomputed')
tsne.to_csv('pseudo_tsne.csv.gz', compression='infer', index=False)
pid[90394] 2020-10-09 22:38:04.053 INFO: running sklearn tsne with n_comp = 2 pid[90394] 2020-10-09 22:39:09.663 INFO: Running HDBSCAN pid[90394] 2020-10-09 22:39:09.874 INFO: getting and ordering the results pid[90394] 2020-10-09 22:39:09.894 INFO: number of clusters = 3 pid[90394] 2020-10-09 22:39:09.894 INFO: unclustered = 7
The code snippet below aligns the pseudo reference against the pseudo cluster centers and true reference. This information lets us colorize the scatter plots and interpret how well OPTICS performed.
# load sequence data
prf = pd.read_csv('pseudo_ref.csv.gz')
prf['source'] = 'samples'
ref = bpy.read_fasta('ref_db.fa')
# use only mock50
ref = ref[[('HAP' in i) for i in ref['id']]]
ref['source'] = 'reference'
ref['ref_id'] = ref['id']
clst = pd.read_csv('pseudo_clst.csv.gz')
clst['source'] = 'clst'
clst['clst_id'] = clst['id']
# align reference database to pseudo reference and pseudo reference clusters to pseudo reference
A = bpy.run_minimap2(prf, ref, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
A = bpy.get_best(A,['id'],metric='match_score',stat='idxmax')
B = bpy.run_minimap2(prf, clst, config='-k8 -w1', cleanup=True).rename(columns={'query_id':'id'})
B = bpy.get_best(B,['id'],metric='match_score',stat='idxmax')
A = A.rename(columns={'database_id':'ref_id','match_score':'ref_match'})
B = B.rename(columns={'database_id':'clst_id','match_score':'clst_match'})
# merge alignment data
prf = prf.merge(A[['id','ref_id','ref_match']], on='id', how='left')
prf = prf.merge(B[['id','clst_id','clst_match']], on='id', how='left')
df = pd.concat([prf,ref,clst])
# merge into tsne
tsne = pd.read_csv('pseudo_tsne.csv.gz')
tsne = tsne.merge(df, on='id', how='left')
# add colors to ref_id
col = 'ref_id'
rid = np.unique(tsne[col].dropna())
cmap = bokeh.palettes.Category20b_20
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'ref_id_color'])
tsne = tsne.merge(colors,on='ref_id',how='left')
# add colors to clst_id
col = 'clst_id'
rid = np.unique(tsne[col].dropna())
colors = [cmap[i%len(cmap)] for i in range(0,len(rid))]
colors = pd.DataFrame(np.transpose([rid,colors]), columns=[col,'clst_id_color'])
tsne = tsne.merge(colors,on='clst_id',how='left')
tsne = tsne.drop(columns=['sequence','quality'])
pid[90463] 2020-10-09 22:44:30.125 INFO: Making directory ./minimap2/ pid[90463] 2020-10-09 22:44:34.855 INFO: Making directory ./minimap2/
TOOLTIPS = [('id','@id'),
('ref_id', '@ref_id'),
('ref_match','@ref_match'),
('clst_id','@clst_id'),
('clst_match','@clst_match'),
('hdbscan_id','@hdbscan_id'),
('optics_id','@optics_id')]
p = bokeh.plotting.figure(output_backend='webgl',plot_width=600, plot_height=500, tooltips=TOOLTIPS)
d = tsne[tsne['source']=='samples']
# plot samples colored by clst_id
p.circle('f_0', 'f_1', color='clst_id_color', size=5, source=d)
# plot samples colored by ref_id
#p.circle('f_0', 'f_1', color='ref_id_color', size=5, source=d)
# plot cluster centers
d = tsne[tsne['source']=='clst']
p.square('f_0', 'f_1', line_color='black', fill_color='red', size=6, source=d)
d = tsne[tsne['source']=='reference']
p.diamond('f_0', 'f_1', line_color='black', fill_color='blue', size=10, source=d)
bokeh.plotting.output_file('../docs/pages/ashure/pseudo_tsne_1.html')
bokeh.plotting.save(p)
#bokeh.plotting.show(p)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/pseudo_tsne_1.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/pseudo_tsne_1.html'
The above code snippet colorizes each sequence by the de novo cluster obtained from ashure.py
. Blue diamonds denote the position of reference haplotype sequences. Red squares denote the position of de novo cluster centers from ashure.py
The following code snippet colorizes sequences by percent match to the haplotype sequence.
TOOLTIPS = [('id','@id'),
('ref_id', '@ref_id'),
('ref_match','@ref_match'),
('clst_id','@clst_id'),
('clst_match','@clst_match'),
('hdbscan_id','@hdbscan_id'),
('optics_id','@optics_id')]
p = bokeh.plotting.figure(output_backend='webgl', plot_width=600, plot_height=500, tooltips=TOOLTIPS)
# plot samples colored by alignment score
d = tsne[tsne['source']=='samples']
mapper = bokeh.transform.linear_cmap(field_name='ref_match', palette=bokeh.palettes.Turbo256, low=0.8, high=1)
color_bar = bokeh.models.ColorBar(color_mapper=mapper['transform'], width=8, location=(0,0))
p.circle('f_0', 'f_1', color=mapper, size=5, source=d)
# plot cluster centers
d = tsne[tsne['source']=='clst']
p.square('f_0', 'f_1', line_color='black', fill_color='red', size=6, source=d)
d = tsne[tsne['source']=='reference']
p.diamond('f_0', 'f_1', line_color='black', fill_color='blue', size=10, source=d)
p.add_layout(color_bar, 'right')
bokeh.plotting.output_file('../docs/pages/ashure/pseudo_tsne_2.html')
bokeh.plotting.save(p)
#bokeh.plotting.show(p)
INFO:bokeh.io.state:Session output file '../docs/pages/ashure/pseudo_tsne_2.html' already exists, will be overwritten.
'/Users/zchen/Desktop/ashure/docs/pages/ashure/pseudo_tsne_2.html'
The sequence of each de novo cluster center can be obtained using the code snippet below. Feel free to blast these to check their alignment against the reference.
clst[clst['id']=='cluster10'].values
array([['cluster10', 'TGGCTCGGGTGTCTCAATGATGGAGATTACTGTAAACTTCAGGGTGACCAAAGAATCAGAATAAGTGTTGGTAAAGAATTGGGTCTCCTCCTCCAGCAGGGTCAAAAATGAGGTATTTAAGTTTCGGTCTGTTAAAAGTATAGTGATAGCTCCAGCTAAAACAGGGAGTGACAAAGCAATAGTACAGCTGTAATAACAACTGATCATACAAATAAAGGTATTCGGTCAAAAGTAATTCCATTTGATCGTATATTAATAATAGTTGTAATAAAATTTACAGCTCCTAAAATAGAGGAAATTCCGGCTAAATGTAAAAAAAAATAGCTAAATCAACGGAAGCTCCGGCATGAGCAATTCCAGAGGATAAAGGAGGGTAGACTGTTCATCCTGTTCCAGCTCCTGCTTCTACTATACTACTTGCTAATAAAAAGAGTTAAAGAAGGAGGTAACATTCAAAAACTTATATTATTTATTCGAGGAAAGGCCATATCAGGAGCTCCTAATATTAAAGGCACTAATCAATTTCCAAATCCCCAATTATAATAGGTATTACTATAAAAAAAATTATGACAAAAGCATGAGCTGTAACAATAACATTATAAATTTGATCATCACCAATAAGGATCCGGGATGACCTAATTCAGCTCGAATTAGTATTCTAAGAGAAGTTCCTACTATTCCTGCTCAAGCTCCAAAATGAAGTATAAAGTTCCGATGTCTTTATGATTTGTTGACCGTTCACCCGCGTAACTACCAAGTAATACCACG', False, 'clst', 'cluster10']], dtype=object)
c = [('1a27a9' in rid) for rid in prf['id']]
prf[c].values
array([['1a27a932-c851-4f13-9853-c55a56e50ba3_rs207_1064', 'TGGCTCGGGGTGTCTCCAATGATGGAGATTACCAAACTTCAGGTGACCAAAAACCAAAACAGTTATATAATACGGACACCTCCGATTGAGGGGTCAAAAATGAAGTATTTAAATTTCTATCAGTTAACAACCAGTAATAGCAACGCCAGCCAATACAGTAAAACTTGCATTAATAATAAAAAAAGCTGTACTAAACACTGACCATACAAATAAAGGTAATTTATGGAAAGTTAATCCCGGTGCTCGGCATTATAGACAGTAATAAAAAAATTCAATTGAACCTAATAATGCATTTTAGAAAATAGCCAAATGTGTAAACCAAAAATAGCGCTAAATCAACAGAAGGTCCTGAATGCGCTTGAACACTAGATAATGGTGGATATACAGTCCAACCTGTACCTGCAATCTCAGATTCTACACAGTTGATGAAACTAATAAAAAGTGATGGAGGTAATAACCAAAACTAATATTATTCATTCTAGGAAAAGCCATCTGGAGCACCTTAACATTAAAGGAATAAACCAATTACCAAAACCCCAATTAAAATAGGCATAACCATAAAAAAATATCATAATAAAAAAAGCATGAGCAGTTACTACAACATTATATAATTGATGATTTCCCCATAAAAAATTTGATTTCCAGGTTGTGCTAATTCCATACGGATTAATACAGATACGTAGTACCCACTACACCTCGAAAAAAGCAATCTAATGTTAGGTGCTAATTCCATACGGATTAATACAGATAATGTAGTACCTACTACACCCGAAAAAGCACCAAAATCATATATAGAGTACCGATATCTTTATGGATTTGTTGACCGTTCACCCGCGTAACTACCAAGTAATACCACG', "-13---(,01*12'%'*,4+3433521+.)&$$&0/'(*'%'(%)64-+587)0-,32)&%#$'&)('.*(0%3*)&$%'%)(%&&&$%%*-/(*)778*4363/-/68255,/3,&%'&''+(.'.##'$&)'&1*%&$#&*+,,&)+)+*+'%$%%$(300*+'%$,%'(.%(4(/6782/,--*)%)$$-7/**-(-44/1)*)12)')*&%'')(++(&%')'62&(+),(,*./0.&%$%#$%++0,22/0&-&%',&+783+&())%&+(*-131(*1()*.'&%((323-/,0.*$%$$&+0+''())47/4,,885*-(&&(($'..&*)-%&*/635675067111***+001425.,*.**,.*2.-1-42001-132-/412)+,+),'&&$)+%&&)-.+-()*)&+%))$+-21038122631,32,+&'(2224/2++5+.7,-'-731),-/)*+5-632584+**2*252*()$'&$$%0(+*/$'(.20.,/,)/1(,74078.1-,+*-.-+,60+.76-/,.,,)(&'&.-/..34+21/*4885(-+/,,+*,/+,85-(%),,6.10/,.)(%/$',*31*424330.1.0/43/**-,'*/.--*.68850&1+*'*+++0335621..*,*4+2321/00312)-(-(,57,,*/(-+4/-,/0((%&&#&$'$%**266,*$$&/(*,5.0/1+),+(((&(./+2+4+**/)2+4+/+046-+,1-+*(*(%*)+/0/15('',,/*466,*++.-3880-42235555220-01(53301()-('(+-.27300.427342/+*+41/./-0)(',&%'.&&(*'*-+*(&", 'samples', 'HAP39', 0.6971326164874552, 'cluster2', 0.8320090805902384]], dtype=object)