!pwd

/Users/etretiakov/src/GlobalStructure


%load_ext watermark


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


import hdbscan


from pathlib import Path


data = pd.read_csv('1_Raw/MitoBreakDB_12122019.csv')


data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1313 entries, 0 to 1312
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Deletions                        1313 non-null   object
 1   5' breakpoint                    1313 non-null   object
 2   3' breakpoint                    1313 non-null   object
 3   Deletion length - bp             1313 non-null   object
 4   Deletion of replication origins  1313 non-null   object
 5   Location of the deleted region   1313 non-null   object
 6   Single mtDNA deletions           153 non-null    object
 7   Multiple mtDNA deletions         331 non-null    object
 8   Healthy tissues                  308 non-null    object
 9   Parkinson Disease                26 non-null     object
 10  Inclusion Body Myositis          48 non-null     object
 11  Tumour                           94 non-null     object
 12  Other clinical features          558 non-null    object
 13  References                       1313 non-null   object
dtypes: object(14)
memory usage: 143.7+ KB


data.head()


breakpoints = data[["5' breakpoint", "3' breakpoint"]][:-1].copy()


breakpoints.columns = ['five', 'three']

breakpoints.five = breakpoints.five.astype(int)
breakpoints.three = breakpoints.three.astype(int)

breakpoints.loc[breakpoints.five < 110, 'five'] += 16569
breakpoints.loc[breakpoints.three < 110, 'three'] += 16569

breakpoints = breakpoints[(breakpoints.five > 5781)&(breakpoints.three > 5781)]


plt.figure(figsize=(10,7))
sns.scatterplot(data=breakpoints, x='five', y='three')

<Axes: xlabel='five', ylabel='three'>


def clustering(breakpoints, graph_fn = 'clusters.svg'):
    clusterer = hdbscan.HDBSCAN(cluster_selection_epsilon = 50
                            #, cluster_selection_method='leaf'
                            , min_samples=10                            
                           )
    clusters  =clusterer.fit_predict(breakpoints)
    colors = sns.color_palette('hls', len(np.unique(clusters))-1)
    sns.set_style('white')
    plt.figure(figsize=(10,7))
    plt.gca().invert_yaxis()
    x = breakpoints.five[clusters == -1]
    y = breakpoints.three[clusters == -1]
    plt.scatter(x, y, color='black', alpha=0.5, s=5)

    for c, color in list(zip(range(0, np.max(clusters)+1), colors)):
        x = breakpoints.five[clusters == c]
        y = breakpoints.three[clusters == c]
        plt.scatter(x, y, color=color, s=100)
    plt.xlabel("5'")
    plt.ylabel("3'")
    plt.title('Clusters')
    plt.savefig( graph_fn)# 
    return breakpoints, clusters


figures_path = Path('3_Results/')


data, clusters = clustering(breakpoints.copy(), str(figures_path / 'clusters_with_spike.svg'))


for c in np.unique(clusters[clusters >=0]):
    cluster_data = data[clusters == c]    
    
#     f, ax = plt.subplots(figsize=(6, 6))
#     sns.kdeplot(cluster_data.five, cluster_data.three, ax=ax)
#     #sns.jointplot(cluster_data.five, cluster_data.three, ax=ax)
#     sns.rugplot(cluster_data.five, color="g", ax=ax)
#     sns.rugplot(cluster_data.three, vertical=True, ax=ax);
    g = sns.jointplot(x="five", y="three", data=cluster_data, kind="kde")
    g.ax_joint.invert_yaxis()
    g.plot_joint(plt.scatter, c="r", s=30, linewidth=1, marker="+")
    g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$5'$", "$3'$")    
    g.fig.suptitle('Cluster №{}:'.format(c+1), y= 1)
    plt.show()


data['c'] = clusters
cluster_stats = pd.concat([data[data.c >=0].groupby('c').mean(), data[data.c >=0].groupby('c').size()], axis=1)
cluster_stats.columns = ['five', 'three', 'size']
cluster_stats.to_csv(str(figures_path / '..'/ '3_Results'/ 'cluster_spike_stats.csv'), index_label=None)


cluster_stats


data.c.value_counts()

-1     579
 6      59
 7      43
 16     42
 4      32
 15     29
 23     29
 13     29
 3      22
 17     19
 22     18
 8      17
 21     17
 11     15
 2      15
 19     14
 20     13
 5      12
 14     12
 1      12
 12      9
 9       9
 0       5
 10      5
 18      5
Name: c, dtype: int64


top5 = data[data.c >=0].c.value_counts().sort_values(ascending = False)[:5]


colors = ["windows blue", "amber", "hot pink", "faded green", "neon blue"]
colors = sns.xkcd_palette(colors)
plt.figure(figsize=(10,7))
plt.gca().invert_yaxis()
rest_data = data[~data.c.isin(top5.index)]
plt.scatter(rest_data.five, rest_data.three, c='black', alpha=0.2)
for (cluster, cdata), color in zip(data[data.c .isin(top5.index)].groupby('c'), colors):
    plt.scatter(cdata.five, cdata.three, c = color, s =100)

plt.savefig(figures_path/ '5top_clusters.svg')

/var/folders/zs/11_d9tgx0495nq5hmmxwdl0c0000gn/T/ipykernel_42852/1184187372.py:8: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
  plt.scatter(cdata.five, cdata.three, c = color, s =100)


cc = []
for (cluster, cdata), color in zip(data[data.c .isin(top5.index)].groupby('c'), ["windows blue", "amber", "hot pink", "faded green", "neon blue"]):
    print(f'{color} - {cdata.five.min()} - {cdata.three.max()}')

windows blue - 8933 - 14158
amber - 6063 - 14420
hot pink - 7322 - 14135
faded green - 7562 - 15761
neon blue - 6961 - 16080


data.to_csv(figures_path / '..'/ '3_Results'/'data_clusters.csv', index_label=None)


counts, bins = np.histogram(breakpoints.three, bins=100)
counts, bins = np.histogram(breakpoints[(breakpoints.three > bins[np.argmax(counts)])&(breakpoints.three < bins[np.argmax(counts)+2])].three, bins=20)
breakpoints = breakpoints[
    (breakpoints.three < bins[np.argmax(counts)]) 
    | (breakpoints.three >  bins[np.argmax(counts)+2])]


data2, clusters2 = clustering(breakpoints.copy(), str(figures_path / 'clusters.svg'))


data2['c'] = clusters2

cluster_stats = pd.concat([data2[data2.c >=0].groupby('c').mean(), data2[data2.c >=0].groupby('c').size()], axis=1)

cluster_stats.columns = ['five', 'three', 'size']

cluster_stats.to_csv(str(figures_path / '..'/ '3_Results'/ 'cluster_stats.csv'), index_label=None)


cluster_stats


top5 = data2[data2.c >=0].c.value_counts().sort_values(ascending = False)[:5]


top5

6     60
14    56
13    51
7     43
3     32
Name: c, dtype: int64


colors = ["windows blue", "amber", "hot pink", "faded green", "neon blue"]
colors = sns.xkcd_palette(colors)
plt.figure(figsize=(10,7))
plt.gca().invert_yaxis()


rest_data = data2[~data2.c.isin(top5.index)]
plt.scatter(rest_data.five, rest_data.three, c='black', alpha=0.4, s=5)
for (cluster, cdata), color in zip(data2[data2.c .isin(top5.index)].groupby('c'), colors):
    plt.scatter(cdata.five, cdata.three, c = color, s=70, alpha=0.4)
plt.xlabel("5'")
plt.ylabel("3'")

plt.savefig(figures_path/ '5top_clusters_no_spike.svg')

/var/folders/zs/11_d9tgx0495nq5hmmxwdl0c0000gn/T/ipykernel_42852/1977634396.py:10: UserWarning: *c* argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with *x* & *y*.  Please use the *color* keyword-argument or provide a 2D array with a single row if you intend to specify the same RGB or RGBA value for all points.
  plt.scatter(cdata.five, cdata.three, c = color, s=70, alpha=0.4)


colors2 = sns.color_palette('hls', len(np.unique(clusters2))-1)
for i in np.arange(5):
    colors2[top5.index.sort_values().to_list()[i]] = colors[i]
sns.set_style('white')
plt.figure(figsize=(10,7))
plt.gca().invert_yaxis()
x = data2.five[clusters2 == -1]
y = data2.three[clusters2 == -1]
plt.scatter(x, y, color='black', alpha=0.4, s=5)

for c, color in list(zip(range(0, np.max(clusters2)+1), colors2)):
    x = data2.five[clusters2 == c]
    y = data2.three[clusters2 == c]
    plt.scatter(x, y, color=color, s=50, alpha=0.6)
plt.xlabel("5'")
plt.ylabel("3'")
plt.title('Clusters')
plt.savefig( figures_path/ 'clusters_5_colours2_no_spike.svg')


colors3 = sns.color_palette('husl', len(np.unique(clusters2))-1)
for i in np.arange(5):
    colors3[top5.index.sort_values().to_list()[i]] = colors[i]
sns.set_style('white')
plt.figure(figsize=(10,7))
plt.gca().invert_yaxis()
x = data2.five[clusters2 == -1]
y = data2.three[clusters2 == -1]
plt.scatter(x, y, color='black', alpha=0.4, s=5)

for c, color in list(zip(range(0, np.max(clusters2)+1), colors3)):
    x = data2.five[clusters2 == c]
    y = data2.three[clusters2 == c]
    plt.scatter(x, y, color=color, s=50, alpha=0.6)
plt.xlabel("5'")
plt.ylabel("3'")
plt.title('Clusters')
plt.savefig( figures_path/ 'clusters_5_colours3_no_spike.svg')


colors4 = sns.color_palette('pastel', len(np.unique(clusters2))-1)
for i in np.arange(5):
    colors4[top5.index.sort_values().to_list()[i]] = colors[i]
sns.set_style('white')
plt.figure(figsize=(10,7))
plt.gca().invert_yaxis()
x = data2.five[clusters2 == -1]
y = data2.three[clusters2 == -1]
plt.scatter(x, y, color='black', alpha=0.4, s=5)

for c, color in list(zip(range(0, np.max(clusters2)+1), colors4)):
    x = data2.five[clusters2 == c]
    y = data2.three[clusters2 == c]
    plt.scatter(x, y, color=color, s=50, alpha=0.6)
plt.xlabel("5'breakpoint")
plt.ylabel("3'breakpoint")
plt.title('Clusters')
plt.savefig( figures_path/ 'clusters_5_colours_no_spike.svg')


for c,name in zip(colors,["windows blue", "amber", "hot pink", "faded green", "neon blue"]):
    print(f'{name} - {int(c[0]*255)}, {int(c[1]*255)}, {int(c[2]*255)}')

windows blue - 55, 120, 191
amber - 254, 179, 8
hot pink - 255, 2, 141
faded green - 123, 178, 116
neon blue - 4, 217, 255


cc = []
for (cluster, cdata), color in zip(data2[data2.c .isin(top5.index)].groupby('c'), ["windows blue", "amber", "hot pink", "faded green", "neon blue"]):
    print(f'{color} - {cdata.five.min()} - {cdata.three.max()}')

windows blue - 8933 - 14158
amber - 5978 - 14420
hot pink - 7322 - 14135
faded green - 7322 - 15761
neon blue - 8298 - 16143


for c, color in list(zip(range(0, np.max(clusters2)+1), colors3)):
    cluster_data = data2[clusters2 == c]    
    
#     f, ax = plt.subplots(figsize=(6, 6))
#     sns.kdeplot(cluster_data.five, cluster_data.three, ax=ax)
#     #sns.jointplot(cluster_data.five, cluster_data.three, ax=ax)
#     sns.rugplot(cluster_data.five, color="g", ax=ax)
#     sns.rugplot(cluster_data.three, vertical=True, ax=ax);
    g = sns.jointplot(x="five", y="three",
                      data=cluster_data, kind="kde",
                      alpha=0.8)
    g.ax_joint.invert_yaxis()
    g.plot_joint(plt.scatter, color=color, s=50, alpha=0.6)
    g.ax_joint.collections[0].set_alpha(0)
    g.set_axis_labels("$5'$", "$3'$")    
    g.fig.suptitle('Cluster №{}:'.format(c+1), y= 1)
    plt.show()


data.to_csv(figures_path / '..'/ '3_Results'/'data_clusters_no_spike.csv', index_label=None)


%watermark

Last updated: 2023-04-02T20:57:08.088984+02:00

Python implementation: CPython
Python version       : 3.11.0
IPython version      : 8.7.0

Compiler    : Clang 14.0.6 
OS          : Darwin
Release     : 22.3.0
Machine     : arm64
Processor   : i386
CPU cores   : 8
Architecture: 64bit


%watermark -iv

numpy     : 1.24.2
matplotlib: 3.7.1
hdbscan   : 0.8.29
pandas    : 1.5.3
seaborn   : 0.12.2

	Deletions	5' breakpoint	3' breakpoint	Deletion length - bp	Deletion of replication origins	Location of the deleted region	Single mtDNA deletions	Multiple mtDNA deletions	Healthy tissues	Parkinson Disease	Inclusion Body Myositis	Tumour	Other clinical features	References
0	83:16095	83	16095	16011	OH and OL	Removing full minor and part o(...)	NaN	Patients with pathogenic MGME1(...)	NaN	NaN	NaN	NaN	NaN	[327]
1	105:112	105	112	6	None	Inside the major arc	NaN	NaN	Normal tissues (polymorphism)	NaN	NaN	NaN	NaN	[53-57]
2	114:12305	114	12305	12190	OH and OL	Removing full minor and part o(...)	NaN	Patients with pathogenic MGME1(...)	NaN	NaN	NaN	NaN	NaN	[327]
3	167:16223	167	16223	16055	OH and OL	Removing full minor and part o(...)	NaN	Patients with pathogenic MGME1(...)	NaN	NaN	NaN	NaN	NaN	[327]
4	174:3623	174	3623	3448	OH	Removing part of minor and maj(...)	NaN	NaN	NaN	NaN	NaN	NaN	Skeletal muscle biopsies and �(...)	[329]

	five	three	size
c
0	8026.800000	11441.200000	5
1	10906.083333	14036.916667	12
2	7771.266667	12376.466667	15
3	11015.590909	15476.818182	22
4	9449.875000	13827.406250	32
5	5816.833333	13936.916667	12
6	6384.372881	13952.406780	59
7	7642.093023	13696.395349	43
8	7300.235294	14468.470588	17
9	8506.222222	14035.222222	9
10	6373.200000	15513.000000	5
11	8460.000000	13465.800000	15
12	8248.222222	13413.000000	9
13	5811.793103	16071.379310	29
14	6153.166667	16072.083333	12
15	7906.724138	15512.517241	29
16	7327.071429	16053.357143	42
17	8580.631579	15663.473684	19
18	8884.000000	16070.200000	5
19	8644.142857	16078.500000	14
20	8398.076923	16072.000000	13
21	8259.882353	16069.411765	17
22	7812.666667	16073.500000	18
23	8004.206897	16072.896552	29

	five	three	size
c
0	10906.083333	14036.916667	12
1	11008.304348	15406.130435	23
2	7771.266667	12376.466667	15
3	9449.875000	13827.406250	32
4	6387.931034	15437.310345	29
5	5802.181818	13922.727273	11
6	6377.600000	13954.750000	60
7	7642.093023	13696.395349	43
8	7309.777778	14487.666667	18
9	8502.600000	14017.500000	10
10	7486.888889	15924.666667	9
11	8460.000000	13465.800000	15
12	8248.222222	13413.000000	9
13	7762.509804	15456.254902	51
14	8619.678571	15743.232143	56

Clusters before deletion of the "hairpin"¶

Plot clusters separately:¶

Statistics grouped by cluster¶

Clusters without the "hairpin":¶

Statistics grouped by cluster¶

Figure 2B. Top 5 clusters of deletions.¶

Plots all clusters for supplementary:¶

Plots clusters separately:¶

Save results¶