Obtaining scores for indels

Problem: When analysing indels in coding regions, we try to assing them the value of a stop. Some genes may not have any stop, or have a small amount of them. In such cases, we would like to infer a value from the rest of the scores in the element.

In this notebook we try to show the correlations we have found.

In [1]:
import bgdata
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from gendas.engine import Gendas
from gendas.sources import TabixSource
from bokeh.io import output_notebook
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure, show
In [2]:
output_notebook()
Loading BokehJS ...
In [3]:
bokeh_tools = "pan,box_zoom,resize,wheel_zoom,reset,previewsave,crosshair"

Analysis for CADD scores

In [4]:
cadd_conf = {
    'header': ['CHR', 'POS', 'REF', 'ALT', 'RAW', 'PHRED'],
    'ctypes': [str, int, str, str, float, float],
    'sequence': 'CHR',
    'begin': 'POS',
    'end': 'POS'
}
In [5]:
def cadd_stops_score(gd, row):
    values = list(gd['cadd']['PHRED'])
    mean = np.mean(values) if len(values) > 0 else 0
    stop_values = list(gd['cadd'].merge(gd['stops'], on=['REF', 'ALT'])['cadd']['PHRED'])
    stop_mean = np.mean(stop_values) if len(stop_values) > 0 else 0
    
    row['mean'] = mean
    row['stop_mean'] = stop_mean
    row['points'] = len(values)
    row['stop_points'] = len(stop_values)
    return row

CADD 1.0

In [6]:
gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.0')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])
In [7]:
query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
    data.append(r)
df = pd.DataFrame(data)


fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)
100%|██████████| 20098/20098 [1:17:21<00:00,  4.33it/s]

For CADD 1.0 we assume an exponenial fit

In [8]:
fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'].map(np.log))
A = np.exp(A)
fit_df['exp_reg']=fit_df['mean'].map(lambda x: A*np.exp(B*x))

print('{} * exp( {} * x), where x is the mean score in the gene'.format(A, B))

fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)
8.839715923111779 * exp( 0.08320104217076194 * x), where x is the mean score in the gene

CADD 1.3

In [9]:
gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.3')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])
In [10]:
query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
    data.append(r)
df = pd.DataFrame(data)


fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)
100%|██████████| 20098/20098 [1:17:09<00:00,  4.34it/s]

For CADD 1.3 we assume a linear fit

In [11]:
fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'])

fit_df['exp_reg']=fit_df['mean'].map(lambda x: A+B*x)

print('{} + {} * x, where x is the mean score in the gene'.format(A, B))

fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)
22.616169662767973 + 0.7382404441072176 * x, where x is the mean score in the gene
In [ ]: