Problem: When analysing indels in coding regions, we try to assing them the value of a stop. Some genes may not have any stop, or have a small amount of them. In such cases, we would like to infer a value from the rest of the scores in the element.
In this notebook we try to show the correlations we have found.
import bgdata
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from gendas.engine import Gendas
from gendas.sources import TabixSource
from bokeh.io import output_notebook
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure, show
output_notebook()
bokeh_tools = "pan,box_zoom,resize,wheel_zoom,reset,previewsave,crosshair"
cadd_conf = {
'header': ['CHR', 'POS', 'REF', 'ALT', 'RAW', 'PHRED'],
'ctypes': [str, int, str, str, float, float],
'sequence': 'CHR',
'begin': 'POS',
'end': 'POS'
}
def cadd_stops_score(gd, row):
values = list(gd['cadd']['PHRED'])
mean = np.mean(values) if len(values) > 0 else 0
stop_values = list(gd['cadd'].merge(gd['stops'], on=['REF', 'ALT'])['cadd']['PHRED'])
stop_mean = np.mean(stop_values) if len(stop_values) > 0 else 0
row['mean'] = mean
row['stop_mean'] = stop_mean
row['points'] = len(values)
row['stop_points'] = len(stop_values)
return row
gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.0')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])
query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
data.append(r)
df = pd.DataFrame(data)
fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)
For CADD 1.0 we assume an exponenial fit
fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'].map(np.log))
A = np.exp(A)
fit_df['exp_reg']=fit_df['mean'].map(lambda x: A*np.exp(B*x))
print('{} * exp( {} * x), where x is the mean score in the gene'.format(A, B))
fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)
gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.3')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])
query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
data.append(r)
df = pd.DataFrame(data)
fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)
For CADD 1.3 we assume a linear fit
fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'])
fit_df['exp_reg']=fit_df['mean'].map(lambda x: A+B*x)
print('{} + {} * x, where x is the mean score in the gene'.format(A, B))
fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)