Obtaining scores for indels¶

Problem: When analysing indels in coding regions, we try to assing them the value of a stop. Some genes may not have any stop, or have a small amount of them. In such cases, we would like to infer a value from the rest of the scores in the element.

In this notebook we try to show the correlations we have found.

import bgdata
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy import stats
from gendas.engine import Gendas
from gendas.sources import TabixSource
from bokeh.io import output_notebook
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure, show

output_notebook()

bokeh_tools = "pan,box_zoom,resize,wheel_zoom,reset,previewsave,crosshair"

Analysis for CADD scores¶

cadd_conf = {
    'header': ['CHR', 'POS', 'REF', 'ALT', 'RAW', 'PHRED'],
    'ctypes': [str, int, str, str, float, float],
    'sequence': 'CHR',
    'begin': 'POS',
    'end': 'POS'
}

def cadd_stops_score(gd, row):
    values = list(gd['cadd']['PHRED'])
    mean = np.mean(values) if len(values) > 0 else 0
    stop_values = list(gd['cadd'].merge(gd['stops'], on=['REF', 'ALT'])['cadd']['PHRED'])
    stop_mean = np.mean(stop_values) if len(stop_values) > 0 else 0
    
    row['mean'] = mean
    row['stop_mean'] = stop_mean
    row['points'] = len(values)
    row['stop_points'] = len(stop_values)
    return row

CADD 1.0¶

gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.0')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])

query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
    data.append(r)
df = pd.DataFrame(data)


fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)

100%|██████████| 20098/20098 [1:17:21<00:00,  4.33it/s]

For CADD 1.0 we assume an exponenial fit

fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'].map(np.log))
A = np.exp(A)
fit_df['exp_reg']=fit_df['mean'].map(lambda x: A*np.exp(B*x))

print('{} * exp( {} * x), where x is the mean score in the gene'.format(A, B))

fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)

8.839715923111779 * exp( 0.08320104217076194 * x), where x is the mean score in the gene

CADD 1.3¶

gd = Gendas('data/gendas/gendas.conf', workers=8, progress=100)
cadd_file = bgdata.get_path('genomicscores', 'cadd', '1.3')
gd['cadd'] = TabixSource(cadd_file, sequence=cadd_conf['sequence'], begin=cadd_conf['begin'], end=cadd_conf['end'], header=cadd_conf['header'], ctypes=cadd_conf['ctypes'])

query = gd.groupby(gd['cds']['GENE']).aggregate(cadd_stops_score)
l = len(set(gd['cds']['GENE']))
data = []
for r in tqdm(query, total=l):
    data.append(r)
df = pd.DataFrame(data)


fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above")
fig.circle(source=ColumnDataSource(df), x='mean', y='stop_mean')
show(fig)

100%|██████████| 20098/20098 [1:17:09<00:00,  4.34it/s]

For CADD 1.3 we assume a linear fit

fit_df = df[df['stop_points'] > 2].copy()
B, A, r_value, p_value, std_err = stats.linregress(fit_df['mean'],fit_df['stop_mean'])

fit_df['exp_reg']=fit_df['mean'].map(lambda x: A+B*x)

print('{} + {} * x, where x is the mean score in the gene'.format(A, B))

fig = figure(width=600, plot_height=800, tools=bokeh_tools, toolbar_location="above", x_axis_label='gene mean score', y_axis_label='gene stops mean score')
fig.circle(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='stop_mean')
fig.line(source=ColumnDataSource(fit_df.sort_values('mean')), x='mean', y='exp_reg', color='green')
show(fig)

22.616169662767973 + 0.7382404441072176 * x, where x is the mean score in the gene