Source code for oncodrivefml.store

"""
This module contains the methods used to store the results.

3 different types of output are available:

    - **tsv** file
    - **png** graph: uses the *tsv* file and matplotlib
    - **html** graph: uses the *tsv* file and bokeh
"""

import logging
import gzip
import os

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from bokeh import __version__ as bokeh_version
from bokeh.plotting import output_notebook
from bokeh.embed import components
from bokeh.plotting import figure, output_file, show, ColumnDataSource
from bokeh.models import HoverTool, PrintfTickFormatter, CustomJS, Circle
from bokeh.models.widgets.inputs import TextInput
from bokeh.layouts import column, widgetbox

from oncodrivefml import __logger_name__

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

logger = logging.getLogger(__logger_name__)



[docs]class QQPlot(object): """ QQPlot Args: input_file: tsv file with the data cutoff (bool): add cutoffs to the figure rename_fields (dict): column names from the input file can be renamed providing a dictionary {old_name : new_name} extra_fields (list): list of column names that want to be passed to the figure data. Need for example to search by them. """ def __init__(self, input_file, cutoff=True, rename_fields=None, extra_fields=None): basic_tools = "pan,box_zoom,wheel_zoom,reset,previewsave,crosshair" self.figure = figure(width=600, plot_height=600, tools=basic_tools, toolbar_location="above") # labels self.figure.xaxis.axis_label = 'Expected p-values' self.figure.yaxis.axis_label = 'Observed p-values' self.figure.axis.major_label_text_font_size = '14pt' #self.figure.xaxis[0].formatter = PrintfTickFormatter(format="%.1f") #%.0e #self.figure.yaxis[0].formatter = PrintfTickFormatter(format="%.1f") # 1e-%1.4f #self.figure.xaxis[0].major_label_orientation = pi / 4 self.layout = None self.__load_values(input_file, rename_fields=rename_fields) self.__create_basic_plot(extra_fields=extra_fields, cutoff=cutoff) def __load_values(self, input_file, rename_fields=None): """ Load the values for the plot as a :obj:`~pandas.DataFrame` Args: input_file: tsv file with data rename_fields (dict): pair of old column name - new column name """ self.data = pd.read_csv(input_file, header=0, sep="\t") if rename_fields is not None: self.data = self.data.rename(columns = rename_fields) def __create_basic_plot(self, extra_fields=None, cutoff = True): """ Creates the basic plot. Args: extra_fields (list): list of extra fields containen by each point cutoff (bool): add cutoff """ # Default settings min_samples = 2 min_pvalue = 10000 colors = ['royalblue', 'blue'] self.data['observed'] = self.data['pvalue'].map(lambda x: -np.log10(x) if x > 0 else -np.log10(1 / min_pvalue)) self.data['color'] = self.data['num_samples'].map(lambda x: colors[1] if x >= min_samples else colors[0]) self.data['alpha'] = self.data['num_samples'].map(lambda x: 0.7 if x >= min_samples else 0.3) self.data.sort_values(by=['observed'], inplace=True) exp_pvalues = -np.log10(np.arange(1, len(self.data) + 1) / float(len(self.data))) exp_pvalues.sort() self.data['expected'] = exp_pvalues dict_for_source = dict( x=self.data['expected'].tolist(), y=self.data['observed'].tolist(), color=self.data['color'].tolist(), alpha=self.data['alpha'].tolist(), pvalue=[str(x) for x in self.data["pvalue"]], qvalue=[str(x) for x in self.data["qvalue"]] ) if extra_fields is not None: for field in extra_fields: dict_for_source[field] = self.data[field].tolist() self._source = ColumnDataSource(data=dict_for_source) # Plot the first set of data if len(self.data['expected']) > 0: invisible_circle = Circle(x='x', y='y', fill_color='color', fill_alpha='alpha', line_color=None, size=10) visible_circle = Circle(x='x', y='y', fill_color='color', fill_alpha=0.9, line_color='red', size=10) self.glyph = self.figure.add_glyph(self._source, invisible_circle, selection_glyph=visible_circle, nonselection_glyph=invisible_circle) # Get the maximum pvalues (observed and expected) max_x = float(self.data[['expected']].apply(np.max)) max_y = float(self.data[['observed']].apply(np.max)) # Give some extra space (+-10%) max_x *= 1.1 # Give some extra space (+-20%) max_y *= 1.2 # Add a dashed diagonal from (min_x, max_x) to (min_y, max_y) self.figure.line(np.linspace(0, min(max_x, max_y)), np.linspace(0, min(max_x, max_y)), color='red', line_width=2, line_dash=[5, 5]) # Set the grid self.figure.grid.grid_line_alpha = 0.8 self.figure.grid.grid_line_dash = [6, 4] #cutoff if cutoff: # FDR for fdr_cutoff, fdr_color in zip((0.25, 0.1), ('green', 'red')): fdr = self.data[self.data['qvalue'] < fdr_cutoff]['observed'] if len(fdr) > 0: fdr_y = np.min(fdr) fdr_x = np.min(self.data[self.data['observed'] == fdr_y]['expected']) self.figure.line((fdr_x - max_x * 0.025, fdr_x + max_x * 0.025), (fdr_y, fdr_y), color=fdr_color, line_width=2)
[docs] def add_tooltip(self): """ Adds tooltip to show the parameters of each glyph in the figure """ tooltip = \ """ <div> <span style='font-size: 17px; font-weight: bold;'>@HugoID</span> <span style='font-size: 15px; color: #966;'>[@EnsemblID]</span> </div> <div> <span style='font-size: 15px;'>p, q-value</span> <span style='font-size: 10px; color: #696;'>(@pvalue, @qvalue)</span> </div> </br> """ self.figure.add_tools(HoverTool(tooltips=tooltip, mode='mouse', renderers=[self.glyph]))
[docs] def add_tooltip_enhanced(self): """ The tooltip is shown via JavaScript to avoid been block in areas with a high density of points """ tooltip = \ """ "<div> \\ <span style='font-size: 17px; font-weight: bold;'>\"+s.HugoID[index]+\"</span> \\ <span style='font-size: 15px; color: #966;'>[\"+s.EnsemblID[index]+\"]</span> \\ </div> \\ <div> \\ <span style='font-size: 15px;'>p, q-value</span> \\ <span style='font-size: 10px; color: #696;'>(\"+s.pvalue[index]+\", \"+s.qvalue[index]+\")</span> \\ </div> \\ </br>" """ code = \ """ if (cb_data.index['1d'].indices.length > 0) { var s = source.get('data'); if ( $( ".bk-tooltip.bk-tooltip-custom.bk-left" ).length == 0 ){ $( ".bk-canvas-overlays" ).append( '<div class="bk-tooltip bk-tooltip-custom bk-left" style="z-index: 1010; top: 0px; left: 0px; display: block;"></div>' ); } var inner = "<div><div><div>"; for(i in cb_data.index['1d'].indices){ index = cb_data.index['1d'].indices[i]; if (i > 2) break; inner = inner + """ + tooltip + """ } if (i>2) { inner = inner + "<div><span style='font-size: 15px; color: #009;'>TOTAL OF: " + cb_data.index['1d'].indices.length + "</span></div>"; } inner = inner + "</div></div></div>"; $('.bk-tooltip.bk-tooltip-custom.bk-left')[0].innerHTML = inner; $('.bk-tooltip.bk-tooltip-custom.bk-left').attr('style', 'left:' + (cb_data.geometry.sx+10) + 'px; top:' + (cb_data.geometry.sy-5-$('.bk-tooltip.bk-tooltip-custom.bk-left').height()/2) + 'px; z-index: 1010; display: block;'); }else { $("div").remove(".bk-tooltip.bk-tooltip-custom.bk-left"); } """ callback = CustomJS(args={'source': self._source}, code=code) self.figure.add_tools(HoverTool(tooltips=None, callback=callback, renderers=[self.glyph], mode='mouse'))
[docs] def add_search_widget(self, fields): """ Add text input for each field. Args: fields (:obj:`str` or :obj:`list`): list of fields to do a search. """ if not fields: return if isinstance(fields, str): callback = CustomJS(args=dict(source=self._source), code=self.__get_search_box_code(fields)) widgets = TextInput(value="", name=fields, title=fields, callback = callback) else: widgets = [] for field in fields: callback = CustomJS(args=dict(source=self._source), code=self.__get_search_box_code(field)) text_input = TextInput(value="", name=field, title=field, callback=callback) widgets.append(text_input) widgets = widgetbox(widgets) self.layout = column(widgets, self.figure)
@staticmethod def __get_search_box_code(field): """ Code for a search box widget Args: field (str): field name Returns: str: Javascript code """ code = \ """ origSearch = cb_obj.get('value'); search = origSearch.toUpperCase(); var selected = source.get('selected')['1d'].indices; searcher = source.get('data').""" + field + """ selected.length = 0 for(index in searcher) { if ( searcher[index].toUpperCase().indexOf(search) > -1) { selected.push(index); } } if (selected.length == 0) { swal("Error", "Value not found: '"+origSearch+"'", "error") } source.trigger('change'); """ return code
[docs] def show(self, output_path, showit=True, notebook=False): """ Show the figure Args: output_path: file where to store the figure showit (bool): the figure is displayed (widgets and the like are not shown) or is fully saved. Defaults to True. notebook (bool): if is is called form a notebook or not. Defaults to False. """ # Import modules if notebook: output_notebook() else: if output_path is not None: output_file(output_path) if self.layout is not None: layout = self.layout else: layout = self.figure if showit: show(self.figure) else: script, div = components(layout) html = """ <!DOCTYPE html> <html> <script src="https://code.jquery.com/jquery-1.11.3.min.js"> </script><script src="https://cdnjs.cloudflare.com/ajax/libs/sweetalert/1.1.3/sweetalert.min.js"></script> <link rel="stylesheet" type="text/css" href="https://cdnjs.cloudflare.com/ajax/libs/sweetalert/1.1.3/sweetalert.min.css"> <link href="http://cdn.pydata.org/bokeh/release/bokeh-""" + bokeh_version + """.min.css" rel="stylesheet" type="text/css"> <script src="http://cdn.pydata.org/bokeh/release/bokeh-""" + bokeh_version + """.min.js"></script> <link href="http://cdn.pydata.org/bokeh/release/bokeh-widgets-""" + bokeh_version + """.min.css" rel="stylesheet" type="text/css"> <script src="http://cdn.pydata.org/bokeh/release/bokeh-widgets-""" + bokeh_version + """.min.js"></script> <style>.bk-root .bk-toolbar-above {right: auto;}</style>\n""" + \ script + \ """<body>\n""" + \ div + """ </body> </html>""" text_file = open(output_path, "w") text_file.write(html) text_file.close()
[docs]def eliminate_duplicates(df): colors = df['color'].tolist() x = df['x'].tolist()[0] y = df['y'].tolist()[0] if 'r' in colors: return x, y, 'red' elif 'g' in colors: return x, y, 'green' else: return x, y, 'grey'
[docs]def add_symbol(df): ensemble_file = os.path.join(__location__, "ensembl_genes_75.txt.gz") gene_conversion = {line.split("\t")[0]: line.strip().split("\t")[-1] for line in gzip.open(ensemble_file, 'rt').readlines()} gene_symbols = df.GENE_ID.apply(lambda e: gene_conversion.get(e, e)) df.SYMBOL.fillna(gene_symbols, inplace=True) return df
[docs]def store_png(input_file, output_file, showit=False): """ Creates a figure from the resutls. Args: input_file: tsv file with the results output_file: file where to store the figure showit (bool): calls :func:`~matplotlib.pyplot.show` before returning. Defaults to False. """ pvalue='P_VALUE' qvalue='Q_VALUE' min_samples = 2 draw_greys = True annotate = True cut = True ############################# MIN_PVALUE = 10000 MAX_ANNOTATED_GENES = 50 df = pd.read_csv(input_file, header=0, sep="\t") df.dropna(subset=[pvalue], inplace=True) # Define the shape of the figure NROW = 1 NCOL = 1 fig = plt.figure(figsize=(6, 6)) axs = [plt.subplot2grid((NROW, NCOL), (item // NCOL, item % NCOL)) for item in range(NROW * NCOL)] # Plot is on the right or on the left? dx_side = True ax = axs[0] colors = ['royalblue', 'blue'] obs_pvalues = df[pvalue].map(lambda x: -np.log10(x) if x > 0 else -np.log10(1 / MIN_PVALUE)) #gens with num_samples >= min_samples have different values for color and alpha obs_color = df['SAMPLES'].map(lambda x: colors[1] if x >= min_samples else colors[0]) obs_alpha = df['SAMPLES'].map(lambda x: 0.7 if x >= min_samples else 0.3) data = pd.DataFrame({'HugoID': df['SYMBOL'], 'observed': obs_pvalues, 'color': obs_color, 'alpha': obs_alpha, 'fdr': df[qvalue] if qvalue is not None else 1 }) data.sort_values(by=['observed'], inplace=True) exp_pvalues = -np.log10(np.arange(1, len(data) + 1) / float(len(data))) exp_pvalues.sort() data['expected'] = exp_pvalues # Get the maximum pvalues (observed and expected) max_x = float(data[['expected']].apply(np.max)) max_y = float(data[['observed']].apply(np.max)) # Give some extra space (+-5%) min_x = max_x * -0.05 min_y = max_y * -0.05 max_x *= 1.1 max_y *= 1.1 grey = data[data['color'] == colors[0]] blue = data[data['color'] == colors[1]] # Plot the data if draw_greys and len(grey['expected']) > 0: ax.scatter(grey['expected'].tolist(), grey['observed'].tolist(), color=grey['color'].tolist(), alpha=grey['alpha'].tolist()[0], s=30) if len(blue['expected']) > 0: ax.scatter(blue['expected'].tolist(), blue['observed'].tolist(), color=blue['color'].tolist(), alpha=blue['alpha'].tolist()[0], s=30) # Get the data that are significant with a FDR < 0.1 and FDR 0.25 genes_to_annotate = [] for fdr_cutoff, fdr_color in zip((0.25, 0.1), ('g-', 'r-')): fdr = data[data['fdr'] < fdr_cutoff]['observed'] if len(fdr) > 0: fdr_y = np.min(fdr) fdr_x = np.min(data[data['observed'] == fdr_y]['expected']) ax.plot((fdr_x - max_x * 0.025, fdr_x + max_x * 0.025), (fdr_y, fdr_y), fdr_color) # Add the name of the significant genes genes = data[(data['observed'] >= fdr_y) & (data['expected'] >= fdr_x)] for count, line in genes.iterrows(): if line['color'] == colors[0]: continue genes_to_annotate.append({'x': line['expected'], 'y': line['observed'], 'HugoID': line['HugoID'], 'color': colors[0] if line['color'] == colors[0] else fdr_color[0]}) # Annotate the genes genes_annotated = 0 if annotate and len(genes_to_annotate) > 0: genes_to_annotate = pd.DataFrame(genes_to_annotate) # Get rid of redundant genes grouped = genes_to_annotate.groupby(['HugoID']) grouped = grouped.apply(eliminate_duplicates) grouped = pd.DataFrame({'HugoID': grouped.index.tolist(), 'x': [x[0] for x in grouped], 'y': [y[1] for y in grouped], 'color': [c[2] for c in grouped]}) grouped.sort_values(by=['y', 'x'], inplace=True, ascending=[False, False]) x_text = max_x * 1.1 if dx_side is True else min_x - (max_x * 0.2) y_text = np.floor(max_y) distance_between_genes = max_y * 0.05 for count, line in grouped.iterrows(): x, y = line['x'], line['y'] ax.annotate(line['HugoID'], xy=(x, y), xytext=(x_text, y_text), arrowprops=dict(facecolor="black", shrink=0.03, width=1, headwidth=6, alpha=0.3), horizontalalignment="left", verticalalignment="center", color=line['color'], weight = 'normal' ) y_text -= distance_between_genes # This avoid the crash for ValueError: width and height must each be below 32768 genes_annotated += 1 if genes_annotated >= MAX_ANNOTATED_GENES: logger.warning("Annotations cut to %d genes", MAX_ANNOTATED_GENES) break # Add labels ax.set_xlabel("expected pvalues") ax.set_ylabel("observed pvalues") # Add the dashed diagonal ax.plot(np.linspace(0, np.floor(max(max_x, max_y))), np.linspace(0, np.floor(max(max_x, max_y))), 'r--') ax.grid(True) # Redefine the limits of the plot if cut: ax.set_xlim(min_x, max_x) ax.set_ylim(min_y, max_y) # Set the title: project, cancer_type #ax.set_title(title) # Adjust the plot try: plt.tight_layout() except ValueError as e: logger.warning('Ignoring tight_layout()') # Save the plot if output_file: plt.savefig(output_file, bbox_inches='tight') # Show the plot if showit: plt.show() # Close the figure plt.close()
[docs]def store_html(input_file, output_path): """ Create the QQPlot and save it. Args: input_file: tsv filw with the data output_path: file where to store the graph showit (bool): defaults to False. See :meth:`~oncodrivefml.store.QQPlot.show`. """ search_by_fields = ['HugoID', 'EnsemblID'] qqp = QQPlot(input_file=input_file, rename_fields={'SAMPLES': 'num_samples','SYMBOL': 'HugoID', 'GENE_ID': 'EnsemblID', 'P_VALUE': 'pvalue', 'Q_VALUE': 'qvalue'}, extra_fields=search_by_fields, cutoff=True) qqp.add_tooltip_enhanced() qqp.add_search_widget(search_by_fields) qqp.show(output_path = output_path, notebook = False, showit=False)
[docs]def store_tsv(results, result_file): """ Saves the results in a tsv file sorted by pvalue Args: results (:obj:`~pandas.DataFrame`): results of the analysis result_file: file where to store the results """ results.index.names = ['GENE_ID'] results.sort_values(by='pvalue', inplace=True) fields = ['muts', 'muts_recurrence', 'samples_mut', 'pvalue', 'qvalue', 'pvalue_neg', 'qvalue_neg', 'snps', 'mnps', 'indels', 'symbol'] df = results[fields].copy() df.reset_index(inplace=True) df.rename(columns={'muts': 'MUTS', 'muts_recurrence': 'MUTS_RECURRENCE', 'samples_mut': 'SAMPLES', 'pvalue': 'P_VALUE', 'qvalue': 'Q_VALUE','pvalue_neg': 'P_VALUE_NEG', 'qvalue_neg': 'Q_VALUE_NEG', 'snps': 'SNP', 'mnps':'MNP', 'indels': 'INDELS', 'symbol': 'SYMBOL'}, inplace=True) df = add_symbol(df) with open(result_file, 'wt') as fd: df.to_csv(fd, sep="\t", header=True, index=False)