Source code for oncodrivefml.load

"""
This module contains the methods used to
load and parse the input files: elements and mutations

.. _elements dict:

elements (:obj:`dict`)
    contains all the segments related to one element. The information is taken from
    the :file:`elements_file`.
    Basic structure:

    .. code-block:: python

        { element_id:
            [
                {
                'CHROMOSOME': chromosome,
                'START': start_position_of_the_segment,
                'END': end_position_of_the_segment,
                'STRAND': strand (+ -> positive | - -> negative)
                'ELEMENT': element_id,
                'SEGMENT': segment_id,
                'SYMBOL': symbol_id
                }
            ]
        }


.. _mutations dict:

mutations (:obj:`dict`)
    contains all the mutations for each element. Most of the information is taken from
    the mutations_file but the *element_id* and the *segment* that are taken from the **elements**.
    More information is added during the execution.
    Basic structure:

    .. code-block:: python

        { element_id:
            [
                {
                'CHROMOSOME': chromosome,
                'POSITION': position_where_the_mutation_occurs,
                'REF': reference_sequence,
                'ALT': alteration_sequence,
                'SAMPLE': sample_id,
                'ALT_TYPE': type_of_the_mutation,
                'CANCER_TYPE': group to which the mutation belongs to,
                'SIGNATURE': a different grouping category,
                }
            ]
        }

.. _mutations data dict:

mutations_data (:obj:`dict`)
    contains the `mutations dict`_ and some metadata information about the mutations.
    Currently, the number of substitutions and indels.
    Basic structure:

    .. code-block:: python

        {
            'data':
                {
                    `mutations dict`_
                },
            'metadata':
                {
                    'snp': amount of SNP mutations
                    'mnp': amount of MNP mutations
                    'mnp_length': total length of the MNP mutations
                    'indel': amount of indels
                }
        }

"""

import gzip
import logging
import pickle

from bgcache import bgcache
from bgparsers import readers
from collections import defaultdict
from intervaltree import IntervalTree

from oncodrivefml import __logger_name__

logger = logging.getLogger(__logger_name__)


[docs]def mutations(file, blacklist=None, metadata_dict=None): """ Parsed the mutations file Args: file: mutations file (see :class:`~oncodrivefml.main.OncodriveFML`) metadata_dict (dict): dict that the function will fill with useful information blacklist (optional): file with blacklisted samples (see :class:`~oncodrivefml.main.OncodriveFML`). Defaults to None. Yields: One line from the mutations file as a dictionary. Each of the inner elements of :ref:`mutations <mutations dict>` """ # Set of samples to blacklist samples_blacklisted = set([s.strip() for s in open(blacklist).readlines()]) if blacklist is not None else set() snp = 0 indel = 0 mnp = 0 mnp_length = 0 for row in readers.variants(file, extra=['CANCER_TYPE', 'SIGNATURE'], required=['CHROMOSOME', 'POSITION', 'REF', 'ALT', 'SAMPLE']): if row['SAMPLE'] in samples_blacklisted: continue if row['ALT_TYPE'] == 'snp': snp += 1 elif row['ALT_TYPE'] == 'mnp': mnp += 1 mnp_length += len(row['REF']) elif row['ALT_TYPE'] == 'indel': # very long indels are discarded if max(len(row['REF']), len(row['ALT'])) > 20: continue indel += 1 yield row if metadata_dict is not None: metadata_dict['snp'] = snp metadata_dict['indel'] = indel metadata_dict['mnp'] = mnp metadata_dict['mnp_length'] = mnp_length
[docs]def snp(file, blacklist=None): """Load only SNP""" for row in mutations(file, blacklist=blacklist): if row['ALT_TYPE'] == 'snp': yield row
[docs]def build_regions_tree(regions): """ Generates a binary tree with the intervals of the regions Args: regions (dict): segments grouped by :ref:`elements <elements dict>`. Returns: dict of :obj:`~intervaltree.IntervalTree`: for each chromosome, it get one :obj:`~intervaltree.IntervalTree` which is a binary tree. The leafs are intervals [low limit, high limit) and the value associated with each interval is the :obj:`tuple` (element, segment). It can be interpreted as: .. code-block:: python { chromosome: (start_position, end_position +1): (element, segment) } """ logger.info('Building regions tree') regions_tree = {} i = 0 for i, (k, allr) in enumerate(regions.items()): if i % 7332 == 0: logger.info("[%d of %d]", i+1, len(regions)) for r in allr: tree = regions_tree.get(r['CHROMOSOME'], IntervalTree()) tree[r['START']:(r['END']+1)] = r['ELEMENT'] regions_tree[r['CHROMOSOME']] = tree logger.info("[%d of %d]", i+1, len(regions)) return regions_tree
@bgcache def elements_tree(elements_file): elements = readers.elements_dict(elements_file, required=['CHROMOSOME', 'START', 'END', 'ELEMENT']) return build_regions_tree(elements)
[docs]def mutations_and_elements(variants_file, elements_file, blacklist=None): """ From the elements and variants file, get dictionaries with the segments grouped by element ID and the mutations grouped in the same way, as well as some information related to the mutations. Args: variants_file: mutations file (see :class:`~oncodrivefml.main.OncodriveFML`) elements_file: elements file (see :class:`~oncodrivefml.main.OncodriveFML`) blacklist (optional): file with blacklisted samples (see :class:`~oncodrivefml.main.OncodriveFML`). Defaults to None. If the blacklist option is passed, the mutations are not loaded from a pickle file. Returns: tuple: mutations and elements Elements: `elements dict`_ Mutations: `mutations data dict`_ The process is done in 3 steps: 1. :meth:`load_regions` #. :meth:`build_regions_tree`. #. each mutation (:meth:`mutations`) is associated with the right element ID """ # Load elements file # TODO add extra and required elements = readers.elements_dict(elements_file, required=['CHROMOSOME', 'START', 'END', 'ELEMENT'], extra=['SEGMENT', 'SYMBOL', 'STRAND']) # If the input file is a pickle file do nothing if variants_file.endswith(".pickle.gz"): with gzip.open(variants_file, 'rb') as fd: return pickle.load(fd), elements # Loading elements tree elements_tree_ = elements_tree(elements_file) # Mapping mutations variants_dict = defaultdict(list) variants_metadata_dict = {} logger.info("Mapping mutations") i = 0 show_small_progress_at = 100000 show_big_progress_at = 1000000 indels_mapped_multiple_of_3 = 0 snp_mapped = 0 mnp_mapped = 0 indels_mapped = 0 for i, r in enumerate(mutations(variants_file, metadata_dict=variants_metadata_dict, blacklist=blacklist)): if r['CHROMOSOME'] not in elements_tree_: continue if i % show_small_progress_at == 0: print('*', end='', flush=True) if i % show_big_progress_at == 0: print(' [{} muts]'.format(i), flush=True) # Get the interval that include that position in the same chromosome intervals = elements_tree_[r['CHROMOSOME']][r['POSITION']] for interval in intervals: element = interval.data variants_dict[element].append(r) if intervals: if r['ALT_TYPE'] == 'snp': snp_mapped += 1 elif r['ALT_TYPE'] == 'mnp': mnp_mapped += 1 else: indels_mapped += 1 if max(len(r['REF']), len(r['ALT'])) % 3 == 0: indels_mapped_multiple_of_3 += 1 if i > show_small_progress_at: print('{} [{} muts]'.format(' '*(((show_big_progress_at-(i % show_big_progress_at)) // show_small_progress_at)+1), i), flush=True) variants_metadata_dict['snp_mapped'] = snp_mapped variants_metadata_dict['mnp_mapped'] = mnp_mapped variants_metadata_dict['indels_mapped'] = indels_mapped variants_metadata_dict['indels_mapped_multiple_of_3'] = indels_mapped_multiple_of_3 mutations_data_dict = {'data': variants_dict, 'metadata': variants_metadata_dict} return mutations_data_dict, elements