# Source code for elaspic.elaspic_predictor

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 16:54:21 2015

@author: strokach
"""
import os.path as op
import pickle
import logging
import json

import pandas as pd

from . import conf, call_foldx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configuration object provided by the package's `conf` module; `Predictor`
# (defined later in this file) reads `configs['data_dir']` from it to locate
# the trained models.
configs = conf.Configs()


# %%
# Integer encoding of one-letter secondary structure codes. It is applied by
# `format_mutation_features` to every column whose name contains
# 'secondary_structure'; a code that is missing from this mapping raises a
# `KeyError` there.
# NOTE(review): the letters look like DSSP-style codes -- confirm against the
# tool that produces these columns.
secondary_structure_to_int = {
    '-': 0,  # Coil (none of the below)
    'C': 0,  # Coil (none of the below)
    'B': 1,  # Isolated bridge
    'b': 1,  # Isolated bridge
    'E': 2,  # Extended conformation
    'G': 3,  # 3-10 helix
    'H': 4,  # Alpha helix
    'I': 5,  # presumably pi helix (DSSP 'I') -- TODO confirm
    'S': 6,  # presumably bend (DSSP 'S') -- TODO confirm
    'T': 7,  # Turn
}


def _split_fields(column):
    """Split every comma-separated string in `column` into a list of string fields."""
    return column.apply(lambda value: value.split(','))


def _extract_field(fields, index, cast):
    """Return a Series holding ``cast(parts[index])`` for every list in `fields`."""
    return fields.apply(lambda parts: cast(parts[index]))


def format_mutation_features(feature_df, core_or_interface):
    """
    Converts columns containing comma-separated lists of FoldX features and
    physicochemical features into a DataFrame where each feature has its own column.

    Parameters
    ----------
    feature_df : DataFrame
        A pandas DataFrame containing a subset of rows from the :ref:`uniprot_domain_mutation`
        or the :ref:`uniprot_domain_pair_mutation` tables.
        The input frame itself is not modified.
    core_or_interface : int or str
        If 0 or 'core', the `feature_df` DataFrame contains columns from the
        :ref:`uniprot_domain_mutation` table.
        If 1 or 'interface', the `feature_df` DataFrame contains columns from the
        :ref:`uniprot_domain_pair_mutation` table.

    Returns
    -------
    DataFrame
        Contains the same data as `feature_df`, but with columns containing comma-separated
        lists of features converted to columns containing a single feature each, and with
        every column whose name contains 'secondary_structure' encoded as integers using
        `secondary_structure_to_int`. Rows with missing FoldX information are dropped.

    Raises
    ------
    ValueError
        If `core_or_interface` is not one of the accepted values.
    KeyError
        If a secondary structure column holds a code that is not a key of
        `secondary_structure_to_int`.
    """
    if core_or_interface in [False, 0, 'core']:
        foldx_column_name = 'stability_energy'
        foldx_feature_names_wt = call_foldx.names_stability_wt
        foldx_feature_names_mut = call_foldx.names_stability_mut
    elif core_or_interface in [True, 1, 'interface']:
        foldx_column_name = 'analyse_complex_energy'
        foldx_feature_names_wt = call_foldx.names_stability_complex_wt
        foldx_feature_names_mut = call_foldx.names_stability_complex_mut
    else:
        # Fail early with a clear message instead of an UnboundLocalError below.
        raise ValueError(
            "core_or_interface must be one of 0, 'core', 1 or 'interface'; got {!r}"
            .format(core_or_interface))

    foldx_wt_column = foldx_column_name + '_wt'
    foldx_mut_column = foldx_column_name + '_mut'

    # Drop rows that have missing FoldX information
    # (should not happen when called from inside the pipeline).
    # The explicit copy guarantees that the column assignments below operate on
    # our own frame and never on (a flagged view of) the caller's data.
    feature_df = feature_df.dropna(subset=[foldx_wt_column, foldx_mut_column]).copy()

    # FoldX output: one float column per feature name; the source column is
    # removed once all of its fields have been extracted.
    foldx_sources = [
        (foldx_wt_column, foldx_feature_names_wt),
        (foldx_mut_column, foldx_feature_names_mut),
    ]
    for source_column, feature_names in foldx_sources:
        fields = _split_fields(feature_df[source_column])
        for column_index, column_name in enumerate(feature_names):
            feature_df[column_name] = _extract_field(fields, column_index, float)
        del feature_df[source_column]

    # PhysicoChemical properties: one int column per (property, source) pair.
    names_phys_chem = ['pcv_salt_equal', 'pcv_salt_opposite', 'pcv_hbond', 'pcv_vdw']
    physchem_sources = [
        ('physchem_wt', '_wt'),
        ('physchem_wt_ownchain', '_self_wt'),
        ('physchem_mut', '_mut'),
        ('physchem_mut_ownchain', '_self_mut'),
    ]
    physchem_fields = {
        source_column: _split_fields(feature_df[source_column])
        for source_column, _ in physchem_sources
    }
    # Property-major order keeps the resulting column order stable:
    # <name>_wt, <name>_self_wt, <name>_mut, <name>_self_mut for each property.
    for column_index, column_name in enumerate(names_phys_chem):
        for source_column, suffix in physchem_sources:
            feature_df[column_name + suffix] = _extract_field(
                physchem_fields[source_column], column_index, int)
    for source_column, _ in physchem_sources:
        del feature_df[source_column]

    # Encode one-letter secondary structure codes as integers.
    for col in feature_df.columns:
        if 'secondary_structure' in col:
            feature_df[col] = feature_df[col].apply(lambda x: secondary_structure_to_int[x])

    return feature_df


def convert_features_to_differences(df, keep_mut=False):
    """
    Creates a new set of features (named `*_change`) that describe the difference
    between the mutant and the wildtype value of a feature (mutant minus wildtype).

    A column is treated as a mutant feature when its name contains `_mut`, the name
    obtained by replacing `_mut` with `_wt` is also a column of `df`, and its dtype
    is not `object`. All other columns are passed through unchanged.

    For columns whose name contains 'secondary_structure', the change is reduced to
    a flag: 0 if the difference is falsy (no change), 1 otherwise.

    Parameters
    ----------
    df : DataFrame
        Features with one column per feature, e.g. the output of
        `format_mutation_features`.
    keep_mut : bool
        If `False`, the mutant columns are replaced by their `*_change` columns.
        If `True`, each mutant column is kept and is followed by its `*_change` column.

    Returns
    -------
    DataFrame
        A new DataFrame with the columns in their original order; `df` is not modified.
        If `df` has no columns, an empty DataFrame with the same index is returned.
    """
    column_list = []
    # `DataFrame.items` is used because `DataFrame.iteritems` was removed in pandas 2.0.
    for column_name, column in df.items():
        is_mutant_feature = (
            '_mut' in column_name and
            column_name.replace('_mut', '_wt') in df.columns and
            column.dtype != object)
        if not is_mutant_feature:
            column_list.append(column)
            continue
        if keep_mut:
            column_list.append(column)
        new_column = column - df[column_name.replace('_mut', '_wt')]
        if 'secondary_structure' in column_name:
            # The integer codes are categorical, so only "changed or not" is meaningful.
            new_column = new_column.apply(lambda x: 1 if x else 0)
        new_column.name = column_name.replace('_mut', '_change')
        column_list.append(new_column)
    if not column_list:
        # `pd.concat` raises on an empty list; nothing to convert in that case.
        return pd.DataFrame(index=df.index)
    new_df = pd.concat(column_list, axis=1)
    return new_df


# %%
class Predictor:
    """
    Loads the trained core and interface predictors, together with the lists of
    feature names they expect, from `configs['data_dir']`, and uses them to score
    mutations.
    """

    # Maps feature names stored alongside the trained models to the column names
    # used in the DataFrames passed to `score`.
    feature_name_conversion = {
        'normDOPE': 'norm_dope',
        'seq_id_avg': 'alignment_identity'
    }

    def __init__(self):
        self.clf_domain = self._load_data('ml_clf_core_p1.pickle')
        self.clf_domain_features = self._load_data('ml_features_core_p1.json')
        self.clf_interface = self._load_data('ml_clf_interface_p1.pickle')
        self.clf_interface_features = self._load_data('ml_features_interface_p1.json')

        # The `*_p1` attributes are loaded from the same files as the attributes
        # above; they are loaded separately so that each attribute keeps its own
        # independent object.
        self.clf_domain_p1 = self._load_data('ml_clf_core_p1.pickle')
        self.clf_domain_features_p1 = self._load_data('ml_features_core_p1.json')
        self.clf_interface_p1 = self._load_data('ml_clf_interface_p1.pickle')
        self.clf_interface_features_p1 = self._load_data('ml_features_interface_p1.json')

    @staticmethod
    def _load_data(filename):
        """
        Load `filename` from `configs['data_dir']`, choosing the parser by extension.

        Parameters
        ----------
        filename : str
            File name relative to the data directory. Must end in '.pkl' or '.pickle'
            (loaded with `pickle`) or in '.jsn' or '.json' (loaded with `json`).

        Raises
        ------
        ValueError
            If the extension is not one of the supported ones.
        """
        extension = op.splitext(filename)[-1]
        if extension in ['.pkl', '.pickle']:
            # NOTE: unpickling can execute arbitrary code; the data directory must
            # only contain trusted files.
            with open(op.join(configs['data_dir'], filename), 'rb') as ifh:
                return pickle.load(ifh)
        if extension in ['.jsn', '.json']:
            with open(op.join(configs['data_dir'], filename), 'r') as ifh:
                return json.load(ifh)
        # Previously an unknown extension silently produced `None`.
        raise ValueError(
            "Unsupported file extension {!r} for file {!r}".format(extension, filename))

    def score(self, df, core_or_interface):
        r"""
        Predict the $\Delta \Delta G$ score of a mutation.

        Parameters
        ----------
        df : DataFrame
            One or more rows with all data required to predict $\Delta \Delta G$ score.
            Like something that you would get when you join the appropriate rows
            in the database.
        core_or_interface : int or str
            0 or 'core' to use the core predictor,
            1 or 'interface' to use the interface predictor.

        Returns
        -------
        ddg
            The first element of the array returned by the predictor, i.e. the
            prediction for the first row that remains after `format_mutation_features`
            has dropped rows with missing FoldX information.

        Raises
        ------
        ValueError
            If `core_or_interface` is not one of the accepted values.
        """
        if core_or_interface in ['core', 0]:
            clf = self.clf_domain
            clf_features = self.clf_domain_features
        elif core_or_interface in ['interface', 1]:
            clf = self.clf_interface
            clf_features = self.clf_interface_features
        else:
            # Fail early with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "core_or_interface must be one of 0, 'core', 1 or 'interface'; got {!r}"
                .format(core_or_interface))

        # Translate the stored feature names into the DataFrame column names.
        clf_features = [self.feature_name_conversion.get(x, x) for x in clf_features]

        df_features = format_mutation_features(df, core_or_interface)
        # Mutant columns are kept here; the selection by `clf_features` below
        # decides which columns are actually passed to the predictor.
        df_features_asdifferences = convert_features_to_differences(df_features, True)
        df_features_asdifferences = df_features_asdifferences[clf_features]

        ddg = clf.predict(df_features_asdifferences)[0]
        return ddg