# Source code for elaspic.elaspic_predictor
# -*- coding: utf-8 -*-
"""
Created on Wed Sep 30 16:54:21 2015
@author: strokach
"""
import os.path as op
import pickle
import logging
import json
import pandas as pd
from . import conf, call_foldx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Configuration object from the package's `conf` module; it is indexed like a
# mapping further down (e.g. ``configs['data_dir']``).
configs = conf.Configs()
# %%
# Integer encoding of one-letter secondary structure codes.
# Several letters intentionally share a value ('-'/'C' and 'B'/'b').
# NOTE(review): the letters look like DSSP codes; under that assumption 'I' would be
# a pi helix and 'S' a bend -- confirm against the tool that produces these values.
secondary_structure_to_int = {
    # Coil, i.e. none of the categories below
    '-': 0,
    'C': 0,
    # Isolated bridge
    'B': 1,
    'b': 1,
    # Extended conformation
    'E': 2,
    # 3-10 helix
    'G': 3,
    # Alpha helix
    'H': 4,
    'I': 5,
    'S': 6,
    # Turn
    'T': 7,
}
def _split_feature_strings(column):
    """Split every comma-separated string in `column` into a list of fields."""
    return column.apply(lambda value: value.split(','))


def format_mutation_features(feature_df, core_or_interface):
    """
    Converts columns containing comma-separated lists of FoldX features and physicochemical
    features into a DataFrame where each feature has its own column.

    Parameters
    ----------
    feature_df : DataFrame
        A pandas DataFrame containing a subset of rows from the :ref:`uniprot_domain_mutation`
        or the :ref:`uniprot_domain_pair_mutation` tables.
    core_or_interface : int or str
        If 0 or 'core', the `feature_df` DataFrame contains columns from the
        :ref:`uniprot_domain_mutation` table.
        If 1 or 'interface', the `feature_df` DataFrame contains columns from the
        :ref:`uniprot_domain_pair_mutation` table.

    Returns
    -------
    DataFrame
        Contains the same data as `feature_df`, but with columns containing comma-separated lists
        of features converted to columns containing a single feature each.
        Rows with missing FoldX information are dropped, and columns whose name contains
        'secondary_structure' are converted to integers using `secondary_structure_to_int`.
        The input DataFrame itself is not modified.

    Raises
    ------
    ValueError
        If `core_or_interface` is not one of the accepted values.
    """
    if core_or_interface in [False, 0, 'core']:
        foldx_column_name = 'stability_energy'
        foldx_feature_names_wt = call_foldx.names_stability_wt
        foldx_feature_names_mut = call_foldx.names_stability_mut
    elif core_or_interface in [True, 1, 'interface']:
        foldx_column_name = 'analyse_complex_energy'
        foldx_feature_names_wt = call_foldx.names_stability_complex_wt
        foldx_feature_names_mut = call_foldx.names_stability_complex_mut
    else:
        # Fail early with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            "core_or_interface must be 0 / 'core' or 1 / 'interface', got {!r}"
            .format(core_or_interface))

    foldx_wt_column = foldx_column_name + '_wt'
    foldx_mut_column = foldx_column_name + '_mut'

    # Drop rows that have missing FoldX information
    # (should not happen when called from inside the pipeline because we have only one column).
    # The explicit copy makes sure the column assignments below never touch the caller's
    # DataFrame and do not trigger pandas' SettingWithCopyWarning.
    feature_df = feature_df.dropna(subset=[foldx_wt_column, foldx_mut_column]).copy()

    # FoldX output: split each string once, then pick out one field per feature name.
    foldx_sources = [
        (foldx_wt_column, foldx_feature_names_wt),
        (foldx_mut_column, foldx_feature_names_mut),
    ]
    for source_column, feature_names in foldx_sources:
        fields = _split_feature_strings(feature_df[source_column])
        for column_index, column_name in enumerate(feature_names):
            feature_df[column_name] = fields.apply(
                lambda values, i=column_index: float(values[i]))
        del feature_df[source_column]

    # PhysicoChemical properties
    names_phys_chem = ['pcv_salt_equal', 'pcv_salt_opposite', 'pcv_hbond', 'pcv_vdw']
    physchem_sources = [
        ('physchem_wt', '_wt'),
        ('physchem_wt_ownchain', '_self_wt'),
        ('physchem_mut', '_mut'),
        ('physchem_mut_ownchain', '_self_mut'),
    ]
    physchem_fields = [
        (suffix, _split_feature_strings(feature_df[source_column]))
        for source_column, suffix in physchem_sources
    ]
    # Outer loop over feature names keeps the resulting column order
    # (<name>_wt, <name>_self_wt, <name>_mut, <name>_self_mut for each name).
    for column_index, column_name in enumerate(names_phys_chem):
        for suffix, fields in physchem_fields:
            feature_df[column_name + suffix] = fields.apply(
                lambda values, i=column_index: int(values[i]))
    for source_column, _ in physchem_sources:
        del feature_df[source_column]

    # Encode secondary structure letters as integers; an unknown letter raises a KeyError.
    for col in feature_df.columns:
        if 'secondary_structure' in col:
            feature_df[col] = feature_df[col].apply(lambda x: secondary_structure_to_int[x])
    return feature_df
def convert_features_to_differences(df, keep_mut=False):
    """
    Creates a new set of features (ending in `_change`) that describe the difference between values
    of the wildtype (`_wt`) and mutant (`_mut`) features.

    A column is converted when its name contains `_mut`, a column with `_mut` replaced by `_wt`
    exists in `df`, and its dtype is not `object`. The new column holds `mutant - wildtype`.
    For columns whose name contains 'secondary_structure' the difference is reduced to
    1 (changed) or 0 (unchanged).

    Parameters
    ----------
    df : DataFrame
        DataFrame with paired wildtype / mutant feature columns.
    keep_mut : bool
        If `False`, every mutant column that was converted is left out of the result.
        If `True`, the mutant column is kept, immediately followed by its `_change` column.

    Returns
    -------
    DataFrame
        All columns that were not converted (in their original order), together with the
        new `_change` columns. An input without any columns gives an empty DataFrame with the
        same index.
    """
    column_list = []
    # `DataFrame.items()` is used because `iteritems()` was removed in pandas 2.0.
    for column_name, column in df.items():
        is_paired_numeric = (
            '_mut' in column_name and
            column_name.replace('_mut', '_wt') in df.columns and
            column.dtype != object)
        if not is_paired_numeric:
            column_list.append(column)
            continue
        if keep_mut:
            column_list.append(column)
        new_column = column - df[column_name.replace('_mut', '_wt')]
        if 'secondary_structure' in column_name:
            # The size of the difference between two category codes is meaningless;
            # only record whether the secondary structure changed at all.
            new_column = new_column.apply(lambda x: 1 if x else 0)
        new_column.name = column_name.replace('_mut', '_change')
        column_list.append(new_column)
    if not column_list:
        # `pd.concat` raises a ValueError when given nothing to concatenate.
        return pd.DataFrame(index=df.index)
    new_df = pd.concat(column_list, axis=1)
    return new_df
# %%
class Predictor:
    """
    Loads the stored core and interface models together with their feature lists
    and uses them to score mutations.
    """

    # Maps feature names found in the stored feature lists to the column names
    # that `score` looks up in the feature DataFrame.
    feature_name_conversion = {
        'normDOPE': 'norm_dope',
        'seq_id_avg': 'alignment_identity'
    }

    def __init__(self):
        def _load_data(filename):
            """Load a pickle or JSON file from ``configs['data_dir']``."""
            extension = op.splitext(filename)[-1]
            path = op.join(configs['data_dir'], filename)
            if extension in ['.pkl', '.pickle']:
                # NOTE: unpickling can execute arbitrary code; the data directory
                # must only contain trusted files.
                with open(path, 'rb') as ifh:
                    return pickle.load(ifh)
            elif extension in ['.jsn', '.json']:
                with open(path, 'r') as ifh:
                    return json.load(ifh)
            # Previously an unknown extension silently produced `None`.
            raise ValueError("Unsupported data file extension: {!r}".format(filename))

        self.clf_domain = _load_data('ml_clf_core_p1.pickle')
        self.clf_domain_features = _load_data('ml_features_core_p1.json')
        self.clf_interface = _load_data('ml_clf_interface_p1.pickle')
        self.clf_interface_features = _load_data('ml_features_interface_p1.json')
        # NOTE(review): the `*_p1` attributes are loaded from the same files as the
        # attributes above; they are kept as separately loaded objects.
        self.clf_domain_p1 = _load_data('ml_clf_core_p1.pickle')
        self.clf_domain_features_p1 = _load_data('ml_features_core_p1.json')
        self.clf_interface_p1 = _load_data('ml_clf_interface_p1.pickle')
        self.clf_interface_features_p1 = _load_data('ml_features_interface_p1.json')

    def score(self, df, core_or_interface):
        r"""
        Predict the $\Delta \Delta G$ score for a mutation.

        Parameters
        ----------
        df : DataFrame
            One or more rows with all data required to predict $\Delta \Delta G$ score.
            Like something that you would get when you join the appropriate rows in the database.
        core_or_interface : int or str
            0 or 'core' to use the core (domain) model,
            1 or 'interface' to use the interface model.

        Returns
        -------
        ddg
            The first element of the model's prediction for `df`.
            NOTE(review): when `df` has several rows, presumably only the prediction for the
            first row is returned -- confirm that callers always pass a single row.

        Raises
        ------
        ValueError
            If `core_or_interface` is not one of the accepted values.
        """
        if core_or_interface in ['core', 0]:
            clf = self.clf_domain
            clf_features = self.clf_domain_features
        elif core_or_interface in ['interface', 1]:
            clf = self.clf_interface
            clf_features = self.clf_interface_features
        else:
            # Fail early with a clear message instead of an UnboundLocalError below.
            raise ValueError(
                "core_or_interface must be 0 / 'core' or 1 / 'interface', got {!r}"
                .format(core_or_interface))

        # Translate stored feature names into DataFrame column names.
        clf_features = [self.feature_name_conversion.get(x, x) for x in clf_features]

        df_features = format_mutation_features(df, core_or_interface)
        # Keep the mutant columns here; selecting `clf_features` below drops any
        # that the model does not use.
        df_features_asdifferences = convert_features_to_differences(df_features, True)
        df_features_asdifferences = df_features_asdifferences[clf_features]

        ddg = clf.predict(df_features_asdifferences)[0]
        return ddg