Source code for elaspic.structure_tools

# -*- coding: utf-8 -*-
import os.path as op
import gzip
import string
import logging
import urllib.request
import re

from functools import wraps
from tempfile import NamedTemporaryFile
from collections import defaultdict, OrderedDict

import six
from fastcache import clru_cache

import numpy as np

import Bio
from Bio.PDB import PDBIO, Select, NeighborSearch
from Bio.PDB.PDBParser import PDBParser
from Bio.PDB.MMCIFParser import MMCIFParser
from Bio.PDB.Polypeptide import PPBuilder
from Bio.Alphabet import IUPAC
from Bio.Seq import Seq

from . import errors, helper

logger = logging.getLogger(__name__)

A_DICT = {
    'A': 'ALA', 'R': 'ARG', 'N': 'ASN', 'D': 'ASP', 'C': 'CYS', 'E': 'GLU',
    'Q': 'GLN', 'G': 'GLY', 'H': 'HIS', 'I': 'ILE', 'L': 'LEU', 'K': 'LYS',
    'M': 'MET', 'F': 'PHE', 'P': 'PRO', 'S': 'SER', 'T': 'THR', 'W': 'TRP',
    'Y': 'TYR', 'V': 'VAL', 'U': 'SEC', 'O': 'PYL',
    'B': 'ASX', 'Z': 'GLX', 'J': 'XLE', 'X': 'XAA', '*': 'TER'
}
AAA_DICT = dict([(value, key) for key, value in list(A_DICT.items())])
AAA_DICT['UNK'] = 'X'
AAA_DICT['MSE'] = 'M'
AAA_DICT['CSD'] = 'C'
# Phosphorylated residues
# AAA_DICT['SEP'] = 'S' # PHOSPHOSERINE
# AAA_DICT['TPO'] = 'T' # PHOSPHOTHREONINE
# AAA_DICT['SEP'] = 'Y' # O-PHOSPHOTYROSINE

# Methylated lysines
AAA_DICT['MLZ'] = 'K'
AAA_DICT['MLY'] = 'K'
AAA_DICT['M3L'] = 'K'

AMINO_ACIDS = list(AAA_DICT.keys())

METHYLATED_LYSINES = ['MLZ', 'MLY', 'M3L']
LYSINE_ATOMS = ['N', 'CA', 'CB', 'CG', 'CD', 'CE', 'NZ', 'C', 'O']


# %% Functions for downloading and parsing pdb files
class MMCIFParserMod(MMCIFParser):
    def __init__(self, temp_dir):
        self.temp_dir = temp_dir

[docs] def get_structure(self, structure_id, gzip_fh): """ Altered `get_structure` method which accepts gzip file handles as input. """ with NamedTemporaryFile(mode='w', dir=self.temp_dir) as temp_fh: temp_fh.writelines(gzip_fh.readlines()) temp_fh.flush() temp_fh.seek(0) return super(MMCIFParserMod, self).get_structure(structure_id, temp_fh.name)
[docs]def get_pdb_id(pdb_file): pdb_id = op.basename(pdb_file) for ext in ['\.gz$', '\.pdb$', '\.ent$', '\.cif$', '^pdb']: pdb_id = re.sub(ext, '', pdb_id) pdb_id = pdb_id.upper() return pdb_id.upper()
[docs]def get_pdb_file(pdb_id, pdb_database_dir, pdb_type='ent'): """Get PDB file from a local mirror of the PDB database. """ if pdb_type == 'ent': # Original PDB structure. prefix = 'pdb' suffix = '.ent.gz' relative_pdb_file = ( pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix ) elif pdb_type == 'cif': # mmCIF pdb structure. prefix = '' suffix = '.cif.gz' relative_pdb_file = ( '../mmCIF/' + pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix ) elif pdb_type == 'pdb': # The first biological unit. prefix = '' suffix = '.pdb1.gz' relative_pdb_file = ( '../../../biounit/coordinates/divided/' + pdb_id[1:3].lower() + '/' + prefix + pdb_id.lower() + suffix ) elif pdb_type == 'raw': # Just a PDB file in some folder. relative_pdb_file = '' else: raise Exception pdb_file = op.join(pdb_database_dir, relative_pdb_file) return pdb_file
[docs]def download_pdb_file(pdb_id, output_dir): """Move PDB structure to the local working directory. """ PDB_URL = 'http://www.rcsb.org/pdb/files/{}.pdb' PDB_EURO_URL = 'http://www.ebi.ac.uk/pdbe/entry-files/download/pdb{}.ent' output_pdb_filename = op.join(output_dir, pdb_id + '.pdb') # If the PDB already exists, do nothing... if op.isfile(output_pdb_filename): logger.debug('PDB file {} already exists...'.format(output_pdb_filename)) return output_pdb_filename # Download the PDB file from the internet... logger.info('Downloading PDB {}...'.format(pdb_id + '.pdb')) try: pdb_url = PDB_URL.format(pdb_id) logger.debug(pdb_url) response = urllib.request.urlopen(pdb_url) except urllib.error.URLError as e: logger.debug(e) pdb_url = PDB_EURO_URL.format(pdb_id.lower()) logger.debug(pdb_url) response = urllib.request.urlopen(pdb_url) with open(output_pdb_filename, 'wb') as ofh: ofh.write(response.read()) return output_pdb_filename
[docs]def get_pdb_parser(pdb_type, temp_dir='/tmp'): """Get PDB parser that can work with structures of the specified type. """ if pdb_type in {'ent', 'pdb', 'raw'}: parser = PDBParser(QUIET=True) elif pdb_type in {'cif'}: parser = MMCIFParserMod(temp_dir=temp_dir) else: raise Exception('Unsupported ``pdb_type`` {}'.format(pdb_type)) return parser
@clru_cache(maxsize=128, typed=False)
[docs]def get_pdb(pdb_id, pdb_path, temp_dir='/tmp', pdb_type='ent', use_external=True): """ Parse a pdb file with biopythons PDBParser() and return the structure. Parameters ---------- pdb_code : str Four letter code of the PDB file pdb_path : str Biopython pdb structure temp_dir : str, optional, default='/tmp/' Path to the folder for storing temporary files pdb_type : 'ent'/'pdb'/'cif', optional, default='ent' The extension of the pdb to use Raises ------ PDBNotFoundError If the pdb file could not be retrieved from the local (and remote) databases """ pdb_file = get_pdb_file(pdb_id, pdb_path, pdb_type) pdb_parser = get_pdb_parser(pdb_type, temp_dir) try: structure = pdb_parser.get_structure(pdb_id, gzip.open(pdb_file, 'rt')) except IOError: error_message = ( 'PDB not found! (pdb_id: {}, pdb_path: {}, pdb_type: {})' .format(pdb_id, pdb_path, pdb_type) ) if not use_external: raise errors.PDBNotFoundError(error_message) print('Retrieving pdb from the wwpdb ftp server...') temp_filename = download_pdb_file(pdb_id, temp_dir) structure = pdb_parser.get_structure(pdb_id, gzip.open(temp_filename, 'rt')) return structure
[docs]def get_pdb_structure(pdb_file, pdb_id=None): """Set QUIET to False to output warnings like incomplete chains etc. """ logger.debug('pdb_file: {}'.format(pdb_file)) logger.debug('pdb_id: {}'.format(pdb_id)) if pdb_id is None: pdb_id = get_pdb_id(pdb_file) parser = PDBParser(get_header=True, QUIET=True) if pdb_file.endswith('.gz'): with gzip.open(pdb_file, 'rt') as ifh: structure = parser.get_structure(pdb_id, ifh) else: structure = parser.get_structure(pdb_id, pdb_file) return structure # %%
[docs]def euclidean_distance(a, b): """Calculate the Euclidean distance between two lists or tuples of arbitrary length. """ return np.sqrt(sum((a - b)**2 for a, b in zip(a, b)))
[docs]def calculate_distance(atom_1, atom_2, cutoff=None): """Calculate the distance between two points in 3D space. Parameters ---------- cutoff : float, optional The maximum distance allowable between two points. """ if ((type(atom_1) == type(atom_2) == list) or (type(atom_1) == type(atom_2) == tuple)): a = atom_1 b = atom_2 elif hasattr(atom_1, 'coord') and hasattr(atom_2, 'coord'): a = atom_1.coord b = atom_2.coord else: raise Exception('Unsupported format {} {}'.format(type(atom_1), type(atom_2))) assert(len(a) == 3 and len(b) == 3) if cutoff is None or all(abs(p - q) <= cutoff for p, q in zip(a, b)): return euclidean_distance(a, b) # %%
[docs]def get_chain_seqres_sequence(chain, aa_only=False): """Get the amino acid sequence for the construct coding for the given chain. Extracts a sequence from a PDB file. Usefull when interested in the sequence that was used for crystallization and not the ATOM sequence. Parameters ---------- aa_only : bool If aa_only is set to `False`, selenomethionines will be included in the sequence. See: http://biopython.org/DIST/docs/api/Bio.PDB.Polypeptide-module.html. """ sequence = Seq('', IUPAC.protein) for pb in PPBuilder().build_peptides(chain, aa_only=aa_only): sequence += sequence + pb.get_sequence() return sequence
[docs]def get_chain_sequence_and_numbering(chain, domain_def_tuple=None, include_hetatms=False): """Get the amino acid sequence and a list of residue ids for the given chain. Parameters ---------- chain : Bio.PDB.Chain.Chain The chain for which to get the amino acid sequence and numbering. """ if domain_def_tuple is not None: start_resid, end_resid = domain_def_tuple chain_numbering = [] chain_numbering_extended = [] chain_sequence = [] inside_domain = False for res in chain: # resid = str(res.id[1]) + res.id[2].strip() if domain_def_tuple is None or resid == start_resid: inside_domain = True if inside_domain and (include_hetatms or res.resname in AMINO_ACIDS): chain_numbering.append(res.id[1]) chain_numbering_extended.append(resid) chain_sequence.append(AAA_DICT.get(res.resname, '.')) if domain_def_tuple is not None and resid == end_resid: inside_domain = False chain_sequence = ''.join(chain_sequence) return chain_sequence, chain_numbering_extended
[docs]def convert_position_to_resid(chain, positions, domain_def_tuple=None): """Convert mutation_domain to mutation_modeller. In mutation_modeller, the first amino acid in a chain may start with something other than 1. """ __, chain_numbering = get_chain_sequence_and_numbering( chain, domain_def_tuple) logger.debug('chain_numbering: {}'.format(chain_numbering)) logger.debug('positions: {}'.format(positions)) return [chain_numbering[p-1] for p in positions]
[docs]def get_structure_sequences(file_or_structure, seqres_sequence=False): """ Convenience function returining a dictionary of sequences for a given file or a Biopython Structure, Model or Chain. """ if isinstance(file_or_structure, six.string_types): structure = get_pdb_structure(file_or_structure) model = structure[0] elif isinstance(file_or_structure, Bio.PDB.Structure.Structure): model = file_or_structure[0] elif isinstance(file_or_structure, Bio.PDB.Model.Model): model = file_or_structure elif isinstance(file_or_structure, Bio.PDB.Chain.Chain): model = [file_or_structure] else: raise Exception( 'Unexpected type {} for input ``file_or_structure`` {}!' .format(file_or_structure, type(file_or_structure))) chain_sequences = defaultdict(list) for chain in model: if seqres_sequence: chain_sequence = get_chain_seqres_sequence(chain) else: chain_sequence, __ = get_chain_sequence_and_numbering(chain) chain_sequences[chain.id] = chain_sequence return chain_sequences # %%
[docs]def suppress_logger(fn): @wraps(fn) def fn_quiet(*args, **kwargs): level = logger.level logger.setLevel(logging.WARNING) try: return fn(*args, **kwargs) finally: logger.setLevel(level) return fn_quiet
[docs]def convert_aa(aa, quiet=False): """Convert amino acids from three letter code to one letter code or vice versa .. note:: Deprecated! Use ``''.join(AAA_DICT[aaa] for aaa in aa)`` and ``''.join(A_DICT[a] for a in aa)``. """ if quiet: return suppress_logger(convert_aa)(aa) if len(aa) == 3: try: return AAA_DICT[aa.upper()] except KeyError: if not quiet: logger.debug('Not a valid amino acid: {}'.format(aa)) return if len(aa) == 1: try: return A_DICT[aa.upper()] except KeyError: if not quiet: logger.debug('Not a valid amino acid: {}'.format(aa)) return if not quiet: logger.debug('Not a valid amino acid: {}'.format(aa))
[docs]def convert_resnum_alphanumeric_to_numeric(resnum): """ Convert residue numbering that has letters (i.e. 1A, 1B, 1C...) to residue numbering without letters (i.e. 1, 2, 3...). .. note:: Deprecated! Use ``get_chain_sequence_and_numbering()``. """ idx_increment = 0 while string.ascii_letters.find(resnum[-1]) != -1: idx_increment += string.ascii_letters.find(resnum[-1]) resnum = resnum[:-1] resnum = int(resnum) + idx_increment return resnum # %% STANDALONE FUNCTIONS
[docs]def get_interactions(model, chain_id, r_cutoff=6): """ """ interactions = {} for chain_id_2, chain_2 in model.child_dict.items(): if chain_id == chain_id_2: continue interactions[chain_id_2] = ( get_interactions_between_chains(model, chain_id, chain_id_2, r_cutoff) ) return {k: v for (k, v) in interactions.items() if v}
[docs]def get_interactions_between_chains(model, chain_id_1, chain_id_2, r_cutoff=6): """ Calculate interactions between residues in pdb_chain_1 and pdb_chain_2. An interaction is defines as a pair of residues where at least one pair of atom is closer than r_cutoff. The default value for r_cutoff is 5 Angstroms. .. deprecated:: 1.0 Use python:fn:`get_interacting_residues` instead. It gives you both the residue index and the resnum. Returns ------- OrderedDict Keys are (residue_number, residue_amino_acid) tuples (e.g. ('0', 'M'), ('1', 'Q'), ...). Values are lists of (residue_number, residue_amino_acid) tuples. (e.g. [('0', 'M'), ('1', 'Q'), ...]). """ try: from Bio.PDB import NeighborSearch except ImportError as e: print('Importing Biopython NeighborSearch returned an error: {}'.format(e)) print('Using the the slow version of the neighbour-finding algorithm...') return get_interactions_between_chains_slow(model, chain_id_1, chain_id_2, r_cutoff) # Extract the chains of interest from the model chain_1 = None chain_2 = None for child in model.get_list(): if child.id == chain_id_1: chain_1 = child if child.id == chain_id_2: chain_2 = child if chain_1 is None or chain_2 is None: raise Exception('Chains %s and %s were not found in the model' % (chain_id_1, chain_id_2)) ns = NeighborSearch(list(chain_2.get_atoms())) interactions_between_chains = OrderedDict() for idx, residue_1 in enumerate(chain_1): if residue_1.resname in AMINO_ACIDS and residue_1.id[0] == ' ': resnum_1 = str(residue_1.id[1]) + residue_1.id[2].strip() resaa_1 = convert_aa(residue_1.get_resname(), quiet=True) interacting_residues = set() for atom_1 in residue_1: interacting_residues.update(ns.search(atom_1.get_coord(), r_cutoff, 'R')) interacting_resids = [] for residue_2 in interacting_residues: resnum_2 = str(residue_2.id[1]) + residue_2.id[2].strip() resaa_2 = convert_aa(residue_2.get_resname(), quiet=True) if residue_2.resname in AMINO_ACIDS and residue_2.id[0] == ' ': interacting_resids.append((resnum_2, resaa_2,)) if interacting_resids: interacting_resids.sort( key=lambda x: int(''.join([c for c in x[0] if c.isdigit()]))) interactions_between_chains[(resnum_1, resaa_1)] = interacting_resids return interactions_between_chains
[docs]def get_interactions_between_chains_slow(model, pdb_chain_1, pdb_chain_2, r_cutoff=5): """ Calculate interactions between residues in pdb_chain_1 and pdb_chain_2. An interaction is defines as a pair of residues where at least one pair of atom is closer than r_cutoff. The default value for r_cutoff is 5 Angstroms. .. deprecated:: 1.0 Use :func:`get_interacting_residues` instead. It gives you both the residue index and the resnum. """ # Extract the chains of interest from the model chain_1 = None chain_2 = None for child in model.get_list(): if child.id == pdb_chain_1: chain_1 = child if child.id == pdb_chain_2: chain_2 = child if chain_1 is None or chain_2 is None: raise Exception( 'Chains %s and %s were not found in the model' % (pdb_chain_1, pdb_chain_2)) interactions_between_chains = OrderedDict() for idx, residue_1 in enumerate(chain_1): if residue_1.resname in AMINO_ACIDS and residue_1.id[0] == ' ': resnum_1 = str(residue_1.id[1]) + residue_1.id[2].strip() resaa_1 = convert_aa(residue_1.get_resname()) interacting_resids = [] for residue_2 in chain_2: resnum_2 = str(residue_2.id[1]) + residue_2.id[2].strip() resaa_2 = convert_aa(residue_2.get_resname()) r_min = None if residue_2.resname in AMINO_ACIDS and residue_2.id[0] == ' ': for atom_1 in residue_1: for atom_2 in residue_2: r = calculate_distance(atom_1, atom_2, r_cutoff) if r is not None: if r_min and r < r_min: r_min = r elif not r_min: r_min = r if r_min: interacting_resids.append((resnum_2, resaa_2, r_min,)) if interacting_resids: interactions_between_chains[(resnum_1, resaa_1)] = interacting_resids return interactions_between_chains # %%
[docs]def chain_is_hetatm(chain): """Return True if the chain is made up entirely of HETATMs. """ hetatms = [None] * len(chain) for i in range(len(chain.child_list)): res = chain.child_list[i] hetatms[i] = res.resname not in AAA_DICT if all(hetatms): return True elif not any(hetatms): return False else: # Something went wrong. sequence, numbering = get_chain_sequence_and_numbering(chain) message = ( 'Some but not all residues in chain {} are hetatms!\n'.format(chain.id) + 'sequence: {}\n'.format(sequence) + 'numbering: {}\n'.format(numbering) ) logger.debug(message) False
[docs]def get_aa_residues(chain): aa_residues = [ residue.id for residue in chain if residue.resname in AAA_DICT and not residue.id[0].split() ] return aa_residues
[docs]def get_interacting_residues(model, r_cutoff=5, skip_hetatm_chains=True): """Returns all interactions between residues on different chains in `model`. Returns ------- dict A dictionary of interactions between chains i (0..n-1) and j (i+1..n). Keys are (chain_idx, chain_id, residue_idx, residue_resnum, residue_amino_acid) tuples. (e.g. (0, 'A', 0, '0', 'M'), (0, 1, '2', 'K'), ...) Values are a list of tuples having the same format as the keys. You can reverse the order of keys and values like this:: complement = dict() for key, values in get_interacting_chains(model): for value in values: complement.setdefault(value, set()).add(key) You can get a list of all interacting chains using this command:: {(key[0], value[0]) for (key, values) in get_interacting_chains(model).items() for value in values} """ from Bio.PDB import NeighborSearch interactions_between_chains = dict() # Chain 1 for chain_1_idx, chain_1 in enumerate(model): if skip_hetatm_chains and chain_is_hetatm(chain_1): message = ( "Skipping chain_1 with idx {} because it contains only hetatms." .format(chain_1_idx) ) logger.debug(message) continue chain_1_residue_ids = get_aa_residues(chain_1) # Chain 2 for j, chain_2 in enumerate(model.child_list[chain_1_idx + 1:]): chain_2_idx = chain_1_idx + 1 + j if skip_hetatm_chains and chain_is_hetatm(chain_2): message = ( "Skipping chain_2 with idx {} because it contains only hetatms." .format(chain_2_idx) ) logger.debug(message) continue chain_2_residue_ids = get_aa_residues(chain_2) ns = NeighborSearch(list(chain_2.get_atoms())) # Residue 1 for residue_1 in chain_1: try: residue_1_idx = chain_1_residue_ids.index(residue_1.id) except ValueError: continue residue_1_resnum = str(residue_1.id[1]) + residue_1.id[2].strip() residue_1_aa = convert_aa(residue_1.resname, quiet=True) residue_1_key = ( chain_1_idx, chain_1.id, residue_1_idx, residue_1_resnum, residue_1_aa ) interacting_residues = set() for atom_1 in residue_1: interacting_residues.update(ns.search(atom_1.get_coord(), r_cutoff, 'R')) # Residue 2 interacting_residue_ids = [] for residue_2 in interacting_residues: try: residue_2_idx = chain_2_residue_ids.index(residue_2.id) except ValueError: continue residue_2_resnum = str(residue_2.id[1]) + residue_2.id[2].strip() residue_2_aa = convert_aa(residue_2.get_resname(), quiet=True) residue_2_key = ( chain_2_idx, chain_2.id, residue_2_idx, residue_2_resnum, residue_2_aa ) interacting_residue_ids.append(residue_2_key) if interacting_residue_ids: interactions_between_chains\ .setdefault(residue_1_key, set())\ .update(interacting_residue_ids) return interactions_between_chains # %% Additions for `pipeline_structure`
class SelectChains(Select): """Only accept the specified chains when saving. """ def __init__(self, chain_letters, ns_chain_letters=None, ns=None, r_cutoff=None): self.chain_letters = chain_letters self.ns_chain_letters = ns_chain_letters self.ns = ns self.r_cutoff = r_cutoff
[docs] def accept_residue(self, residue): chain_id = residue.parent.id if chain_id in self.chain_letters: return True elif (self.ns_chain_letters and self.ns) and (chain_id in self.ns_chain_letters): for atom in residue: if self.ns.search(atom.get_coord(), self.r_cutoff, 'C'): return True return False
class StructureParser: """ Attributes ---------- pdb_id : ___ domain_boundaries : list of lists of lists Elements in the outer list correspond to domains in each chain of the pdb. Elements of the inner list contain the start and end of each fragment of each domain. For example, if there is only one chain with pdb domain boundaries 1-10:20-45, this would correspond to domain_boundaries [[[1,10],[20,45]]]. """ def __init__(self, pdb_file, chain_ids=None, domain_defs=[]): """ Parameters ---------- pdb_file : str Full path and filename of the structure. output_dir : str Folder where to save extracted structures and sequences. chain_ids : list Chains of the structure that should be kept. """ self.pdb_id = get_pdb_id(pdb_file) self.pdb_file = pdb_file self.input_structure = get_pdb_structure(self.pdb_file, self.pdb_id) if chain_ids is None: self.chain_ids = [chain.id for chain in self.input_structure[0].child_list] elif isinstance(chain_ids, str): self.chain_ids = chain_ids.split(',') elif isinstance(chain_ids, list) or isinstance(chain_ids, tuple): self.chain_ids = list(chain_ids) else: raise Exception self.r_cutoff = 6 # remove hetatms more than x A away from the main chain(s) self.domain_boundaries = [] for domain_def in domain_defs: self.domain_boundaries.append( helper.decode_domain_def(domain_def, merge=False, return_string=True) ) self.unique_id = ('pdb_id: {}, chain_ids: {}'.format(self.pdb_id, self.chain_ids))
[docs] def extract(self): """Extract the wanted chains out of the PDB file. Removes water atoms and selects the domain regions (i.e. selects only those parts of the domain that are within the domain boundaries specified). """ logger.debug('Extracting {}...'.format(self.unique_id)) model = self.input_structure[0] # assuming that model 0 is always the desired one new_structure = Bio.PDB.Structure.Structure(self.pdb_id) new_model = Bio.PDB.Model.Model(0) # Always assigning hetatms to chain 'Z' may lead to undesirable performance # when the PDB stucture actually has a chain 'Z'. # As of 2015, there are ~1300 structures with chain 'Z' in the elaspic.domain table. # TODO: Convert `pdb_chain` tables in the database to use binary collation. # I think the Bio.PDB module may have to be upgraded too as it currently does not support # lowercase chain ids. hetatm_chain_id = 'Z' hetatm_chain = Bio.PDB.Chain.Chain(hetatm_chain_id) # Loop over every chain and every residue and make sure that everything is ok chain_idx = 0 while chain_idx < len(self.chain_ids): chain_id = self.chain_ids[chain_idx] chain = model[chain_id] chain_numbering, domain_start_idxs, domain_end_idxs = ( self._get_domain_def_idxs_for_chain(chain, chain_idx) ) logger.debug( 'domain_def: {}, domain_start_idxs: {}, domain_end_idxs: {}'.format( self.domain_boundaries, domain_start_idxs, domain_end_idxs) ) res_idx = 0 while res_idx < len(chain): res = chain.child_list[res_idx] original_res_id = res.id # Move water to the hetatm chain if res.id[0] == 'W': self._move_hetatm_to_hetatm_chain(chain, hetatm_chain, res, echo=False) continue # # Move heteroatoms to the hetatm chain # if res.id[0] != ' ': # self._move_hetatm_to_hetatm_chain(chain, hetatm_chain, res, echo=True) # continue # Now treating all unusual amino acids as hetatms # Convert methylated lysines to regular lysines if res.resname in METHYLATED_LYSINES: self._correct_methylated_lysines(res) # Move hetatms to the hetatm chain if res.resname not in AMINO_ACIDS: self._move_hetatm_to_hetatm_chain(chain, hetatm_chain, res) continue # Cut each chain to domain boundaries residue_is_outside_domain = ( self._residue_outside_domain( chain, chain_numbering, domain_start_idxs, domain_end_idxs, res) ) if residue_is_outside_domain: chain.detach_child(original_res_id) continue res_idx += 1 if len(chain): new_model.add(chain) chain_idx += 1 else: logger.debug('Chain {} is empty! Removing...'.format(chain.id)) self.chain_ids.remove(chain.id) # Make sure that the new model is not empty if not list(new_model.get_atoms()): raise errors.PDBEmptySequenceError(self.unique_id) # Remove hetatms if they are > 6A away from the chains of interest. self._remove_distant_hatatms(new_model, hetatm_chain) if hetatm_chain: logger.debug('Adding hetatm chain of length {}'.format(len(hetatm_chain))) new_model.add(hetatm_chain) self.hetatm_chain_id = hetatm_chain_id else: self.hetatm_chain_id = None # If the hetatm chain is not empty, add it to the model new_structure.add(new_model) self.structure = new_structure logger.debug('PDB {} extracted successfully.'.format(self.pdb_id)) self.interactions_between_chains = ( get_interacting_residues(self.structure[0], self.r_cutoff, True) ) self.interacting_chain_ids = { (key[1], value[1]) for (key, values) in self.interactions_between_chains.items() for value in values } self.interacting_chain_idxs = { (key[0], value[0]) for (key, values) in self.interactions_between_chains.items() for value in values }
[docs] def get_chain_sequence_and_numbering(self, chain_id, *args, **varargs): """Call ``get_chain_sequence_and_numbering`` using chain with id ``chain_id`` """ chain = self.structure[0][chain_id] return get_chain_sequence_and_numbering(chain, *args, **varargs)
[docs] def get_chain_seqres_sequence(self, chain_id, *args, **varargs): """Call ``get_chain_seqres_sequence`` using chain with id ``chain_id`` """ chain = self.structure[0][chain_id] return get_chain_seqres_sequence(chain, *args, **varargs)
[docs] def save_structure(self, output_dir='', remove_disordered=False): if remove_disordered: self._unset_disordered_flags() io = PDBIO() io.set_structure(self.structure) try: # Save all chains together outFile = op.join(output_dir, self.pdb_id + ''.join(self.chain_ids) + '.pdb') io.save(outFile) if len(self.chain_ids) > 1: # Save each chain individually for chain_id in self.chain_ids: chain = self.structure[0][chain_id] if chain_is_hetatm(chain): continue outFile = op.join(output_dir, self.pdb_id + chain_id + '.pdb') atom_list = [atom for atom in self.structure[0][chain_id].get_atoms()] hetatm_chain_ns = NeighborSearch(atom_list) select = SelectChains( chain_id, self.hetatm_chain_id, hetatm_chain_ns, self.r_cutoff) io.save(outFile, select=select) if len(self.chain_ids) > 2: # Save each interacting chain pair. for chain_ids in self.interacting_chain_ids: outFile = op.join(output_dir, self.pdb_id + ''.join(chain_ids) + '.pdb') atom_list = [ atom for atom in self.structure[0][chain_id].get_atoms() for chain_id in chain_ids ] hetatm_chain_ns = NeighborSearch(atom_list) select = SelectChains( chain_ids, self.hetatm_chain_id, hetatm_chain_ns, self.r_cutoff) io.save(outFile, select=select) except AttributeError as e: if remove_disordered: raise(e) self.save_structure(output_dir=output_dir, remove_disordered=True)
[docs] def save_sequences(self, output_dir=''): self.chain_numbering_extended_dict = {} self.chain_sequence_dict = {} for chain_id in self.chain_ids: chain_sequence, chain_numbering_extended = ( self.get_chain_sequence_and_numbering(chain_id) ) self.chain_numbering_extended_dict[chain_id] = chain_numbering_extended self.chain_sequence_dict[chain_id] = chain_sequence with open(op.join(output_dir, self.pdb_id + chain_id + '.fasta'), 'w') as f: f.write('>' + self.pdb_id + chain_id + '\n') f.write(chain_sequence + '\n') f.write('\n')
def _get_domain_def_idxs_for_chain(self, chain, chain_idx): if not self.domain_boundaries or not self.domain_boundaries[chain_idx]: return None, None, None __, chain_numbering = get_chain_sequence_and_numbering(chain) try: domain_start_idxs, domain_end_idxs = [ tuple(chain_numbering.index(resid) for resid in resids) for resids in zip(*self.domain_boundaries[chain_idx])] except Exception as e: print(str(e)) raise errors.PDBDomainDefsError(self.unique_id) return chain_numbering, domain_start_idxs, domain_end_idxs def _correct_methylated_lysines(self, res): new_resname = 'LYS' new_resid = (' ', res.id[1], res.id[2]) logger.debug( 'Renaming residue {} {} to {} {}' .format(res.resname, res.id, new_resname, new_resid)) res.resname = new_resname res.id = new_resid atom_idx = 0 while atom_idx < len(res): atom_id = res.child_list[atom_idx].id if atom_id not in LYSINE_ATOMS: logger.debug( 'Removing atom {} from residue {} {}.'.format(atom_id, res.resname, res.id) ) res.detach_child(atom_id) else: atom_idx += 1 def _move_hetatm_to_hetatm_chain(self, chain, hetatm_chain, res, echo=False): # logger.debug( # 'Moving hetatm residue {} {} to the hetatm chain' # .format(res.resname, res.id)) chain.detach_child(res.id) hetatm_res = res hetatm_res.id = (hetatm_res.id[0], len(hetatm_chain)+1, hetatm_res.id[2],) hetatm_chain.add(hetatm_res) def _residue_outside_domain( self, chain, chain_numbering, domain_start_idxs, domain_end_idxs, res): """Returns `True` if residue ``res`` is outside the domain. """ if domain_start_idxs is None or domain_end_idxs is None: return False resid = str(res.id[1]) + res.id[2].strip() resid_idx = chain_numbering.index(resid) for domain_start_idx, domain_end_idx in zip(domain_start_idxs, domain_end_idxs): if resid_idx >= domain_start_idx and resid_idx <= domain_end_idx: # Residue is inside the domain return False # Residue is outside the domain return True def _remove_distant_hatatms(self, new_model, hetatm_chain): """Detach hetatms that are more than ``self.r_cutoff`` away from the main chain(s) """ ns = NeighborSearch(list(new_model.get_atoms())) hetatm_chain.id = [c for c in reversed(helper.uppercase) if c not in self.chain_ids][0] res_idx = 0 while res_idx < len(hetatm_chain): res_1 = hetatm_chain.child_list[res_idx] in_contact = False for atom_1 in res_1: interacting_residues = ns.search(atom_1.get_coord(), self.r_cutoff, 'R') if interacting_residues: # logger.debug(res_1.id) # logger.debug(interacting_residues) in_contact = True if in_contact: res_idx += 1 continue # logger.debug('Detaching child: {}'.format(res_1.id)) hetatm_chain.detach_child(res_1.id) def _unset_disordered_flags(self): """Change atom and residue ``disordered`` flag to `False`. Otherwise, Biopython may crash when saving the PDB structure. """ logger.debug('Setting all residues and atoms marked as disorded to non-disorded') for m in self.structure: for c in m: for r in c: if r.is_disordered() or r.disordered: logger.debug( 'Changing disordered_flag on residue {} from {} to 0' .format(r, r.disordered)) r.disordered = 0 for a in r: if a.is_disordered() or a.disordered_flag: logger.debug( 'Changing disordered_flag on atom {} from {} to 0' .format(a, a.disordered_flag)) a.disordered_flag = 0